26 from fnmatch
import fnmatch
28 from contextlib
import contextmanager
30 from lsst.pex.config
import Config, Field, DictField, ListField, ConfigurableField
38 """Argument parser to support ingesting images into the image repository""" 41 super(IngestArgumentParser, self).
__init__(*args, **kwargs)
42 self.add_argument(
"-n",
"--dry-run", dest=
"dryrun", action=
"store_true", default=
False,
43 help=
"Don't perform any action?")
44 self.add_argument(
"--mode", choices=[
"move",
"copy",
"link",
"skip"], default=
"link",
45 help=
"Mode of delivering the files to their destination")
46 self.add_argument(
"--create", action=
"store_true", help=
"Create new registry (clobber old)?")
47 self.add_argument(
"--ignore-ingested", dest=
"ignoreIngested", action=
"store_true",
48 help=
"Don't register files that have already been registered")
49 self.add_id_argument(
"--badId",
"raw",
"Data identifier for bad data", doMakeDataRefList=
False)
50 self.add_argument(
"--badFile", nargs=
"*", default=[],
51 help=
"Names of bad files (no path; wildcards allowed)")
52 self.add_argument(
"files", nargs=
"+", help=
"Names of file")
# NOTE(review): the class header was lost in extraction; the name is taken from
# the ``ConfigClass = ParseConfig`` assignment that follows and the base class
# from the ``lsst.pex.config`` import — confirm against the upstream file.
class ParseConfig(Config):
    """Configuration for ParseTask"""
    # property name -> header keyword the value is copied from
    translation = DictField(keytype=str, itemtype=str, default={},
                            doc="Translation table for property --> header")
    # property name -> name of the task method that computes the value
    translators = DictField(keytype=str, itemtype=str, default={},
                            doc="Properties and name of translator method")
    # property name -> fallback used when the header keyword is absent
    defaults = DictField(keytype=str, itemtype=str, default={},
                         doc="Default values if header is not present")
    hdu = Field(dtype=int, default=DEFAULT_HDU, doc="HDU to read for metadata")
    extnames = ListField(dtype=str, default=[], doc="Extension names to search for")
68 """Task that will parse the filename and/or its contents to get the required information 69 for putting the file in the correct location and populating the registry.""" 70 ConfigClass = ParseConfig
73 """Get information about the image from the filename and its contents 75 Here, we open the image and parse the header, but one could also look at the filename itself 76 and derive information from that, or set values from the configuration. 78 @param filename Name of file to inspect 79 @return File properties; list of file properties for each extension 81 md = readMetadata(filename, self.config.hdu)
83 if len(self.config.extnames) == 0:
85 return phuInfo, [phuInfo]
87 extnames = set(self.config.extnames)
90 while len(extnames) > 0:
93 md = readMetadata(filename, extnum)
94 except Exception
as e:
95 self.log.warn(
"Error reading %s extensions %s: %s" % (filename, extnames, e))
101 hduInfo[
"hdu"] = extnum
102 infoList.append(hduInfo)
103 extnames.discard(ext)
104 return phuInfo, infoList
108 """ Get the name of an extension. 109 @param md: PropertySet like one obtained from lsst.afw.fits.readMetadata) 110 @return Name of the extension if it exists. None otherwise. 114 ext = md.getScalar(
"EXTNAME")
def getInfoFromMetadata(self, md, info=None):
    """Attempt to pull the desired information out of the header

    This is done through two mechanisms:
    * translation: a property is set directly from the relevant header keyword
    * translator: a property is set with the result of calling a method

    The translator methods receive the header metadata and should return the
    appropriate value, or None if the value cannot be determined.

    @param md      FITS header
    @param info    File properties, to be supplemented
    @return info   The supplemented properties (a new dict when `info` is None)
    """
    # NOTE(review): the def line, the ``info`` initialisation, the assignments
    # into ``info`` and the final return were missing from the garbled listing
    # and are reconstructed from the signature index and surrounding code.
    if info is None:
        info = {}

    # Direct translations: copy a header keyword, falling back to a default.
    for p, h in self.config.translation.items():
        value = md.get(h, None)
        if value is not None:
            if isinstance(value, str):
                value = value.strip()
            info[p] = value
        elif p in self.config.defaults:
            info[p] = self.config.defaults[p]
        else:
            self.log.warn("Unable to find value for %s (derived from %s)" % (p, h))

    # Translator methods: a failure is logged and the property is left unset.
    for p, t in self.config.translators.items():
        func = getattr(self, t)
        try:
            value = func(md)
        except Exception as e:
            self.log.warn("%s failed to translate %s: %s", t, p, e)
            value = None
        if value is not None:
            info[p] = value
    return info
def translate_date(self, md):
    """Convert a full DATE-OBS to a mere date

    Besides being an example of a translator, this is also generally useful.
    It will only be used if listed as a translator in the configuration.

    @param md  Header metadata providing ``getScalar("DATE-OBS")``
    @return The DATE-OBS value with any time-of-day part removed
    """
    date = md.getScalar("DATE-OBS").strip()
    # NOTE(review): everything after the line above was missing from the
    # garbled listing; cutting at the ISO-8601 "T" separator is reconstructed
    # from the docstring ("full DATE-OBS to a mere date") — confirm upstream.
    c = date.find("T")
    if c > 0:
        date = date[:c]
    return date
def translate_filter(self, md):
    """Translate a full filter description into a mere filter name

    Besides being an example of a translator, this is also generally useful.
    It will only be used if listed as a translator in the configuration.

    @param md  Header metadata providing ``getScalar("FILTER")``
    @return The first space-delimited word of the FILTER value
    """
    # A single strip() suffices; the listing stripped the value twice.
    filterName = md.getScalar("FILTER").strip()
    c = filterName.find(" ")
    # NOTE(review): the guard and the return were missing from the garbled
    # listing and are reconstructed around the visible ``filterName[:c]``.
    if c > 0:
        filterName = filterName[:c]
    return filterName
182 """Get destination for the file 184 @param butler Data butler 185 @param info File properties, used as dataId for the butler 186 @param filename Input filename 187 @return Destination filename 189 raw = butler.get(
"raw_filename", info)[0]
198 """Configuration for the RegisterTask""" 199 table = Field(dtype=str, default=
"raw", doc=
"Name of table")
200 columns = DictField(keytype=str, itemtype=str, doc=
"List of columns for raw table, with their types",
201 itemCheck=
lambda x: x
in (
"text",
"int",
"double"),
202 default={
'object':
'text',
211 unique = ListField(dtype=str, doc=
"List of columns to be declared unique for the table",
212 default=[
"visit",
"ccd"])
213 visit = ListField(dtype=str, default=[
"visit",
"object",
"date",
"filter"],
214 doc=
"List of columns for raw_visit table")
215 ignore = Field(dtype=bool, default=
False, doc=
"Ignore duplicates in the table?")
216 permissions = Field(dtype=int, default=0o664, doc=
"Permissions mode for registry; 0o664 = rw-rw-r--")
220 """Context manager to provide a registry 223 def __init__(self, registryName, createTableFunc, forceCreateTables, permissions):
224 """Construct a context manager 226 @param registryName: Name of registry file 227 @param createTableFunc: Function to create tables 228 @param forceCreateTables: Force the (re-)creation of tables? 229 @param permissions: Permissions to set on database file 231 self.
conn = sqlite3.connect(registryName)
232 os.chmod(registryName, permissions)
233 createTableFunc(self.
conn, forceCreateTables=forceCreateTables)
236 """Provide the 'as' value""" 247 """A context manager that doesn't provide any context 249 Useful for dry runs where we don't want to actually do anything real. 254 class RegisterTask(Task):
255 """Task that will generate the registry for the Mapper""" 256 ConfigClass = RegisterConfig
258 typemap = {
'text': str,
'int': int,
'double': float}
260 def openRegistry(self, directory, create=False, dryrun=False, name="registry.sqlite3"):
261 """Open the registry and return the connection handle. 263 @param directory Directory in which the registry file will be placed 264 @param create Clobber any existing registry and create a new one? 265 @param dryrun Don't do anything permanent? 266 @param name Filename of the registry 267 @return Database connection 272 registryName = os.path.join(directory, name)
277 """Create the registry tables 279 One table (typically 'raw') contains information on all files, and the 280 other (typically 'raw_visit') contains information on all visits. 282 @param conn Database connection 283 @param table Name of table to create in database 285 cursor = conn.cursor()
287 table = self.config.table
288 cmd =
"SELECT name FROM sqlite_master WHERE type='table' AND name='%s'" % table
290 if cursor.fetchone()
and not forceCreateTables:
291 self.log.info(
'Table "%s" exists. Skipping creation' % table)
294 cmd =
"drop table if exists %s" % table
296 cmd =
"drop table if exists %s_visit" % table
299 cmd =
"create table %s (id integer primary key autoincrement, " % table
300 cmd +=
",".join([(
"%s %s" % (col, colType))
for col, colType
in self.config.columns.items()])
301 if len(self.config.unique) > 0:
302 cmd +=
", unique(" +
",".join(self.config.unique) +
")" 306 cmd =
"create table %s_visit (" % table
307 cmd +=
",".join([(
"%s %s" % (col, self.config.columns[col]))
for col
in self.config.visit])
308 cmd +=
", unique(" +
",".join(set(self.config.visit).intersection(set(self.config.unique))) +
")" 314 def check(self, conn, info, table=None):
315 """Check for the presence of a row already 317 Not sure this is required, given the 'ignore' configuration option. 320 table = self.config.table
321 if self.config.ignore
or len(self.config.unique) == 0:
323 cursor = conn.cursor()
324 sql =
"SELECT COUNT(*) FROM %s WHERE " % table
325 sql +=
" AND ".join([
"%s = %s" % (col, self.
placeHolder)
for col
in self.config.unique])
326 values = [self.
typemap[self.config.columns[col]](info[col])
for col
in self.config.unique]
328 cursor.execute(sql, values)
329 if cursor.fetchone()[0] > 0:
333 def addRow(self, conn, info, dryrun=False, create=False, table=None):
334 """Add a row to the file table (typically 'raw'). 336 @param conn Database connection 337 @param info File properties to add to database 338 @param table Name of table in database 342 table = self.config.table
344 if self.config.ignore:
345 ignoreClause =
" OR IGNORE" 346 sql =
"INSERT%s INTO %s (%s) VALUES (" % (ignoreClause, table,
",".join(self.config.columns))
347 sql +=
",".join([self.
placeHolder] * len(self.config.columns)) +
")" 348 values = [self.
typemap[tt](info[col])
for col, tt
in self.config.columns.items()]
351 print(
"Would execute: '%s' with %s" % (sql,
",".join([str(value)
for value
in values])))
353 conn.cursor().execute(sql, values)
355 sql =
"INSERT OR IGNORE INTO %s_visit VALUES (" % table
356 sql +=
",".join([self.
placeHolder] * len(self.config.visit)) +
")" 357 values = [self.
typemap[self.config.columns[col]](info[col])
for col
in self.config.visit]
360 print(
"Would execute: '%s' with %s" % (sql,
",".join([str(value)
for value
in values])))
362 conn.cursor().execute(sql, values)
366 """Configuration for IngestTask""" 367 parse = ConfigurableField(target=ParseTask, doc=
"File parsing")
368 register = ConfigurableField(target=RegisterTask, doc=
"Registry entry")
369 allowError = Field(dtype=bool, default=
False, doc=
"Allow error in ingestion?")
370 clobber = Field(dtype=bool, default=
False, doc=
"Clobber existing file?")
381 """Task that will ingest images into the data repository""" 382 ConfigClass = IngestConfig
383 ArgumentParser = IngestArgumentParser
384 _DefaultName =
"ingest" 387 super(IngestTask, self).
__init__(*args, **kwargs)
388 self.makeSubtask(
"parse")
389 self.makeSubtask(
"register")
393 """Parse the command-line arguments and return them along with a Task 397 args = parser.parse_args(config)
398 task = cls(config=args.config)
403 """Parse the command-line arguments and run the Task.""" 408 def prepareTask(cls, root=None, dryrun=False, mode="move", create=False,
409 ignoreIngested=False):
410 """Prepare for running the task repeatedly with `ingestFiles`. 412 Saves the parsed arguments, including the Butler and log, as a 413 private instance variable. 417 root : `str`, optional 418 Repository root pathname. If None, run the Task using the 419 command line arguments, ignoring all other arguments below. 420 dryrun : `bool`, optional 421 If True, don't perform any action; log what would have happened. 422 mode : `str`, optional 423 How files are delivered to their destination. Default is "move", 424 unlike the command-line default of "link". 425 create : `bool`, optional 426 If True, create a new registry, clobbering any old one present. 427 ignoreIngested : `bool`, optional 428 If True, do not complain if the file is already present in the 429 registry (and do nothing else). 434 If `root` was provided, the IngestTask instance 436 sys.argv = [
"IngestTask"]
437 sys.argv.append(root)
439 sys.argv.append(
"--dry-run")
440 sys.argv.append(
"--mode")
441 sys.argv.append(mode)
443 sys.argv.append(
"--create")
445 sys.argv.append(
"--ignore-ingested")
446 sys.argv.append(
"__fakefile__")
def ingest(self, infile, outfile, mode="move", dryrun=False):
    """Ingest a file into the image repository.

    @param infile  Name of input file
    @param outfile Name of output file (file in repository)
    @param mode    Mode of ingest (copy/link/move/skip)
    @param dryrun  Only report what would occur?
    @return Success boolean
    @throw RuntimeError if delivery fails and ``config.allowError`` is False
    """
    # NOTE(review): the skip/dry-run guards, the mode dispatch lines, the
    # clobber unlink and the return statements were missing from the garbled
    # listing; they are reconstructed from the docstring, the log messages and
    # the visible copy/symlink/move calls — confirm against upstream.
    if mode == "skip":
        return True
    if dryrun:
        self.log.info("Would %s from %s to %s" % (mode, infile, outfile))
        return True
    try:
        outdir = os.path.dirname(outfile)
        # A bare filename has no directory component; nothing to create then.
        if outdir:
            try:
                # exist_ok tolerates another process creating it concurrently
                os.makedirs(outdir, exist_ok=True)
            except OSError as exc:
                raise RuntimeError(f"Failed to create directory {outdir}") from exc
        if os.path.lexists(outfile):
            if self.config.clobber:
                os.unlink(outfile)
            else:
                raise RuntimeError("File %s already exists; consider --config clobber=True" % outfile)

        if mode == "copy":
            assertCanCopy(infile, outfile)
            shutil.copyfile(infile, outfile)
        elif mode == "link":
            os.symlink(os.path.abspath(infile), outfile)
        elif mode == "move":
            assertCanCopy(infile, outfile)
            shutil.move(infile, outfile)
        else:
            raise AssertionError("Unknown mode: %s" % mode)
        self.log.info("%s --<%s>--> %s" % (infile, mode, outfile))
    except Exception as e:
        self.log.warn("Failed to %s %s to %s: %s" % (mode, infile, outfile, e))
        if not self.config.allowError:
            raise RuntimeError(f"Failed to {mode} {infile} to {outfile}") from e
        return False
    return True
def isBadFile(self, filename, badFileList):
    """Return whether the file qualifies as bad

    We match against the list of bad file patterns.

    @param filename     Name of the file; only its basename is matched
    @param badFileList  Wildcard patterns (fnmatch syntax) naming bad files
    @return True if the basename matches any pattern
    """
    # NOTE(review): the def line, the empty-list guard and the return
    # statements were missing from the garbled listing and are reconstructed.
    if not badFileList:
        return False
    filename = os.path.basename(filename)
    return any(fnmatch(filename, badFile) for badFile in badFileList)
def isBadId(self, info, badIdList):
    """Return whether the file information qualifies as bad

    We match against the list of bad data identifiers.

    @param info       File properties (mapping of key to value)
    @param badIdList  Data identifiers (mappings); one matches when every
                      key/value pair it holds equals the entry in `info`
    @return True if any identifier in `badIdList` matches `info`
    """
    # NOTE(review): the def line, the empty-list guard and the return
    # statements were missing from the garbled listing and are reconstructed.
    if not badIdList:
        return False
    return any(
        all(info[key] == value for key, value in badId.items())
        for badId in badIdList
    )
def expandFiles(self, fileNameList):
    """!Expand a set of filenames and globs, returning a list of filenames

    @param fileNameList A list of files and glob patterns
    @return List of the filenames matched, in the order the patterns were given

    A pattern that matches nothing is reported with a warning and contributes
    no entries to the result.
    """
    # NOTE(review): the def line, the accumulator, the emptiness test and the
    # return were missing from the garbled listing. The earlier docstring said
    # an unmatched pattern "is returned unchanged", which contradicts the
    # visible "doesn't match any file" warning; skipping is assumed here —
    # confirm against upstream.
    filenameList = []
    for globPattern in fileNameList:
        files = glob(globPattern)
        if not files:
            self.log.warn("%s doesn't match any file" % globPattern)
            continue
        filenameList.extend(files)
    return filenameList
def runFile(self, infile, registry, args, pos):
    """!Examine and ingest a single file

    @param infile: File to process
    @param registry: Registry into which to insert Butler metadata
    @param args: Parsed command-line arguments
    @param pos: Position number of this file in the input list
    @throw RuntimeError if parsing fails and ``config.allowError`` is False
    @throw IngestError if a row cannot be added to the registry
    """
    # NOTE(review): the bad-file test, the early ``return`` statements and the
    # ``try:`` lines were missing from the garbled listing; they are
    # reconstructed from the log messages and the signature index.
    if self.isBadFile(infile, args.badFile):
        self.log.info("Skipping declared bad file %s" % infile)
        return
    try:
        fileInfo, hduInfoList = self.parse.getInfo(infile)
    except Exception as e:
        if not self.config.allowError:
            raise RuntimeError(f"Error parsing {infile}") from e
        self.log.warn("Error parsing %s (%s); skipping" % (infile, e))
        return
    if self.isBadId(fileInfo, args.badId.idList):
        self.log.info("Skipping declared bad file %s: %s" % (infile, fileInfo))
        return
    if registry is not None and self.register.check(registry, fileInfo):
        if args.ignoreIngested:
            return
        # Not ignoring: warn, then carry on and ingest again.
        self.log.warn("%s: already ingested: %s" % (infile, fileInfo))
    outfile = self.parse.getDestination(args.butler, fileInfo, infile)
    if not self.ingest(infile, outfile, mode=args.mode, dryrun=args.dryrun):
        return
    if hduInfoList is None:
        return
    for info in hduInfoList:
        try:
            self.register.addRow(registry, info, dryrun=args.dryrun, create=args.create)
        except Exception as exc:
            raise IngestError(f"Failed to register file {infile}", infile, pos) from exc
580 """Ingest all specified files and add them to the registry""" 583 context = self.register.openRegistry(root, create=args.create, dryrun=args.dryrun)
584 with context
as registry:
585 for pos
in range(len(filenameList)):
586 infile = filenameList[pos]
588 self.
runFile(infile, registry, args, pos)
589 except Exception
as exc:
590 self.log.warn(
"Failed to ingest file %s: %s", infile, exc)
591 if not self.config.allowError:
592 raise IngestError(f
"Failed to ingest file {infile}", infile, pos)
from exc
def ingestFiles(self, fileList):
    """Ingest specified file or list of files and add them to the registry.

    This method can only be called if `prepareTask` was used.

    Parameters
    ----------
    fileList : `str` or `list` [`str`]
        Pathname or list of pathnames of files to ingest.

    Raises
    ------
    RuntimeError
        If the task has no saved ``_args`` attribute.
    """
    if not hasattr(self, "_args"):
        raise RuntimeError("Task not created with prepareTask")
    if isinstance(fileList, str):
        fileList = [fileList]
    self._args.files = fileList
    # NOTE(review): the def line and this final dispatch were missing from the
    # garbled listing; handing the saved arguments to ``run`` is assumed —
    # confirm against upstream.
    self.run(self._args)
def assertCanCopy(fromPath, toPath):
    """Can I copy a file?  Raise an exception if space constraints not met.

    @param fromPath    Path from which the file is being copied
    @param toPath      Path to which the file is being copied
    @throw RuntimeError if the destination filesystem reports fewer available
           bytes than the size of `fromPath`
    """
    req = os.stat(fromPath).st_size
    # A bare filename has an empty dirname, which statvfs rejects; use the
    # current directory in that case.
    st = os.statvfs(os.path.dirname(toPath) or ".")
    avail = st.f_bavail * st.f_frsize
    # NOTE(review): the def line and this comparison were missing from the
    # garbled listing; reconstructed from the signature index and the message.
    if avail < req:
        raise RuntimeError("Insufficient space: %d vs %d" % (req, avail))
def ingest(self, infile, outfile, mode="move", dryrun=False)
def ingestFiles(self, fileList)
def __init__(self, message, pathname, position)
def translate_filter(self, md)
def expandFiles(self, fileNameList)
Expand a set of filenames and globs, returning a list of filenames.
def translate_date(self, md)
def __exit__(self, excType, excValue, traceback)
def getInfo(self, filename)
def getInfoFromMetadata(self, md, info=None)
def getDestination(self, butler, info, filename)
def isBadFile(self, filename, badFileList)
def __init__(self, registryName, createTableFunc, forceCreateTables, permissions)
def createTable(self, conn, table=None, forceCreateTables=False)
def assertCanCopy(fromPath, toPath)
def runFile(self, infile, registry, args, pos)
Examine and ingest a single file.
def check(self, conn, info, table=None)
def __init__(self, args, kwargs)
def openRegistry(self, directory, create=False, dryrun=False, name="registry.sqlite3")
def prepareTask(cls, root=None, dryrun=False, mode="move", create=False, ignoreIngested=False)
def __init__(self, args, kwargs)
def addRow(self, conn, info, dryrun=False, create=False, table=None)
def isBadId(self, info, badIdList)