lsst.pipe.tasks  15.0-6-g9a9df217+12
ingest.py
Go to the documentation of this file.
1 from __future__ import absolute_import, division, print_function
2 from past.builtins import basestring
3 from builtins import object
4 import os
5 import shutil
6 import tempfile
7 import sqlite3
8 from fnmatch import fnmatch
9 from glob import glob
10 from contextlib import contextmanager
11 
12 from lsst.pex.config import Config, Field, DictField, ListField, ConfigurableField
14 from lsst.afw.fits import readMetadata
15 from lsst.pipe.base import Task, InputOnlyArgumentParser
16 from lsst.afw.fits import DEFAULT_HDU
17 
18 
class IngestArgumentParser(InputOnlyArgumentParser):
    """Command-line parser for ingesting images into the image repository.

    On top of whatever the base parser provides, this registers the ingest
    options (dry run, delivery mode, registry creation, skipping of already
    registered files, bad-data selection) and the positional list of files.
    """

    def __init__(self, *args, **kwargs):
        super(IngestArgumentParser, self).__init__(*args, **kwargs)

        # What to do with the files, and how
        self.add_argument(
            "-n", "--dry-run",
            dest="dryrun",
            action="store_true",
            default=False,
            help="Don't perform any action?",
        )
        self.add_argument(
            "--mode",
            choices=["move", "copy", "link", "skip"],
            default="link",
            help="Mode of delivering the files to their destination",
        )

        # Registry handling
        self.add_argument(
            "--create",
            action="store_true",
            help="Create new registry (clobber old)?",
        )
        self.add_argument(
            "--ignore-ingested",
            dest="ignoreIngested",
            action="store_true",
            help="Don't register files that have already been registered",
        )

        # Data that must not be ingested, selected by identifier or by file name
        self.add_id_argument("--badId", "raw", "Data identifier for bad data", doMakeDataRefList=False)
        self.add_argument(
            "--badFile",
            nargs="*",
            default=[],
            help="Names of bad files (no path; wildcards allowed)",
        )

        # The files to ingest
        self.add_argument("files", nargs="+", help="Names of file")
35 
36 
class ParseConfig(Config):
    """Configuration for ParseTask"""

    # property name --> header keyword, copied directly from the header
    translation = DictField(
        keytype=str,
        itemtype=str,
        default={},
        doc="Translation table for property --> header",
    )
    # property name --> name of the ParseTask method that computes it
    translators = DictField(
        keytype=str,
        itemtype=str,
        default={},
        doc="Properties and name of translator method",
    )
    # property name --> fallback value used when the header keyword is absent
    defaults = DictField(
        keytype=str,
        itemtype=str,
        default={},
        doc="Default values if header is not present",
    )
    hdu = Field(
        dtype=int,
        default=DEFAULT_HDU,
        doc="HDU to read for metadata",
    )
    extnames = ListField(
        dtype=str,
        default=[],
        doc="Extension names to search for",
    )
47 
48 
class ParseTask(Task):
    """Task that will parse the filename and/or its contents to get the required information
    for putting the file in the correct location and populating the registry."""
    ConfigClass = ParseConfig

    def getInfo(self, filename):
        """Get information about the image from the filename and its contents

        Here, we open the image and parse the header, but one could also look at the filename itself
        and derive information from that, or set values from the configuration.

        When no extension names are configured, the per-extension list holds just the
        primary-header properties. Otherwise the HDUs are scanned in order (starting at 1)
        until every configured extension name has been found or an HDU cannot be read.

        @param filename    Name of file to inspect
        @return File properties; list of file properties for each extension
        """
        md = readMetadata(filename, self.config.hdu)
        phuInfo = self.getInfoFromMetadata(md)
        if len(self.config.extnames) == 0:
            # No extensions to worry about
            return phuInfo, [phuInfo]

        # Look in the provided extensions
        extnames = set(self.config.extnames)
        extnum = 0
        infoList = []
        while len(extnames) > 0:
            extnum += 1
            try:
                md = readMetadata(filename, extnum)
            except Exception:
                # Deliberately not a bare "except": KeyboardInterrupt and SystemExit must propagate.
                self.log.warn("Error reading %s extensions %s" % (filename, extnames))
                break
            ext = self.getExtensionName(md)
            if ext in extnames:
                # Start from a copy of the primary-header properties so HDUs don't share state
                hduInfo = self.getInfoFromMetadata(md, info=phuInfo.copy())
                # We need the HDU number when registering MEF files.
                hduInfo["hdu"] = extnum
                infoList.append(hduInfo)
                extnames.discard(ext)
        return phuInfo, infoList

    @staticmethod
    def getExtensionName(md):
        """ Get the name of an extension.

        @param md: PropertySet like one obtained from lsst.afw.fits.readMetadata)
        @return Name of the extension if it exists.  None otherwise.
        """
        try:
            # This returns a tuple
            ext = md.get("EXTNAME")
            return ext[1]
        except Exception:
            # NOTE(review): the "def" line and the "except" clause of this method were missing
            # from the listing this file was recovered from.  "Exception" is a conservative
            # superset of whatever was caught originally -- confirm against upstream.
            return None

    def getInfoFromMetadata(self, md, info=None):
        """Attempt to pull the desired information out of the header

        This is done through two mechanisms:
        * translation: a property is set directly from the relevant header keyword
        * translator: a property is set with the result of calling a method

        The translator methods receive the header metadata and should return the
        appropriate value, or None if the value cannot be determined.

        @param md      FITS header
        @param info    File properties, to be supplemented (a new dict is created if None)
        @return info
        """
        if info is None:
            info = {}
        for p, h in self.config.translation.items():
            if md.exists(h):
                value = md.get(h)
                if isinstance(value, basestring):
                    value = value.strip()
                info[p] = value
            elif p in self.config.defaults:
                info[p] = self.config.defaults[p]
            else:
                self.log.warn("Unable to find value for %s (derived from %s)" % (p, h))
        for p, t in self.config.translators.items():
            func = getattr(self, t)
            try:
                value = func(md)
            except Exception as e:
                # A failing translator only loses its own property, not the whole file
                self.log.warn("%s failed to translate %s: %s", t, p, e)
                value = None
            if value is not None:
                info[p] = value
        return info

    def translate_date(self, md):
        """Convert a full DATE-OBS to a mere date

        Besides being an example of a translator, this is also generally useful.
        It will only be used if listed as a translator in the configuration.
        """
        date = md.get("DATE-OBS").strip()
        c = date.find("T")
        if c > 0:
            date = date[:c]
        return date

    def translate_filter(self, md):
        """Translate a full filter description into a mere filter name

        Besides being an example of a translator, this is also generally useful.
        It will only be used if listed as a translator in the configuration.
        """
        filterName = md.get("FILTER").strip()
        # Keep only the first space-delimited word
        c = filterName.find(" ")
        if c > 0:
            filterName = filterName[:c]
        return filterName

    def getDestination(self, butler, info, filename):
        """Get destination for the file

        @param butler      Data butler
        @param info        File properties, used as dataId for the butler
        @param filename    Input filename (not used by this implementation)
        @return Destination filename
        """
        raw = butler.get("raw_filename", info)[0]
        # Ensure filename is devoid of cfitsio directions about HDUs
        c = raw.find("[")
        if c > 0:
            raw = raw[:c]
        return raw
177 
178 
class RegisterConfig(Config):
    """Configuration for the RegisterTask"""

    table = Field(dtype=str, default="raw", doc="Name of table")

    # column name --> database type; only the types listed in itemCheck are accepted
    columns = DictField(
        keytype=str,
        itemtype=str,
        doc="List of columns for raw table, with their types",
        itemCheck=lambda x: x in ("text", "int", "double"),
        default={
            "object": "text",
            "visit": "int",
            "ccd": "int",
            "filter": "text",
            "date": "text",
            "taiObs": "text",
            "expTime": "double",
        },
    )
    unique = ListField(
        dtype=str,
        doc="List of columns to be declared unique for the table",
        default=["visit", "ccd"],
    )
    visit = ListField(
        dtype=str,
        default=["visit", "object", "date", "filter"],
        doc="List of columns for raw_visit table",
    )
    ignore = Field(dtype=bool, default=False, doc="Ignore duplicates in the table?")
    permissions = Field(
        dtype=int,
        default=0o664,
        doc="Permissions mode for registry; 0o664 = rw-rw-r--",
    )
199 
200 
class RegistryContext(object):
    """Context manager to provide a registry

    An existing registry is copied, so that it may continue
    to be used while we add to this new registry.  Finally,
    the new registry is moved into the right place.
    """

    def __init__(self, registryName, createTableFunc, forceCreateTables, permissions):
        """Construct a context manager

        @param registryName: Name of registry file
        @param createTableFunc: Function to create tables; called with the database connection
        @param forceCreateTables: Force the (re-)creation of tables?  When set, the contents of
            an existing registry are not carried over: the tables are built in an empty database.
        @param permissions: Permissions to set on database file
        """
        self.registryName = registryName
        self.permissions = permissions

        # Work on a scratch file in the registry's own directory.  Only the base name is used
        # as the prefix: a prefix containing a relative directory would be joined onto "dir"
        # a second time and point into a directory that does not exist.
        updateFile = tempfile.NamedTemporaryFile(prefix=os.path.basename(registryName),
                                                 dir=os.path.dirname(registryName),
                                                 delete=False)
        self.updateName = updateFile.name
        # Only the name is needed; sqlite3 opens the file itself, so don't leak the descriptor.
        updateFile.close()

        haveTable = False
        if os.path.exists(registryName) and not forceCreateTables:
            # Copying the old tables when re-creation is forced would make "create table" fail
            # on the already-existing tables, hence the forceCreateTables test above.
            assertCanCopy(registryName, self.updateName)
            os.chmod(self.updateName, os.stat(registryName).st_mode)
            shutil.copyfile(registryName, self.updateName)
            haveTable = True

        self.conn = sqlite3.connect(self.updateName)
        if not haveTable:
            createTableFunc(self.conn)
        os.chmod(self.updateName, self.permissions)

    def __enter__(self):
        """Provide the 'as' value"""
        return self.conn

    def __exit__(self, excType, excValue, traceback):
        """Commit and close the scratch database; install it as the registry on success

        If the block raised, the scratch file is left where it is and the
        existing registry is not touched.
        """
        self.conn.commit()
        self.conn.close()
        if excType is None:
            # NOTE(review): the listing this file was recovered from skips one numbered line at
            # this point -- verify against upstream that no statement was lost here.
            if os.path.exists(self.registryName):
                os.unlink(self.registryName)
            os.rename(self.updateName, self.registryName)
            os.chmod(self.registryName, self.permissions)
        return False  # Don't suppress any exceptions
250 
251 
@contextmanager
def fakeContext():
    """A context manager that doesn't provide any context

    Useful for dry runs where we don't want to actually do anything real.
    The 'as' value of the 'with' statement is None.
    """
    yield
259 
260 
class RegisterTask(Task):
    """Task that will generate the registry for the Mapper"""
    ConfigClass = RegisterConfig
    placeHolder = '?'  # Placeholder for parameter substitution; this value suitable for sqlite3
    typemap = {'text': str, 'int': int, 'double': float}  # Mapping database type --> python type

    def openRegistry(self, directory, create=False, dryrun=False, name="registry.sqlite3"):
        """Open the registry and return the connection handle.

        @param directory  Directory in which the registry file will be placed
        @param create  Clobber any existing registry and create a new one?
        @param dryrun  Don't do anything permanent?
        @param name    Filename of the registry
        @return Context manager providing the database connection (or nothing, for a dry run)
        """
        if dryrun:
            return fakeContext()

        registryName = os.path.join(directory, name)
        context = RegistryContext(registryName, self.createTable, create, self.config.permissions)
        return context

    def createTable(self, conn, table=None):
        """Create the registry tables

        One table (typically 'raw') contains information on all files, and the
        other (typically 'raw_visit') contains information on all visits.

        @param conn    Database connection
        @param table   Name of table to create in database (default: the configured table)
        """
        if table is None:
            table = self.config.table
        cmd = "create table %s (id integer primary key autoincrement, " % table
        cmd += ",".join([("%s %s" % (col, colType)) for col, colType in self.config.columns.items()])
        if len(self.config.unique) > 0:
            cmd += ", unique(" + ",".join(self.config.unique) + ")"
        cmd += ")"
        conn.cursor().execute(cmd)

        cmd = "create table %s_visit (" % table
        cmd += ",".join([("%s %s" % (col, self.config.columns[col])) for col in self.config.visit])
        # Only the unique columns that are also visit columns can constrain the visit table.
        # An empty "unique()" clause is not valid SQL, so it is left out when there are none.
        unique = set(self.config.unique)
        visitUnique = [col for col in self.config.visit if col in unique]
        if visitUnique:
            cmd += ", unique(" + ",".join(visitUnique) + ")"
        cmd += ")"
        conn.cursor().execute(cmd)

        conn.commit()

    def check(self, conn, info, table=None):
        """Check for the presence of a row already

        Not sure this is required, given the 'ignore' configuration option.

        @param conn    Database connection
        @param info    File properties; must provide every column listed as unique
        @param table   Name of table in database (default: the configured table)
        @return True if a row with the same unique columns exists; always False when
            duplicates are ignored or no unique columns are configured
        """
        if table is None:
            table = self.config.table
        if self.config.ignore or len(self.config.unique) == 0:
            return False  # Our entry could already be there, but we don't care
        cursor = conn.cursor()
        sql = "SELECT COUNT(*) FROM %s WHERE " % table
        sql += " AND ".join(["%s = %s" % (col, self.placeHolder) for col in self.config.unique])
        values = [self.typemap[self.config.columns[col]](info[col]) for col in self.config.unique]

        cursor.execute(sql, values)
        return cursor.fetchone()[0] > 0

    def addRow(self, conn, info, dryrun=False, create=False, table=None):
        """Add a row to the file table (typically 'raw').

        @param conn    Database connection
        @param info    File properties to add to database
        @param dryrun  Only print the statement that would be executed?
        @param create  Not used by this implementation
        @param table   Name of table in database (default: the configured table)
        """
        if table is None:
            table = self.config.table
        columns = list(self.config.columns.items())
        sql = "INSERT INTO %s (%s) SELECT " % (table, ",".join([col for col, _ in columns]))
        sql += ",".join([self.placeHolder] * len(columns))
        values = [self.typemap[tt](info[col]) for col, tt in columns]

        unique = list(self.config.unique)
        if self.config.ignore and unique:
            # Insert only if no row with the same unique columns exists.  Without any unique
            # columns the sub-select would have an empty WHERE clause, which is not valid SQL.
            sql += " WHERE NOT EXISTS (SELECT 1 FROM %s WHERE " % table
            sql += " AND ".join(["%s=%s" % (col, self.placeHolder) for col in unique])
            sql += ")"
            # Convert with the column's type, as check() does
            values += [self.typemap[self.config.columns[col]](info[col]) for col in unique]

        if dryrun:
            print("Would execute: '%s' with %s" % (sql, ",".join([str(value) for value in values])))
        else:
            conn.cursor().execute(sql, values)

    def addVisits(self, conn, dryrun=False, table=None):
        """Generate the visits table (typically 'raw_visit') from the
        file table (typically 'raw').

        Only visits not yet present in the visits table are added.

        @param conn    Database connection
        @param dryrun  Only print the statement that would be executed?
        @param table   Name of table in database (default: the configured table)
        """
        if table is None:
            table = self.config.table
        sql = "INSERT INTO %s_visit SELECT DISTINCT " % table
        sql += ",".join(self.config.visit)
        sql += " FROM %s AS vv1" % table
        sql += " WHERE NOT EXISTS "
        sql += "(SELECT vv2.visit FROM %s_visit AS vv2 WHERE vv1.visit = vv2.visit)" % (table,)
        if dryrun:
            print("Would execute: %s" % sql)
        else:
            conn.cursor().execute(sql)
370 
371 
class IngestConfig(Config):
    """Configuration for IngestTask"""

    # Sub-tasks
    parse = ConfigurableField(
        target=ParseTask,
        doc="File parsing",
    )
    register = ConfigurableField(
        target=RegisterTask,
        doc="Registry entry",
    )

    # Behaviour switches
    allowError = Field(
        dtype=bool,
        default=False,
        doc="Allow error in ingestion?",
    )
    clobber = Field(
        dtype=bool,
        default=False,
        doc="Clobber existing file?",
    )
378 
379 
class IngestTask(Task):
    """Task that will ingest images into the data repository"""
    ConfigClass = IngestConfig
    ArgumentParser = IngestArgumentParser
    _DefaultName = "ingest"

    def __init__(self, *args, **kwargs):
        super(IngestTask, self).__init__(*args, **kwargs)
        self.makeSubtask("parse")
        self.makeSubtask("register")

    @classmethod
    def parseAndRun(cls):
        """Parse the command-line arguments and run the Task"""
        config = cls.ConfigClass()
        parser = cls.ArgumentParser(name=cls._DefaultName)
        args = parser.parse_args(config)
        task = cls(config=args.config)
        task.run(args)

    def ingest(self, infile, outfile, mode="move", dryrun=False):
        """Ingest a file into the image repository.

        Failures are logged; they are re-raised unless the 'allowError'
        configuration option is set, in which case False is returned.

        @param infile  Name of input file
        @param outfile  Name of output file (file in repository)
        @param mode  Mode of ingest (copy/link/move/skip)
        @param dryrun  Only report what would occur?
        @return Success boolean
        """
        if mode == "skip":
            return True
        if dryrun:
            self.log.info("Would %s from %s to %s" % (mode, infile, outfile))
            return True
        try:
            outdir = os.path.dirname(outfile)
            # An empty outdir means the current directory: there is nothing to create
            if outdir and not os.path.isdir(outdir):
                try:
                    os.makedirs(outdir)
                except OSError:
                    # Silently ignore mkdir failures due to race conditions
                    if not os.path.isdir(outdir):
                        raise
            if os.path.lexists(outfile):
                if self.config.clobber:
                    os.unlink(outfile)
                else:
                    raise RuntimeError("File %s already exists; consider --config clobber=True" % outfile)

            if mode == "copy":
                assertCanCopy(infile, outfile)
                shutil.copyfile(infile, outfile)
            elif mode == "link":
                os.symlink(os.path.abspath(infile), outfile)
            elif mode == "move":
                assertCanCopy(infile, outfile)
                # shutil.move renames when it can and falls back to copy-and-delete when the
                # destination is on another filesystem, where a plain os.rename fails.
                shutil.move(infile, outfile)
            else:
                raise AssertionError("Unknown mode: %s" % mode)
            self.log.info("%s --<%s>--> %s" % (infile, mode, outfile))
        except Exception as e:
            self.log.warn("Failed to %s %s to %s: %s" % (mode, infile, outfile, e))
            if not self.config.allowError:
                raise
            return False
        return True

    def isBadFile(self, filename, badFileList):
        """Return whether the file qualifies as bad

        We match against the list of bad file patterns.  Only the base name
        of the file is compared, so the patterns must not include a path.
        """
        if not badFileList:
            return False
        filename = os.path.basename(filename)
        return any(fnmatch(filename, badFile) for badFile in badFileList)

    def isBadId(self, info, badIdList):
        """Return whether the file information qualifies as bad

        We match against the list of bad data identifiers: the file is bad
        if every key of some identifier has the same value in the file information.
        """
        if not badIdList:
            return False
        return any(all(info[key] == value for key, value in badId.items()) for badId in badIdList)

    def expandFiles(self, fileNameList):
        """!Expand a set of filenames and globs, returning a list of filenames

        @param fileNameList A list of files and glob patterns

        N.b. unlike Posix shells, a pattern that matches nothing is not returned
        unchanged: a warning is logged and the pattern is dropped.
        """
        filenameList = []
        for globPattern in fileNameList:
            files = glob(globPattern)

            if not files:
                self.log.warn("%s doesn't match any file" % globPattern)
                continue

            filenameList.extend(files)

        return filenameList

    def runFile(self, infile, registry, args):
        """!Examine and ingest a single file

        @param infile: File to process
        @param registry: Registry database connection, or None to skip the already-ingested check
        @param args: Parsed command-line arguments
        @return parsed information from FITS HDUs or None
        """
        if self.isBadFile(infile, args.badFile):
            self.log.info("Skipping declared bad file %s" % infile)
            return None
        try:
            fileInfo, hduInfoList = self.parse.getInfo(infile)
        except Exception as e:
            if not self.config.allowError:
                raise
            self.log.warn("Error parsing %s (%s); skipping" % (infile, e))
            return None
        if self.isBadId(fileInfo, args.badId.idList):
            self.log.info("Skipping declared bad file %s: %s" % (infile, fileInfo))
            return None
        if registry is not None and self.register.check(registry, fileInfo):
            if args.ignoreIngested:
                return None
            # Not ignoring: warn, but still deliver and register the file below
            self.log.warn("%s: already ingested: %s" % (infile, fileInfo))
        outfile = self.parse.getDestination(args.butler, fileInfo, infile)
        if not self.ingest(infile, outfile, mode=args.mode, dryrun=args.dryrun):
            return None
        return hduInfoList

    def run(self, args):
        """Ingest all specified files and add them to the registry

        A file that fails is logged and skipped; the remaining files are still processed.

        @param args: Parsed command-line arguments
        """
        filenameList = self.expandFiles(args.files)
        root = args.input
        context = self.register.openRegistry(root, create=args.create, dryrun=args.dryrun)
        with context as registry:
            for infile in filenameList:
                try:
                    hduInfoList = self.runFile(infile, registry, args)
                except Exception as exc:
                    self.log.warn("Failed to ingest file %s: %s", infile, exc)
                    continue
                if hduInfoList is None:
                    continue
                for info in hduInfoList:
                    self.register.addRow(registry, info, dryrun=args.dryrun, create=args.create)
            self.register.addVisits(registry, dryrun=args.dryrun)
537 
538 
def assertCanCopy(fromPath, toPath):
    """Can I copy a file?  Raise an exception if space constraints not met.

    @param fromPath    Path from which the file is being copied
    @param toPath      Path to which the file is being copied
    @throw RuntimeError if the filesystem holding the destination directory has fewer
        bytes available than the size of the source file
    """
    req = os.stat(fromPath).st_size
    # A bare file name has no directory component; it lives in the current directory
    st = os.statvfs(os.path.dirname(toPath) or os.curdir)
    avail = st.f_bavail * st.f_frsize
    if avail < req:
        raise RuntimeError("Insufficient space: %d vs %d" % (req, avail))
def ingest(self, infile, outfile, mode="move", dryrun=False)
Definition: ingest.py:400
def createTable(self, conn, table=None)
Definition: ingest.py:283
def expandFiles(self, fileNameList)
Expand a set of filenames and globs, returning a list of filenames.
Definition: ingest.py:472
def __exit__(self, excType, excValue, traceback)
Definition: ingest.py:240
def getInfo(self, filename)
Definition: ingest.py:54
def getInfoFromMetadata(self, md, info=None)
Definition: ingest.py:101
def getDestination(self, butler, info, filename)
Definition: ingest.py:163
def runFile(self, infile, registry, args)
Examine and ingest a single file.
Definition: ingest.py:491
def isBadFile(self, filename, badFileList)
Definition: ingest.py:447
def __init__(self, registryName, createTableFunc, forceCreateTables, permissions)
Definition: ingest.py:209
def assertCanCopy(fromPath, toPath)
Definition: ingest.py:539
def check(self, conn, info, table=None)
Definition: ingest.py:309
def __init__(self, args, kwargs)
Definition: ingest.py:22
def addVisits(self, conn, dryrun=False, table=None)
Definition: ingest.py:352
def openRegistry(self, directory, create=False, dryrun=False, name="registry.sqlite3")
Definition: ingest.py:267
def __init__(self, args, kwargs)
Definition: ingest.py:386
def addRow(self, conn, info, dryrun=False, create=False, table=None)
Definition: ingest.py:328
def isBadId(self, info, badIdList)
Definition: ingest.py:460