lsst.pipe.tasks  15.0-7-g6bb3a066+2
ingest.py
Go to the documentation of this file.
1 from past.builtins import basestring
2 import os
3 import shutil
4 import tempfile
5 import sqlite3
6 from fnmatch import fnmatch
7 from glob import glob
8 from contextlib import contextmanager
9 
10 from lsst.pex.config import Config, Field, DictField, ListField, ConfigurableField
12 from lsst.afw.fits import readMetadata
13 from lsst.pipe.base import Task, InputOnlyArgumentParser
14 from lsst.afw.fits import DEFAULT_HDU
15 
16 
class IngestArgumentParser(InputOnlyArgumentParser):
    """Command-line parser for ingesting images into the image repository.

    Adds, on top of the input-only parser, the options that select how files
    are delivered, whether the registry is recreated, and which files or data
    identifiers are to be treated as bad.
    """

    def __init__(self, *args, **kwargs):
        super(IngestArgumentParser, self).__init__(*args, **kwargs)

        # How (and whether) files are delivered to the repository.
        self.add_argument(
            "-n", "--dry-run",
            action="store_true", default=False, dest="dryrun",
            help="Don't perform any action?",
        )
        self.add_argument(
            "--mode",
            default="link", choices=["move", "copy", "link", "skip"],
            help="Mode of delivering the files to their destination",
        )

        # Registry handling.
        self.add_argument(
            "--create",
            action="store_true",
            help="Create new registry (clobber old)?",
        )
        self.add_argument(
            "--ignore-ingested",
            action="store_true", dest="ignoreIngested",
            help="Don't register files that have already been registered",
        )

        # Exclusions, by data identifier or by file name pattern.
        self.add_id_argument("--badId", "raw", "Data identifier for bad data", doMakeDataRefList=False)
        self.add_argument(
            "--badFile",
            default=[], nargs="*",
            help="Names of bad files (no path; wildcards allowed)",
        )

        # The files to ingest.
        self.add_argument("files", nargs="+", help="Names of file")
33 
34 
class ParseConfig(Config):
    """Options controlling how ParseTask derives file properties"""
    # property name --> header keyword copied verbatim
    translation = DictField(
        doc="Translation table for property --> header",
        keytype=str, itemtype=str, default={},
    )
    # property name --> name of the ParseTask method that computes it
    translators = DictField(
        doc="Properties and name of translator method",
        keytype=str, itemtype=str, default={},
    )
    # property name --> fallback used when the header keyword is absent
    defaults = DictField(
        doc="Default values if header is not present",
        keytype=str, itemtype=str, default={},
    )
    hdu = Field(doc="HDU to read for metadata", dtype=int, default=DEFAULT_HDU)
    extnames = ListField(doc="Extension names to search for", dtype=str, default=[])
45 
46 
class ParseTask(Task):
    """Task that will parse the filename and/or its contents to get the required information
    for putting the file in the correct location and populating the registry."""
    ConfigClass = ParseConfig

    def getInfo(self, filename):
        """Get information about the image from the filename and its contents

        Here, we open the image and parse the header, but one could also look at the filename itself
        and derive information from that, or set values from the configuration.

        @param filename    Name of file to inspect
        @return File properties; list of file properties for each extension
        """
        md = readMetadata(filename, self.config.hdu)
        phuInfo = self.getInfoFromMetadata(md)
        if len(self.config.extnames) == 0:
            # No extensions to worry about
            return phuInfo, [phuInfo]

        # Walk the HDUs in order until every requested extension name has been
        # found, or until an HDU cannot be read (presumably because we have run
        # past the last one).
        extnames = set(self.config.extnames)
        extnum = 0
        infoList = []
        while len(extnames) > 0:
            extnum += 1
            try:
                md = readMetadata(filename, extnum)
            except Exception:
                # Deliberately not a bare "except:", so that KeyboardInterrupt
                # and SystemExit still propagate.
                self.log.warn("Error reading %s extensions %s", filename, extnames)
                break
            ext = self.getExtensionName(md)
            if ext in extnames:
                # Start from a copy of the primary-header properties so each
                # extension gets its own dict.
                hduInfo = self.getInfoFromMetadata(md, info=phuInfo.copy())
                # We need the HDU number when registering MEF files.
                hduInfo["hdu"] = extnum
                infoList.append(hduInfo)
                extnames.discard(ext)
        return phuInfo, infoList

    @staticmethod
    def getExtensionName(md):
        """Get the name of an extension.

        @param md  PropertySet like one obtained from lsst.afw.fits.readMetadata
        @return Name of the extension if it exists.  None otherwise.
        """
        # NOTE(review): the "def" and "except" lines of this method were missing
        # from the listing this code was recovered from.  The signature is
        # taken from the call in getInfo; the exception type is unknown, so
        # Exception is used here - confirm against upstream.
        try:
            # This returns a tuple
            ext = md.get("EXTNAME")
            return ext[1]
        except Exception:
            return None

    def getInfoFromMetadata(self, md, info=None):
        """Attempt to pull the desired information out of the header

        This is done through two mechanisms:
        * translation: a property is set directly from the relevant header keyword
        * translator: a property is set with the result of calling a method

        The translator methods receive the header metadata and should return the
        appropriate value, or None if the value cannot be determined.

        @param md      FITS header
        @param info    File properties, to be supplemented (modified in place)
        @return info
        """
        if info is None:
            info = {}

        # Direct header --> property copies, falling back to configured defaults.
        for p, h in self.config.translation.items():
            if md.exists(h):
                value = md.get(h)
                if isinstance(value, basestring):
                    value = value.strip()
                info[p] = value
            elif p in self.config.defaults:
                info[p] = self.config.defaults[p]
            else:
                self.log.warn("Unable to find value for %s (derived from %s)", p, h)

        # Computed properties; a failing translator is logged and skipped
        # so that it does not abort the whole file.
        for p, t in self.config.translators.items():
            func = getattr(self, t)
            try:
                value = func(md)
            except Exception as e:
                self.log.warn("%s failed to translate %s: %s", t, p, e)
                value = None
            if value is not None:
                info[p] = value
        return info

    def translate_date(self, md):
        """Convert a full DATE-OBS to a mere date

        Besides being an example of a translator, this is also generally useful.
        It will only be used if listed as a translator in the configuration.

        @param md  FITS header
        @return The DATE-OBS value truncated at the first "T", if there is one
        """
        date = md.get("DATE-OBS").strip()
        c = date.find("T")
        if c > 0:
            date = date[:c]
        return date

    def translate_filter(self, md):
        """Translate a full filter description into a mere filter name

        Besides being an example of a translator, this is also generally useful.
        It will only be used if listed as a translator in the configuration.

        @param md  FITS header
        @return The FILTER value truncated at the first space, if there is one
        """
        filterName = md.get("FILTER").strip()
        c = filterName.find(" ")
        if c > 0:
            filterName = filterName[:c]
        return filterName

    def getDestination(self, butler, info, filename):
        """Get destination for the file

        @param butler      Data butler
        @param info        File properties, used as dataId for the butler
        @param filename    Input filename (not used by this implementation)
        @return Destination filename
        """
        raw = butler.get("raw_filename", info)[0]
        # Ensure filename is devoid of cfitsio directions about HDUs
        c = raw.find("[")
        if c > 0:
            raw = raw[:c]
        return raw
175 
176 
class RegisterConfig(Config):
    """Options describing the registry tables written by RegisterTask"""
    table = Field(doc="Name of table", dtype=str, default="raw")
    columns = DictField(
        doc="List of columns for raw table, with their types",
        keytype=str,
        itemtype=str,
        # Only these database types are accepted as column types.
        itemCheck=lambda x: x in ("text", "int", "double"),
        default={
            'object': 'text',
            'visit': 'int',
            'ccd': 'int',
            'filter': 'text',
            'date': 'text',
            'taiObs': 'text',
            'expTime': 'double',
        },
    )
    unique = ListField(
        doc="List of columns to be declared unique for the table",
        dtype=str,
        default=["visit", "ccd"],
    )
    visit = ListField(
        doc="List of columns for raw_visit table",
        dtype=str,
        default=["visit", "object", "date", "filter"],
    )
    ignore = Field(doc="Ignore duplicates in the table?", dtype=bool, default=False)
    permissions = Field(doc="Permissions mode for registry; 0o664 = rw-rw-r--", dtype=int, default=0o664)
197 
198 
class RegistryContext(object):
    """Context manager to provide a registry

    An existing registry is copied, so that it may continue
    to be used while we add to this new registry.  Finally,
    the new registry is moved into the right place.
    """

    def __init__(self, registryName, createTableFunc, forceCreateTables, permissions):
        """Construct a context manager

        @param registryName: Name of registry file
        @param createTableFunc: Function to create tables; called with the database connection
        @param forceCreateTables: Force the (re-)creation of tables?
        @param permissions: Permissions to set on database file
        """
        self.registryName = registryName
        self.permissions = permissions

        # Reserve a uniquely named scratch file in the registry's own directory.
        # Only the base name is used as the prefix (a prefix containing a
        # directory part would be joined onto "dir" a second time), and the
        # handle is closed straight away: we only need the name, sqlite3 opens
        # the file itself.
        with tempfile.NamedTemporaryFile(prefix=os.path.basename(registryName),
                                         dir=os.path.dirname(registryName) or ".",
                                         delete=False) as updateFile:
            self.updateName = updateFile.name

        haveTable = False
        if os.path.exists(registryName):
            assertCanCopy(registryName, self.updateName)
            os.chmod(self.updateName, os.stat(registryName).st_mode)
            shutil.copyfile(registryName, self.updateName)
            haveTable = True

        self.conn = sqlite3.connect(self.updateName)
        if not haveTable or forceCreateTables:
            createTableFunc(self.conn)
        os.chmod(self.updateName, self.permissions)

    def __enter__(self):
        """Provide the 'as' value"""
        return self.conn

    def __exit__(self, excType, excValue, traceback):
        """Commit and close the working copy; install it on success

        If the 'with' body raised, the working copy is left on disk under its
        temporary name and the original registry is not touched.
        """
        self.conn.commit()
        self.conn.close()
        if excType is None:
            # NOTE(review): the listing this code was recovered from drops one
            # line at this point (original line 242) - confirm against upstream.
            if os.path.exists(self.registryName):
                os.unlink(self.registryName)
            os.rename(self.updateName, self.registryName)
            os.chmod(self.registryName, self.permissions)
        return False  # Don't suppress any exceptions
248 
249 
@contextmanager
def fakeContext():
    """A context manager that doesn't provide any context

    Useful for dry runs where we don't want to actually do anything real.
    The 'as' value is None.
    """
    yield
257 
258 
class RegisterTask(Task):
    """Task that will generate the registry for the Mapper"""
    ConfigClass = RegisterConfig
    placeHolder = '?'  # Placeholder for parameter substitution; this value suitable for sqlite3
    typemap = {'text': str, 'int': int, 'double': float}  # Mapping database type --> python type

    def openRegistry(self, directory, create=False, dryrun=False, name="registry.sqlite3"):
        """Open the registry and return the connection handle.

        @param directory  Directory in which the registry file will be placed
        @param create  Clobber any existing registry and create a new one?
        @param dryrun  Don't do anything permanent?
        @param name    Filename of the registry
        @return Context manager whose 'as' value is the database connection
                (None for a dry run)
        """
        if dryrun:
            return fakeContext()

        # NOTE(review): with create=True and a registry that already exists,
        # RegistryContext copies the old database and then calls createTable on
        # the copy; the plain "create table" below would presumably fail because
        # the tables are already there - confirm.
        registryName = os.path.join(directory, name)
        return RegistryContext(registryName, self.createTable, create, self.config.permissions)

    def createTable(self, conn, table=None):
        """Create the registry tables

        One table (typically 'raw') contains information on all files, and the
        other (typically 'raw_visit') contains information on all visits.

        @param conn    Database connection
        @param table   Name of table to create in database
        """
        if table is None:
            table = self.config.table

        # File table: one row per ingested file/HDU.
        cmd = "create table %s (id integer primary key autoincrement, " % table
        cmd += ",".join([("%s %s" % (col, colType)) for col, colType in self.config.columns.items()])
        if len(self.config.unique) > 0:
            cmd += ", unique(" + ",".join(self.config.unique) + ")"
        cmd += ")"
        conn.cursor().execute(cmd)

        # Visit table: its unique constraint covers the visit columns that are
        # also unique in the file table.  Keep them in 'visit' order so the
        # schema is reproducible, and leave the clause out altogether when there
        # are none ("unique()" is not valid SQL).
        uniqueColumns = set(self.config.unique)
        visitUnique = [col for col in self.config.visit if col in uniqueColumns]
        cmd = "create table %s_visit (" % table
        cmd += ",".join([("%s %s" % (col, self.config.columns[col])) for col in self.config.visit])
        if visitUnique:
            cmd += ", unique(" + ",".join(visitUnique) + ")"
        cmd += ")"
        conn.cursor().execute(cmd)

        conn.commit()

    def check(self, conn, info, table=None):
        """Check for the presence of a row already

        Not sure this is required, given the 'ignore' configuration option.

        @param conn    Database connection
        @param info    File properties; must contain every 'unique' column
        @param table   Name of table in database
        @return True if a row with the same values in the 'unique' columns exists
        """
        if table is None:
            table = self.config.table
        if self.config.ignore or len(self.config.unique) == 0:
            return False  # Our entry could already be there, but we don't care
        cursor = conn.cursor()
        sql = "SELECT COUNT(*) FROM %s WHERE " % table
        sql += " AND ".join(["%s = %s" % (col, self.placeHolder) for col in self.config.unique])
        values = [self.typemap[self.config.columns[col]](info[col]) for col in self.config.unique]

        cursor.execute(sql, values)
        return cursor.fetchone()[0] > 0

    def addRow(self, conn, info, dryrun=False, create=False, table=None):
        """Add a row to the file table (typically 'raw').

        @param conn    Database connection
        @param info    File properties to add to database
        @param dryrun  Only print the statement that would be executed?
        @param create  Not used by this method; kept for interface compatibility
        @param table   Name of table in database
        """
        if table is None:
            table = self.config.table
        sql = "INSERT INTO %s (%s) SELECT " % (table, ",".join(self.config.columns))
        sql += ",".join([self.placeHolder] * len(self.config.columns))
        values = [self.typemap[tt](info[col]) for col, tt in self.config.columns.items()]

        # When duplicates are to be ignored, make the insert conditional on no
        # matching row existing.  With no unique columns nothing can count as a
        # duplicate, so the (otherwise empty, invalid) condition is omitted.
        if self.config.ignore and len(self.config.unique) > 0:
            sql += " WHERE NOT EXISTS (SELECT 1 FROM %s WHERE " % table
            sql += " AND ".join(["%s=%s" % (col, self.placeHolder) for col in self.config.unique])
            sql += ")"
            # Convert the same way check() does, so both compare like with like.
            values += [self.typemap[self.config.columns[col]](info[col]) for col in self.config.unique]

        if dryrun:
            print("Would execute: '%s' with %s" % (sql, ",".join([str(value) for value in values])))
        else:
            conn.cursor().execute(sql, values)

    def addVisits(self, conn, dryrun=False, table=None):
        """Generate the visits table (typically 'raw_visit') from the
        file table (typically 'raw').

        Only visits that are not yet present in the visits table are added.

        @param conn    Database connection
        @param dryrun  Only print the statement that would be executed?
        @param table   Name of table in database
        """
        if table is None:
            table = self.config.table
        sql = "INSERT INTO %s_visit SELECT DISTINCT " % table
        sql += ",".join(self.config.visit)
        sql += " FROM %s AS vv1" % table
        sql += " WHERE NOT EXISTS "
        sql += "(SELECT vv2.visit FROM %s_visit AS vv2 WHERE vv1.visit = vv2.visit)" % (table,)
        if dryrun:
            print("Would execute: %s" % sql)
        else:
            conn.cursor().execute(sql)
368 
369 
class IngestConfig(Config):
    """Options for IngestTask and its two subtasks"""
    # Subtasks: header parsing and registry bookkeeping.
    parse = ConfigurableField(doc="File parsing", target=ParseTask)
    register = ConfigurableField(doc="Registry entry", target=RegisterTask)
    # Behaviour switches.
    allowError = Field(doc="Allow error in ingestion?", dtype=bool, default=False)
    clobber = Field(doc="Clobber existing file?", dtype=bool, default=False)
376 
377 
class IngestTask(Task):
    """Task that will ingest images into the data repository"""
    ConfigClass = IngestConfig
    ArgumentParser = IngestArgumentParser
    _DefaultName = "ingest"

    def __init__(self, *args, **kwargs):
        super(IngestTask, self).__init__(*args, **kwargs)
        self.makeSubtask("parse")
        self.makeSubtask("register")

    @classmethod
    def parseAndRun(cls):
        """Parse the command-line arguments and run the Task"""
        config = cls.ConfigClass()
        parser = cls.ArgumentParser(name=cls._DefaultName)
        args = parser.parse_args(config)
        task = cls(config=args.config)
        task.run(args)

    def ingest(self, infile, outfile, mode="move", dryrun=False):
        """Ingest a file into the image repository.

        @param infile  Name of input file
        @param outfile  Name of output file (file in repository)
        @param mode  Mode of ingest (copy/link/move/skip)
        @param dryrun  Only report what would occur?
        @return Success boolean
        @throw Exception  Whatever the file operation raised, unless
                          config.allowError is set (then False is returned)
        """
        if mode == "skip":
            return True
        if dryrun:
            self.log.info("Would %s from %s to %s", mode, infile, outfile)
            return True
        try:
            outdir = os.path.dirname(outfile)
            # A bare file name has no directory part; there is nothing to create.
            if outdir and not os.path.isdir(outdir):
                try:
                    os.makedirs(outdir)
                except OSError:
                    # Silently ignore mkdir failures due to race conditions
                    if not os.path.isdir(outdir):
                        raise
            if os.path.lexists(outfile):
                if self.config.clobber:
                    os.unlink(outfile)
                else:
                    raise RuntimeError("File %s already exists; consider --config clobber=True" % outfile)

            if mode == "copy":
                assertCanCopy(infile, outfile)
                shutil.copyfile(infile, outfile)
            elif mode == "link":
                os.symlink(os.path.abspath(infile), outfile)
            elif mode == "move":
                assertCanCopy(infile, outfile)
                # shutil.move renames when it can and copies otherwise, so a
                # move to another filesystem works (os.rename alone would fail).
                shutil.move(infile, outfile)
            else:
                raise AssertionError("Unknown mode: %s" % mode)
            self.log.info("%s --<%s>--> %s", infile, mode, outfile)
        except Exception as e:
            self.log.warn("Failed to %s %s to %s: %s", mode, infile, outfile, e)
            if not self.config.allowError:
                raise
            return False
        return True

    def isBadFile(self, filename, badFileList):
        """Return whether the file qualifies as bad

        We match the base name of the file against the list of bad file patterns.

        @param filename     Name of file (any directory part is ignored)
        @param badFileList  List of fnmatch-style patterns; may be empty or None
        @return True if any pattern matches
        """
        if not badFileList:
            return False
        filename = os.path.basename(filename)
        return any(fnmatch(filename, badFile) for badFile in badFileList)

    def isBadId(self, info, badIdList):
        """Return whether the file information qualifies as bad

        We match against the list of bad data identifiers: an identifier
        matches when every one of its key/value pairs equals the corresponding
        entry of info.

        @param info       File properties
        @param badIdList  List of data identifiers (dicts); may be empty or None
        @return True if any identifier matches
        """
        if not badIdList:
            return False
        return any(all(info[key] == value for key, value in badId.items()) for badId in badIdList)

    def expandFiles(self, fileNameList):
        """!Expand a set of filenames and globs, returning a list of filenames

        @param fileNameList A list of files and glob patterns
        @return List of existing files matched by the patterns

        N.b. a pattern that matches nothing is dropped, with a warning; it is
        not passed through unchanged as a Posix shell would do.
        """
        filenameList = []
        for globPattern in fileNameList:
            files = glob(globPattern)

            if not files:
                self.log.warn("%s doesn't match any file", globPattern)
                continue

            filenameList.extend(files)

        return filenameList

    def runFile(self, infile, registry, args):
        """!Examine and ingest a single file

        @param infile: File to process
        @param registry: Registry database connection, or None (dry run)
        @param args: Parsed command-line arguments
        @return parsed information from FITS HDUs or None
        """
        if self.isBadFile(infile, args.badFile):
            self.log.info("Skipping declared bad file %s", infile)
            return None
        try:
            fileInfo, hduInfoList = self.parse.getInfo(infile)
        except Exception as e:
            if not self.config.allowError:
                raise
            self.log.warn("Error parsing %s (%s); skipping", infile, e)
            return None
        if self.isBadId(fileInfo, args.badId.idList):
            self.log.info("Skipping declared bad file %s: %s", infile, fileInfo)
            return None
        if registry is not None and self.register.check(registry, fileInfo):
            if args.ignoreIngested:
                return None
            # Not ignoring: warn, but still go on to ingest the file.
            self.log.warn("%s: already ingested: %s", infile, fileInfo)
        outfile = self.parse.getDestination(args.butler, fileInfo, infile)
        if not self.ingest(infile, outfile, mode=args.mode, dryrun=args.dryrun):
            return None
        return hduInfoList

    def run(self, args):
        """Ingest all specified files and add them to the registry

        @param args: Parsed command-line arguments
        """
        filenameList = self.expandFiles(args.files)
        root = args.input
        context = self.register.openRegistry(root, create=args.create, dryrun=args.dryrun)
        with context as registry:
            for infile in filenameList:
                # A failure on one file is logged and does not stop the others.
                try:
                    hduInfoList = self.runFile(infile, registry, args)
                except Exception as exc:
                    self.log.warn("Failed to ingest file %s: %s", infile, exc)
                    continue
                if hduInfoList is None:
                    continue
                for info in hduInfoList:
                    self.register.addRow(registry, info, dryrun=args.dryrun, create=args.create)
            self.register.addVisits(registry, dryrun=args.dryrun)
535 
536 
def assertCanCopy(fromPath, toPath):
    """Can I copy a file?  Raise an exception if space constraints not met.

    @param fromPath    Path from which the file is being copied
    @param toPath      Path to which the file is being copied
    @throw RuntimeError  if the filesystem holding the destination directory
                         has fewer free bytes than the size of fromPath
    """
    req = os.stat(fromPath).st_size
    # A bare file name has an empty directory part; that means the current
    # directory, which os.statvfs needs spelt out.
    st = os.statvfs(os.path.dirname(toPath) or ".")
    avail = st.f_bavail * st.f_frsize
    if avail < req:
        raise RuntimeError("Insufficient space: %d vs %d" % (req, avail))
def ingest(self, infile, outfile, mode="move", dryrun=False)
Definition: ingest.py:398
def createTable(self, conn, table=None)
Definition: ingest.py:281
def expandFiles(self, fileNameList)
Expand a set of filenames and globs, returning a list of filenames.
Definition: ingest.py:470
def __exit__(self, excType, excValue, traceback)
Definition: ingest.py:238
def getInfo(self, filename)
Definition: ingest.py:52
def getInfoFromMetadata(self, md, info=None)
Definition: ingest.py:99
def getDestination(self, butler, info, filename)
Definition: ingest.py:161
def runFile(self, infile, registry, args)
Examine and ingest a single file.
Definition: ingest.py:489
def isBadFile(self, filename, badFileList)
Definition: ingest.py:445
def __init__(self, registryName, createTableFunc, forceCreateTables, permissions)
Definition: ingest.py:207
def assertCanCopy(fromPath, toPath)
Definition: ingest.py:537
def check(self, conn, info, table=None)
Definition: ingest.py:307
def __init__(self, args, kwargs)
Definition: ingest.py:20
def addVisits(self, conn, dryrun=False, table=None)
Definition: ingest.py:350
def openRegistry(self, directory, create=False, dryrun=False, name="registry.sqlite3")
Definition: ingest.py:265
def __init__(self, args, kwargs)
Definition: ingest.py:384
def addRow(self, conn, info, dryrun=False, create=False, table=None)
Definition: ingest.py:326
def isBadId(self, info, badIdList)
Definition: ingest.py:458