lsst.pipe.tasks  13.0-66-gfbf2f2ce+5
ingest.py
Go to the documentation of this file.
1 from __future__ import absolute_import, division, print_function
2 from past.builtins import basestring
3 from builtins import object
4 import os
5 import shutil
6 import tempfile
7 try:
8  import sqlite3
9 except ImportError:
10  # try external pysqlite package; deprecated
11  import sqlite as sqlite3
12 from fnmatch import fnmatch
13 from glob import glob
14 from contextlib import contextmanager
15 
16 from lsst.pex.config import Config, Field, DictField, ListField, ConfigurableField
17 import lsst.pex.exceptions
18 from lsst.pipe.base import Task, InputOnlyArgumentParser
19 import lsst.afw.image as afwImage
20 
21 
class IngestArgumentParser(InputOnlyArgumentParser):
    """Command-line parser for ingesting images into the image repository.

    Adds, on top of InputOnlyArgumentParser, the switches that select how files
    are delivered, whether the registry is recreated, and which files or data
    identifiers are declared bad.
    """

    def __init__(self, *args, **kwargs):
        """Construct the parser and register the ingest-specific arguments.

        All positional and keyword arguments are forwarded unchanged to the
        InputOnlyArgumentParser constructor.
        """
        super(IngestArgumentParser, self).__init__(*args, **kwargs)
        addArg = self.add_argument  # local alias; every plain option goes through it

        # How (and whether) to act
        addArg("-n", "--dry-run", dest="dryrun", action="store_true", default=False,
               help="Don't perform any action?")
        addArg("--mode", choices=["move", "copy", "link", "skip"], default="link",
               help="Mode of delivering the files to their destination")
        addArg("--create", action="store_true", help="Create new registry (clobber old)?")
        addArg("--ignore-ingested", dest="ignoreIngested", action="store_true",
               help="Don't register files that have already been registered")

        # What to leave out
        self.add_id_argument("--badId", "raw", "Data identifier for bad data", doMakeDataRefList=False)
        addArg("--badFile", nargs="*", default=[],
               help="Names of bad files (no path; wildcards allowed)")

        # What to ingest
        addArg("files", nargs="+", help="Names of file")
38 
39 
class ParseConfig(Config):
    """Configuration for ParseTask"""
    # property name --> header keyword whose value is copied verbatim
    translation = DictField(
        keytype=str,
        itemtype=str,
        default={},
        doc="Translation table for property --> header",
    )
    # property name --> name of the ParseTask method that computes the value
    translators = DictField(
        keytype=str,
        itemtype=str,
        default={},
        doc="Properties and name of translator method",
    )
    # property name --> fallback used when the translated header keyword is absent
    defaults = DictField(
        keytype=str,
        itemtype=str,
        default={},
        doc="Default values if header is not present",
    )
    hdu = Field(dtype=int, default=0, doc="HDU to read for metadata")
    extnames = ListField(dtype=str, default=[], doc="Extension names to search for")
50 
51 
class ParseTask(Task):
    """Task that will parse the filename and/or its contents to get the required information
    for putting the file in the correct location and populating the registry."""
    ConfigClass = ParseConfig

    def getInfo(self, filename):
        """Get information about the image from the filename and its contents

        Here, we open the image and parse the header, but one could also look at the filename itself
        and derive information from that, or set values from the configuration.

        When config.extnames is non-empty, HDUs are read in order starting from 1 until every
        requested extension name has been seen or an HDU cannot be read.

        @param filename    Name of file to inspect
        @return File properties; list of file properties for each extension
        """
        md = afwImage.readMetadata(filename, self.config.hdu)
        phuInfo = self.getInfoFromMetadata(md)
        if len(self.config.extnames) == 0:
            # No extensions to worry about
            return phuInfo, [phuInfo]

        # Look in the provided extensions
        extnames = set(self.config.extnames)
        extnum = 0
        infoList = []
        while len(extnames) > 0:
            extnum += 1
            try:
                md = afwImage.readMetadata(filename, extnum)
            except Exception:
                # Typically we ran off the end of the file before finding everything requested.
                # Only Exception is caught so that KeyboardInterrupt/SystemExit still propagate.
                self.log.warn("Error reading %s extensions %s" % (filename, extnames))
                break
            ext = self.getExtensionName(md)
            if ext in extnames:
                # Start from a copy so that per-HDU values don't leak into the primary info
                hduInfo = self.getInfoFromMetadata(md, info=phuInfo.copy())
                # We need the HDU number when registering MEF files.
                hduInfo["hdu"] = extnum
                infoList.append(hduInfo)
                extnames.discard(ext)
        return phuInfo, infoList

    @staticmethod
    def getExtensionName(md):
        """ Get the name of an extension.

        @param md: PropertySet like one obtained from afwImage.readMetadata)
        @return Name of the extension if it exists.  None otherwise.
        """
        try:
            # NOTE(review): the original comment states that this returns a tuple, and element 1
            # is used as the name -- confirm against the PropertySet API in use.
            ext = md.get("EXTNAME")
            return ext[1]
        except lsst.pex.exceptions.Exception:
            return None

    def getInfoFromMetadata(self, md, info=None):
        """Attempt to pull the desired information out of the header

        This is done through two mechanisms:
        * translation: a property is set directly from the relevant header keyword
        * translator: a property is set with the result of calling a method

        The translator methods receive the header metadata and should return the
        appropriate value, or None if the value cannot be determined.

        @param md      FITS header
        @param info    File properties, to be supplemented (modified in place); a fresh
                       dict is created when omitted
        @return info
        """
        if info is None:
            # A new dict per call: a shared default would carry properties from one file
            # over to the next.
            info = {}
        for p, h in self.config.translation.items():
            if md.exists(h):
                value = md.get(h)
                if isinstance(value, basestring):
                    value = value.strip()
                info[p] = value
            elif p in self.config.defaults:
                info[p] = self.config.defaults[p]
            else:
                self.log.warn("Unable to find value for %s (derived from %s)" % (p, h))
        for p, t in self.config.translators.items():
            func = getattr(self, t)
            try:
                value = func(md)
            except Exception as e:
                # A failing translator is reported but does not abort parsing of the file
                self.log.warn("%s failed to translate %s: %s", t, p, e)
                value = None
            if value is not None:
                info[p] = value
        return info

    def translate_date(self, md):
        """Convert a full DATE-OBS to a mere date

        Besides being an example of a translator, this is also generally useful.
        It will only be used if listed as a translator in the configuration.

        @param md    FITS header
        @return The DATE-OBS value truncated at the first "T", if there is one
        """
        date = md.get("DATE-OBS").strip()
        c = date.find("T")
        if c > 0:
            date = date[:c]
        return date

    def translate_filter(self, md):
        """Translate a full filter description into a mere filter name

        Besides being an example of a translator, this is also generally useful.
        It will only be used if listed as a translator in the configuration.

        @param md    FITS header
        @return The FILTER value truncated at the first space, if there is one
        """
        filterName = md.get("FILTER").strip()
        c = filterName.find(" ")
        if c > 0:
            filterName = filterName[:c]
        return filterName

    def getDestination(self, butler, info, filename):
        """Get destination for the file

        @param butler      Data butler
        @param info        File properties, used as dataId for the butler
        @param filename    Input filename (not used by this implementation)
        @return Destination filename
        """
        raw = butler.get("raw_filename", info)[0]
        # Ensure filename is devoid of cfitsio directions about HDUs
        c = raw.find("[")
        if c > 0:
            raw = raw[:c]
        return raw
178 
179 
class RegisterConfig(Config):
    """Configuration for the RegisterTask"""
    table = Field(dtype=str, default="raw", doc="Name of table")
    # column name --> database type; only the three types accepted by itemCheck are allowed
    columns = DictField(
        keytype=str,
        itemtype=str,
        doc="List of columns for raw table, with their types",
        itemCheck=lambda x: x in ("text", "int", "double"),
        default={
            'object': 'text',
            'visit': 'int',
            'ccd': 'int',
            'filter': 'text',
            'date': 'text',
            'taiObs': 'text',
            'expTime': 'double',
        },
    )
    unique = ListField(
        dtype=str,
        doc="List of columns to be declared unique for the table",
        default=["visit", "ccd"],
    )
    visit = ListField(
        dtype=str,
        default=["visit", "object", "date", "filter"],
        doc="List of columns for raw_visit table",
    )
    ignore = Field(dtype=bool, default=False, doc="Ignore duplicates in the table?")
    # octal 664 = rw-rw-r--
    permissions = Field(dtype=int, default=0o664, doc="Permissions mode for registry")
200 
201 
class RegistryContext(object):
    """Context manager to provide a registry

    An existing registry is copied, so that it may continue
    to be used while we add to this new registry.  Finally,
    the new registry is moved into the right place.  If the
    'with' body (or construction) fails, the working copy is
    removed and the existing registry is left as it was.
    """

    def __init__(self, registryName, createTableFunc, forceCreateTables, permissions):
        """Construct a context manager

        @param registryName: Name of registry file
        @param createTableFunc: Function to create tables; called with the database connection
        @param forceCreateTables: Force the (re-)creation of tables?
        @param permissions: Permissions to set on database file
        """
        self.registryName = registryName
        self.permissions = permissions
        self.conn = None

        # The working copy lives beside the registry so the final rename stays on one filesystem.
        # Only the base name is used as the prefix: tempfile joins 'dir' and 'prefix', so passing
        # the whole (relative) path as prefix would duplicate its directory part.
        # The handle is closed straight away; only the name is needed (delete=False keeps the file).
        with tempfile.NamedTemporaryFile(prefix=os.path.basename(registryName),
                                         dir=os.path.dirname(registryName) or os.curdir,
                                         delete=False) as updateFile:
            self.updateName = updateFile.name

        try:
            haveTable = False
            if os.path.exists(registryName):
                assertCanCopy(registryName, self.updateName)
                os.chmod(self.updateName, os.stat(registryName).st_mode)
                shutil.copyfile(registryName, self.updateName)
                haveTable = True

            self.conn = sqlite3.connect(self.updateName)
            # NOTE(review): with forceCreateTables and an existing registry, createTableFunc runs
            # on a copy that may already contain the tables -- confirm createTableFunc copes.
            if not haveTable or forceCreateTables:
                createTableFunc(self.conn)
            os.chmod(self.updateName, self.permissions)
        except Exception:
            # Don't leave a stray working copy behind when we can't even get started
            self._discardUpdate()
            raise

    def _discardUpdate(self):
        """Close the connection (if open) and delete the working copy (if present)"""
        if self.conn is not None:
            self.conn.close()
            self.conn = None
        if os.path.exists(self.updateName):
            os.unlink(self.updateName)

    def __enter__(self):
        """Provide the 'as' value"""
        return self.conn

    def __exit__(self, excType, excValue, traceback):
        """Install the working copy as the registry on success; discard it on failure

        @return False, so that exceptions from the 'with' body are never suppressed
        """
        self.conn.commit()
        self.conn.close()
        self.conn = None
        if excType is None:
            # NOTE(review): the listing this block was taken from skips one numbered line at
            # this point -- confirm no statement is missing here.
            if os.path.exists(self.registryName):
                os.unlink(self.registryName)
            os.rename(self.updateName, self.registryName)
            os.chmod(self.registryName, self.permissions)
        else:
            # The update is abandoned, so remove the working copy rather than leaking it
            self._discardUpdate()
        return False  # Don't suppress any exceptions
251 
252 
@contextmanager
def fakeContext():
    """Context manager that supplies nothing and does nothing

    Stands in for a real registry context during dry runs, where no
    permanent action should be taken; the 'as' value is None.
    """
    yield None
260 
261 
class RegisterTask(Task):
    """Task that will generate the registry for the Mapper"""
    ConfigClass = RegisterConfig
    placeHolder = '?'  # Placeholder for parameter substitution; this value suitable for sqlite3
    typemap = {'text': str, 'int': int, 'double': float}  # Mapping database type --> python type

    def openRegistry(self, directory, create=False, dryrun=False, name="registry.sqlite3"):
        """Open the registry and return the connection handle.

        @param directory  Directory in which the registry file will be placed
        @param create  Clobber any existing registry and create a new one?
        @param dryrun  Don't do anything permanent?
        @param name    Filename of the registry
        @return Context manager whose 'as' value is the database connection
                (or None for a dry run)
        """
        if dryrun:
            return fakeContext()

        registryName = os.path.join(directory, name)
        context = RegistryContext(registryName, self.createTable, create, self.config.permissions)
        return context

    def createTable(self, conn, table=None):
        """Create the registry tables

        One table (typically 'raw') contains information on all files, and the
        other (typically 'raw_visit') contains information on all visits.

        @param conn    Database connection
        @param table   Name of table to create in database
        """
        if table is None:
            table = self.config.table
        cmd = "create table %s (id integer primary key autoincrement, " % table
        cmd += ",".join([("%s %s" % (col, colType)) for col, colType in self.config.columns.items()])
        if len(self.config.unique) > 0:
            cmd += ", unique(" + ",".join(self.config.unique) + ")"
        cmd += ")"
        conn.cursor().execute(cmd)

        # The visit table is unique on whichever of its columns are also unique in the file table.
        # Keep the order of config.visit (deterministic SQL) and omit the clause entirely when
        # there is no overlap, since "unique()" is not valid SQL.
        uniqueColumns = set(self.config.unique)
        visitUnique = [col for col in self.config.visit if col in uniqueColumns]
        cmd = "create table %s_visit (" % table
        cmd += ",".join([("%s %s" % (col, self.config.columns[col])) for col in self.config.visit])
        if visitUnique:
            cmd += ", unique(" + ",".join(visitUnique) + ")"
        cmd += ")"
        conn.cursor().execute(cmd)

        conn.commit()

    def check(self, conn, info, table=None):
        """Check for the presence of a row already

        Not sure this is required, given the 'ignore' configuration option.

        @param conn    Database connection
        @param info    File properties; must provide every column in config.unique
        @param table   Name of table in database
        @return True if a row with the same unique-column values exists; always False
                when config.ignore is set or no unique columns are configured
        """
        if table is None:
            table = self.config.table
        if self.config.ignore or len(self.config.unique) == 0:
            return False  # Our entry could already be there, but we don't care
        cursor = conn.cursor()
        sql = "SELECT COUNT(*) FROM %s WHERE " % table
        sql += " AND ".join(["%s = %s" % (col, self.placeHolder) for col in self.config.unique])
        values = [self.typemap[self.config.columns[col]](info[col]) for col in self.config.unique]

        cursor.execute(sql, values)
        return cursor.fetchone()[0] > 0

    def addRow(self, conn, info, dryrun=False, create=False, table=None):
        """Add a row to the file table (typically 'raw').

        When config.ignore is set, the row is only inserted if no existing row has
        the same values in the config.unique columns.

        @param conn    Database connection
        @param info    File properties to add to database
        @param dryrun  Only print the statement that would be executed?
        @param create  Not used by this implementation
        @param table   Name of table in database
        """
        if table is None:
            table = self.config.table
        sql = "INSERT INTO %s (%s) SELECT " % (table, ",".join(self.config.columns))
        sql += ",".join([self.placeHolder] * len(self.config.columns))
        values = [self.typemap[tt](info[col]) for col, tt in self.config.columns.items()]

        if self.config.ignore:
            sql += " WHERE NOT EXISTS (SELECT 1 FROM %s WHERE " % table
            sql += " AND ".join(["%s=%s" % (col, self.placeHolder) for col in self.config.unique])
            sql += ")"
            # Convert exactly as check() does, so both duplicate tests compare like with like
            values += [self.typemap[self.config.columns[col]](info[col]) for col in self.config.unique]

        if dryrun:
            print("Would execute: '%s' with %s" % (sql, ",".join([str(value) for value in values])))
        else:
            conn.cursor().execute(sql, values)

    def addVisits(self, conn, dryrun=False, table=None):
        """Generate the visits table (typically 'raw_visits') from the
        file table (typically 'raw').

        Only visits not already present in the visit table are inserted.

        @param conn    Database connection
        @param dryrun  Only print the statement that would be executed?
        @param table   Name of table in database
        """
        if table is None:
            table = self.config.table
        sql = "INSERT INTO %s_visit SELECT DISTINCT " % table
        sql += ",".join(self.config.visit)
        sql += " FROM %s AS vv1" % table
        sql += " WHERE NOT EXISTS "
        sql += "(SELECT vv2.visit FROM %s_visit AS vv2 WHERE vv1.visit = vv2.visit)" % (table,)
        if dryrun:
            print("Would execute: %s" % sql)
        else:
            conn.cursor().execute(sql)
371 
372 
class IngestConfig(Config):
    """Configuration for IngestTask"""
    # Subtasks: header/filename parsing, and registry bookkeeping
    parse = ConfigurableField(doc="File parsing", target=ParseTask)
    register = ConfigurableField(doc="Registry entry", target=RegisterTask)
    # Behavioural switches
    allowError = Field(doc="Allow error in ingestion?", dtype=bool, default=False)
    clobber = Field(doc="Clobber existing file?", dtype=bool, default=False)
379 
380 
class IngestTask(Task):
    """Task that will ingest images into the data repository"""
    ConfigClass = IngestConfig
    ArgumentParser = IngestArgumentParser
    _DefaultName = "ingest"

    def __init__(self, *args, **kwargs):
        """Construct the task and its 'parse' and 'register' subtasks

        All arguments are forwarded unchanged to the Task constructor.
        """
        super(IngestTask, self).__init__(*args, **kwargs)
        self.makeSubtask("parse")
        self.makeSubtask("register")

    @classmethod
    def parseAndRun(cls):
        """Parse the command-line arguments and run the Task"""
        config = cls.ConfigClass()
        parser = cls.ArgumentParser(name=cls._DefaultName)
        args = parser.parse_args(config)
        task = cls(config=args.config)
        task.run(args)

    def ingest(self, infile, outfile, mode="move", dryrun=False):
        """Ingest a file into the image repository.

        Failures are logged; they are re-raised unless config.allowError is set,
        in which case False is returned instead.

        @param infile  Name of input file
        @param outfile  Name of output file (file in repository)
        @param mode  Mode of ingest (copy/link/move/skip)
        @param dryrun  Only report what would occur?
        @return Success boolean
        """
        if mode == "skip":
            return True
        if dryrun:
            self.log.info("Would %s from %s to %s" % (mode, infile, outfile))
            return True
        try:
            outdir = os.path.dirname(outfile)
            # An empty outdir means the current directory, which needs no creating
            if outdir and not os.path.isdir(outdir):
                try:
                    os.makedirs(outdir)
                except OSError:
                    # Silently ignore mkdir failures due to race conditions
                    if not os.path.isdir(outdir):
                        raise
            if os.path.lexists(outfile):
                if self.config.clobber:
                    os.unlink(outfile)
                else:
                    raise RuntimeError("File %s already exists; consider --config clobber=True" % outfile)

            if mode == "copy":
                assertCanCopy(infile, outfile)
                shutil.copyfile(infile, outfile)
            elif mode == "link":
                os.symlink(os.path.abspath(infile), outfile)
            elif mode == "move":
                assertCanCopy(infile, outfile)
                # shutil.move renames when it can and falls back to copy+delete across
                # filesystems, where a plain os.rename would fail
                shutil.move(infile, outfile)
            else:
                raise AssertionError("Unknown mode: %s" % mode)
            self.log.info("%s --<%s>--> %s" % (infile, mode, outfile))
        except Exception as e:
            self.log.warn("Failed to %s %s to %s: %s" % (mode, infile, outfile, e))
            if not self.config.allowError:
                raise
            return False
        return True

    def isBadFile(self, filename, badFileList):
        """Return whether the file qualifies as bad

        We match the base name of the file against the list of bad file patterns.

        @param filename  Name of file (any directory part is ignored)
        @param badFileList  List of fnmatch-style patterns; may be empty or None
        @return True if any pattern matches
        """
        if not badFileList:
            return False
        filename = os.path.basename(filename)
        return any(fnmatch(filename, badFile) for badFile in badFileList)

    def isBadId(self, info, badIdList):
        """Return whether the file information qualifies as bad

        We match against the list of bad data identifiers: a file is bad if, for
        some identifier, every key/value pair of that identifier equals the
        corresponding entry in the file information.

        @param info  File properties
        @param badIdList  List of data identifiers (dicts); may be empty or None
        @return True if any identifier matches
        """
        if not badIdList:
            return False
        return any(all(info[key] == value for key, value in badId.items()) for badId in badIdList)

    def expandFiles(self, fileNameList):
        """!Expand a set of filenames and globs, returning a list of filenames

        @param fileNameList A list of files and glob patterns

        N.b. unlike Posix shell globbing, a pattern that matches nothing is not returned
        unchanged: a warning is logged and the pattern is dropped from the result.
        """
        filenameList = []
        for globPattern in fileNameList:
            files = glob(globPattern)

            if not files:  # nothing matched (this includes plain names of missing files)
                self.log.warn("%s doesn't match any file" % globPattern)
                continue

            filenameList.extend(files)

        return filenameList

    def run(self, args):
        """Ingest all specified files and add them to the registry

        Each file is handled on a best-effort basis: any exception raised while
        processing a file (including those re-raised because config.allowError is
        unset) is logged as a warning and the next file is tried.

        @param args  Parsed command-line arguments (see IngestArgumentParser)
        """
        filenameList = self.expandFiles(args.files)
        root = args.input
        context = self.register.openRegistry(root, create=args.create, dryrun=args.dryrun)
        with context as registry:
            for infile in filenameList:
                try:
                    if self.isBadFile(infile, args.badFile):
                        self.log.info("Skipping declared bad file %s" % infile)
                        continue
                    try:
                        fileInfo, hduInfoList = self.parse.getInfo(infile)
                    except Exception as e:
                        if not self.config.allowError:
                            raise
                        self.log.warn("Error parsing %s (%s); skipping" % (infile, e))
                        continue
                    if self.isBadId(fileInfo, args.badId.idList):
                        self.log.info("Skipping declared bad file %s: %s" % (infile, fileInfo))
                        continue
                    # In a dry run there is no registry (it is None), so there is nothing to
                    # check against; querying it would fail for every file.
                    if registry is not None and self.register.check(registry, fileInfo):
                        if args.ignoreIngested:
                            continue

                        self.log.warn("%s: already ingested: %s" % (infile, fileInfo))
                    outfile = self.parse.getDestination(args.butler, fileInfo, infile)
                    ingested = self.ingest(infile, outfile, mode=args.mode, dryrun=args.dryrun)
                    if not ingested:
                        continue
                    for info in hduInfoList:
                        self.register.addRow(registry, info, dryrun=args.dryrun, create=args.create)
                except Exception as exc:
                    self.log.warn("Failed to ingest file %s: %s", infile, exc)
            self.register.addVisits(registry, dryrun=args.dryrun)
527 
528 
def assertCanCopy(fromPath, toPath):
    """Can I copy a file?  Raise an exception if space constraints not met.

    The size of the source file is compared with the space available to
    unprivileged users on the filesystem holding the destination directory.

    @param fromPath    Path from which the file is being copied
    @param toPath      Path to which the file is being copied
    @throw RuntimeError if the destination filesystem has less free space than the file needs
    """
    req = os.stat(fromPath).st_size
    # A bare file name has no directory part; it refers to the current directory
    st = os.statvfs(os.path.dirname(toPath) or os.curdir)
    avail = st.f_bavail * st.f_frsize
    if avail < req:
        raise RuntimeError("Insufficient space: %d vs %d" % (req, avail))
def ingest(self, infile, outfile, mode="move", dryrun=False)
Definition: ingest.py:401
def createTable(self, conn, table=None)
Definition: ingest.py:284
def expandFiles(self, fileNameList)
Expand a set of filenames and globs, returning a list of filenames.
Definition: ingest.py:473
def __exit__(self, excType, excValue, traceback)
Definition: ingest.py:241
def getInfo(self, filename)
Definition: ingest.py:57
def getDestination(self, butler, info, filename)
Definition: ingest.py:164
def isBadFile(self, filename, badFileList)
Definition: ingest.py:448
def __init__(self, registryName, createTableFunc, forceCreateTables, permissions)
Definition: ingest.py:210
def getInfoFromMetadata(self, md, info={})
Definition: ingest.py:104
def assertCanCopy(fromPath, toPath)
Definition: ingest.py:529
def check(self, conn, info, table=None)
Definition: ingest.py:310
def __init__(self, args, kwargs)
Definition: ingest.py:25
def addVisits(self, conn, dryrun=False, table=None)
Definition: ingest.py:353
def openRegistry(self, directory, create=False, dryrun=False, name="registry.sqlite3")
Definition: ingest.py:268
def __init__(self, args, kwargs)
Definition: ingest.py:387
def addRow(self, conn, info, dryrun=False, create=False, table=None)
Definition: ingest.py:329
def isBadId(self, info, badIdList)
Definition: ingest.py:461