lsst.pipe.tasks  14.0-46-g76222d5f
ingest.py
Go to the documentation of this file.
1 from __future__ import absolute_import, division, print_function
2 from past.builtins import basestring
3 from builtins import object
4 import os
5 import shutil
6 import tempfile
7 try:
8  import sqlite3
9 except ImportError:
10  # try external pysqlite package; deprecated
11  import sqlite as sqlite3
12 from fnmatch import fnmatch
13 from glob import glob
14 from contextlib import contextmanager
15 
16 from lsst.pex.config import Config, Field, DictField, ListField, ConfigurableField
18 from lsst.pipe.base import Task, InputOnlyArgumentParser
19 import lsst.afw.image as afwImage
20 from lsst.afw.fits import DEFAULT_HDU
21 
22 
class IngestArgumentParser(InputOnlyArgumentParser):
    """Command-line parser for ingesting images into the image repository.

    Adds, on top of the input-only parser, the options that select how files
    are delivered, whether the registry is recreated, and which files or data
    identifiers are to be rejected as bad.
    """

    def __init__(self, *args, **kwargs):
        super(IngestArgumentParser, self).__init__(*args, **kwargs)

        # Options controlling what is done with each file
        self.add_argument(
            "-n", "--dry-run",
            dest="dryrun",
            action="store_true",
            default=False,
            help="Don't perform any action?",
        )
        self.add_argument(
            "--mode",
            choices=["move", "copy", "link", "skip"],
            default="link",
            help="Mode of delivering the files to their destination",
        )

        # Options controlling the registry
        self.add_argument(
            "--create",
            action="store_true",
            help="Create new registry (clobber old)?",
        )
        self.add_argument(
            "--ignore-ingested",
            dest="ignoreIngested",
            action="store_true",
            help="Don't register files that have already been registered",
        )

        # Ways of declaring data as bad: by data identifier or by file name
        self.add_id_argument("--badId", "raw", "Data identifier for bad data",
                             doMakeDataRefList=False)
        self.add_argument(
            "--badFile",
            nargs="*",
            default=[],
            help="Names of bad files (no path; wildcards allowed)",
        )

        # Positional: the files to ingest
        self.add_argument("files", nargs="+", help="Names of file")
39 
40 
class ParseConfig(Config):
    """Settings that tell ParseTask how to turn header metadata into file properties"""

    # property name --> header keyword whose value is copied verbatim
    translation = DictField(
        keytype=str,
        itemtype=str,
        default={},
        doc="Translation table for property --> header",
    )
    # property name --> name of the ParseTask method that computes the value
    translators = DictField(
        keytype=str,
        itemtype=str,
        default={},
        doc="Properties and name of translator method",
    )
    # property name --> fallback used when the header keyword is absent
    defaults = DictField(
        keytype=str,
        itemtype=str,
        default={},
        doc="Default values if header is not present",
    )
    hdu = Field(
        dtype=int,
        default=DEFAULT_HDU,
        doc="HDU to read for metadata",
    )
    extnames = ListField(
        dtype=str,
        default=[],
        doc="Extension names to search for",
    )
51 
52 
class ParseTask(Task):
    """Task that will parse the filename and/or its contents to get the required information
    for putting the file in the correct location and populating the registry."""
    ConfigClass = ParseConfig

    def getInfo(self, filename):
        """Get information about the image from the filename and its contents

        Here, we open the image and parse the header, but one could also look at the filename itself
        and derive information from that, or set values from the configuration.

        @param filename    Name of file to inspect
        @return Tuple of (file properties from the configured HDU,
                list of file properties for each extension); when no extension names
                are configured the list holds just the first element
        """
        md = afwImage.readMetadata(filename, self.config.hdu)
        phuInfo = self.getInfoFromMetadata(md)
        if len(self.config.extnames) == 0:
            # No extensions to worry about
            return phuInfo, [phuInfo]
        # Look in the provided extensions
        extnames = set(self.config.extnames)
        extnum = 0
        infoList = []
        # Walk the HDUs in order until every requested extension has been seen,
        # or until an HDU cannot be read (e.g. we ran off the end of the file).
        while len(extnames) > 0:
            extnum += 1
            try:
                md = afwImage.readMetadata(filename, extnum)
            except Exception:
                self.log.warn("Error reading %s extensions %s" % (filename, extnames))
                break
            ext = self.getExtensionName(md)
            if ext in extnames:
                # Start from a copy of the primary-header properties so that each
                # extension's values supplement rather than share them.
                hduInfo = self.getInfoFromMetadata(md, info=phuInfo.copy())
                # We need the HDU number when registering MEF files.
                hduInfo["hdu"] = extnum
                infoList.append(hduInfo)
                extnames.discard(ext)
        return phuInfo, infoList

    @staticmethod
    def getExtensionName(md):
        """ Get the name of an extension.

        @param md: PropertySet like one obtained from afwImage.readMetadata)
        @return Name of the extension if it exists.  None otherwise.
        """
        try:
            # This returns a tuple
            ext = md.get("EXTNAME")
            return ext[1]
        except Exception:
            # NOTE(review): the exact exception type caught here is not visible in
            # this listing; Exception is used as a safe superset -- confirm.
            return None

    def getInfoFromMetadata(self, md, info=None):
        """Attempt to pull the desired information out of the header

        This is done through two mechanisms:
        * translation: a property is set directly from the relevant header keyword
        * translator: a property is set with the result of calling a method

        The translator methods receive the header metadata and should return the
        appropriate value, or None if the value cannot be determined.

        @param md      FITS header
        @param info    File properties, to be supplemented (a new dict is made if None)
        @return info
        """
        if info is None:
            info = {}
        # Direct translations: header value, else configured default, else warn
        for p, h in self.config.translation.items():
            if md.exists(h):
                value = md.get(h)
                if isinstance(value, basestring):
                    value = value.strip()
                info[p] = value
            elif p in self.config.defaults:
                info[p] = self.config.defaults[p]
            else:
                self.log.warn("Unable to find value for %s (derived from %s)" % (p, h))
        # Translator methods: a failure is logged and the property is left untouched
        for p, t in self.config.translators.items():
            func = getattr(self, t)
            try:
                value = func(md)
            except Exception as e:
                self.log.warn("%s failed to translate %s: %s", t, p, e)
                value = None
            if value is not None:
                info[p] = value
        return info

    def translate_date(self, md):
        """Convert a full DATE-OBS to a mere date

        Besides being an example of a translator, this is also generally useful.
        It will only be used if listed as a translator in the configuration.

        @param md      FITS header
        @return The DATE-OBS value truncated at the first "T", if there is one
        """
        date = md.get("DATE-OBS").strip()
        c = date.find("T")
        if c > 0:
            date = date[:c]
        return date

    def translate_filter(self, md):
        """Translate a full filter description into a mere filter name

        Besides being an example of a translator, this is also generally useful.
        It will only be used if listed as a translator in the configuration.

        @param md      FITS header
        @return The FILTER value truncated at the first space, if there is one
        """
        filterName = md.get("FILTER").strip()
        c = filterName.find(" ")
        if c > 0:
            filterName = filterName[:c]
        return filterName

    def getDestination(self, butler, info, filename):
        """Get destination for the file

        @param butler      Data butler
        @param info        File properties, used as dataId for the butler
        @param filename    Input filename (not used by this implementation)
        @return Destination filename
        """
        raw = butler.get("raw_filename", info)[0]
        # Ensure filename is devoid of cfitsio directions about HDUs
        c = raw.find("[")
        if c > 0:
            raw = raw[:c]
        return raw
181 
182 
class RegisterConfig(Config):
    """Settings that define the layout of the registry written by RegisterTask"""

    table = Field(
        dtype=str,
        default="raw",
        doc="Name of table",
    )
    # column name --> database type; only the three listed types are accepted
    columns = DictField(
        keytype=str,
        itemtype=str,
        doc="List of columns for raw table, with their types",
        itemCheck=lambda colType: colType in ("text", "int", "double"),
        default={
            'object': 'text',
            'visit': 'int',
            'ccd': 'int',
            'filter': 'text',
            'date': 'text',
            'taiObs': 'text',
            'expTime': 'double',
        },
    )
    unique = ListField(
        dtype=str,
        doc="List of columns to be declared unique for the table",
        default=["visit", "ccd"],
    )
    visit = ListField(
        dtype=str,
        default=["visit", "object", "date", "filter"],
        doc="List of columns for raw_visit table",
    )
    ignore = Field(
        dtype=bool,
        default=False,
        doc="Ignore duplicates in the table?",
    )
    # octal 664 = rw-rw-r--
    permissions = Field(
        dtype=int,
        default=0o664,
        doc="Permissions mode for registry",
    )
203 
204 
class RegistryContext(object):
    """Context manager to provide a registry

    An existing registry is copied, so that it may continue
    to be used while we add to this new registry.  Finally,
    the new registry is moved into the right place.

    If the body of the ``with`` statement raises, the original registry is
    left untouched and the scratch copy is discarded.
    """

    def __init__(self, registryName, createTableFunc, forceCreateTables, permissions):
        """Construct a context manager

        @param registryName: Name of registry file
        @param createTableFunc: Function to create tables; called with the database connection
        @param forceCreateTables: Force the (re-)creation of tables?  When set, the contents
            of any existing registry are not carried over.
        @param permissions: Permissions to set on database file
        """
        self.registryName = registryName
        self.permissions = permissions

        # Reserve a uniquely-named scratch file next to the registry.  Only the
        # base name is used as prefix: a prefix containing a directory component
        # would be joined onto 'dir' a second time for relative paths.
        updateFile = tempfile.NamedTemporaryFile(prefix=os.path.basename(registryName),
                                                 dir=os.path.dirname(registryName),
                                                 delete=False)
        self.updateName = updateFile.name
        # We only needed the name; don't hold the file handle open.
        updateFile.close()

        haveTable = False
        # Carry over the existing registry unless the tables are to be recreated;
        # copying it and then creating the tables again would fail because the
        # copied tables already exist.
        if os.path.exists(registryName) and not forceCreateTables:
            assertCanCopy(registryName, self.updateName)
            os.chmod(self.updateName, os.stat(registryName).st_mode)
            shutil.copyfile(registryName, self.updateName)
            haveTable = True

        self.conn = sqlite3.connect(self.updateName)
        if not haveTable or forceCreateTables:
            createTableFunc(self.conn)
        os.chmod(self.updateName, self.permissions)

    def __enter__(self):
        """Provide the 'as' value"""
        return self.conn

    def __exit__(self, excType, excValue, traceback):
        """Close the database and, on success, move it into place

        @param excType: Type of the exception raised in the body, or None
        @param excValue: Exception raised in the body (unused)
        @param traceback: Traceback of that exception (unused)
        @return False, so that exceptions are never suppressed
        """
        try:
            self.conn.commit()
        finally:
            self.conn.close()
        if excType is None:
            # NOTE(review): the listing this was reconstructed from skips one
            # source line at this point; confirm nothing is missing here.
            if os.path.exists(self.registryName):
                os.unlink(self.registryName)
            os.rename(self.updateName, self.registryName)
            os.chmod(self.registryName, self.permissions)
        elif os.path.exists(self.updateName):
            # Don't litter the repository with scratch copies of the registry
            os.unlink(self.updateName)
        return False  # Don't suppress any exceptions
254 
255 
@contextmanager
def fakeContext():
    """A context manager that doesn't provide any context

    Useful for dry runs where we don't want to actually do anything real.
    The value bound by ``with ... as`` is None.
    """
    yield
263 
264 
class RegisterTask(Task):
    """Task that will generate the registry for the Mapper"""
    ConfigClass = RegisterConfig
    placeHolder = '?'  # Placeholder for parameter substitution; this value suitable for sqlite3
    typemap = {'text': str, 'int': int, 'double': float}  # Mapping database type --> python type

    def openRegistry(self, directory, create=False, dryrun=False, name="registry.sqlite3"):
        """Open the registry and return the connection handle.

        @param directory  Directory in which the registry file will be placed
        @param create  Clobber any existing registry and create a new one?
        @param dryrun  Don't do anything permanent?
        @param name    Filename of the registry
        @return Context manager providing the database connection (it provides
                None for a dry run)
        """
        if dryrun:
            return fakeContext()

        registryName = os.path.join(directory, name)
        context = RegistryContext(registryName, self.createTable, create, self.config.permissions)
        return context

    def createTable(self, conn, table=None):
        """Create the registry tables

        One table (typically 'raw') contains information on all files, and the
        other (typically 'raw_visit') contains information on all visits.

        @param conn    Database connection
        @param table   Name of table to create in database (default: config.table)
        """
        if table is None:
            table = self.config.table
        cmd = "create table %s (id integer primary key autoincrement, " % table
        cmd += ",".join([("%s %s" % (col, colType)) for col, colType in self.config.columns.items()])
        if len(self.config.unique) > 0:
            cmd += ", unique(" + ",".join(self.config.unique) + ")"
        cmd += ")"
        conn.cursor().execute(cmd)

        # The visit table is unique on those of its columns that are also unique
        # in the file table.  Keep the configured column order so the SQL is
        # deterministic, and omit the constraint altogether when there are no
        # such columns: an empty "unique()" is not valid SQL.
        uniqueCols = set(self.config.unique)
        visitUnique = [col for col in self.config.visit if col in uniqueCols]
        cmd = "create table %s_visit (" % table
        cmd += ",".join([("%s %s" % (col, self.config.columns[col])) for col in self.config.visit])
        if visitUnique:
            cmd += ", unique(" + ",".join(visitUnique) + ")"
        cmd += ")"
        conn.cursor().execute(cmd)

        conn.commit()

    def check(self, conn, info, table=None):
        """Check for the presence of a row already

        Not sure this is required, given the 'ignore' configuration option.

        @param conn    Database connection
        @param info    File properties; must contain every column in config.unique
        @param table   Name of table in database (default: config.table)
        @return True if a row with the same unique columns exists; always False
                when config.ignore is set or no unique columns are configured
        """
        if table is None:
            table = self.config.table
        if self.config.ignore or len(self.config.unique) == 0:
            return False  # Our entry could already be there, but we don't care
        cursor = conn.cursor()
        sql = "SELECT COUNT(*) FROM %s WHERE " % table
        sql += " AND ".join(["%s = %s" % (col, self.placeHolder) for col in self.config.unique])
        # Convert each value to the python type matching its column's database type
        values = [self.typemap[self.config.columns[col]](info[col]) for col in self.config.unique]

        cursor.execute(sql, values)
        return cursor.fetchone()[0] > 0

    def addRow(self, conn, info, dryrun=False, create=False, table=None):
        """Add a row to the file table (typically 'raw').

        @param conn    Database connection
        @param info    File properties to add to database
        @param dryrun  Only print the statement that would be executed?
        @param create  Not used by this implementation
        @param table   Name of table in database (default: config.table)
        """
        if table is None:
            table = self.config.table
        sql = "INSERT INTO %s (%s) SELECT " % (table, ",".join(self.config.columns))
        sql += ",".join([self.placeHolder] * len(self.config.columns))
        values = [self.typemap[tt](info[col]) for col, tt in self.config.columns.items()]

        # Skip the insert when a row with the same unique columns is present.
        # Without any unique columns there is nothing to compare on, and an
        # empty WHERE clause would not be valid SQL.
        if self.config.ignore and len(self.config.unique) > 0:
            sql += " WHERE NOT EXISTS (SELECT 1 FROM %s WHERE " % table
            sql += " AND ".join(["%s=%s" % (col, self.placeHolder) for col in self.config.unique])
            sql += ")"
            values += [info[col] for col in self.config.unique]

        if dryrun:
            print("Would execute: '%s' with %s" % (sql, ",".join([str(value) for value in values])))
        else:
            conn.cursor().execute(sql, values)

    def addVisits(self, conn, dryrun=False, table=None):
        """Generate the visits table (typically 'raw_visit') from the
        file table (typically 'raw').

        Only visits that are not yet present in the visits table are added.

        @param conn    Database connection
        @param dryrun  Only print the statement that would be executed?
        @param table   Name of table in database (default: config.table)
        """
        if table is None:
            table = self.config.table
        sql = "INSERT INTO %s_visit SELECT DISTINCT " % table
        sql += ",".join(self.config.visit)
        sql += " FROM %s AS vv1" % table
        sql += " WHERE NOT EXISTS "
        sql += "(SELECT vv2.visit FROM %s_visit AS vv2 WHERE vv1.visit = vv2.visit)" % (table,)
        if dryrun:
            print("Would execute: %s" % sql)
        else:
            conn.cursor().execute(sql)
374 
375 
class IngestConfig(Config):
    """Settings for IngestTask: its two subtasks and its error/overwrite policy"""

    # Subtasks
    parse = ConfigurableField(
        target=ParseTask,
        doc="File parsing",
    )
    register = ConfigurableField(
        target=RegisterTask,
        doc="Registry entry",
    )

    # Policy switches
    allowError = Field(
        dtype=bool,
        default=False,
        doc="Allow error in ingestion?",
    )
    clobber = Field(
        dtype=bool,
        default=False,
        doc="Clobber existing file?",
    )
382 
383 
class IngestTask(Task):
    """Task that will ingest images into the data repository"""
    ConfigClass = IngestConfig
    ArgumentParser = IngestArgumentParser
    _DefaultName = "ingest"

    def __init__(self, *args, **kwargs):
        super(IngestTask, self).__init__(*args, **kwargs)
        self.makeSubtask("parse")
        self.makeSubtask("register")

    @classmethod
    def parseAndRun(cls):
        """Parse the command-line arguments and run the Task"""
        config = cls.ConfigClass()
        parser = cls.ArgumentParser(name=cls._DefaultName)
        args = parser.parse_args(config)
        task = cls(config=args.config)
        task.run(args)

    def ingest(self, infile, outfile, mode="move", dryrun=False):
        """Ingest a file into the image repository.

        @param infile  Name of input file
        @param outfile  Name of output file (file in repository)
        @param mode  Mode of ingest (copy/link/move/skip)
        @param dryrun  Only report what would occur?
        @return Success boolean; False is only returned when config.allowError is
                set, otherwise the failure is re-raised
        """
        if mode == "skip":
            return True
        if dryrun:
            self.log.info("Would %s from %s to %s" % (mode, infile, outfile))
            return True
        try:
            outdir = os.path.dirname(outfile)
            if not os.path.isdir(outdir):
                try:
                    os.makedirs(outdir)
                except OSError:
                    # Silently ignore mkdir failures due to race conditions
                    if not os.path.isdir(outdir):
                        raise
            # lexists, so that a dangling symlink also counts as existing
            if os.path.lexists(outfile):
                if self.config.clobber:
                    os.unlink(outfile)
                else:
                    raise RuntimeError("File %s already exists; consider --config clobber=True" % outfile)

            if mode == "copy":
                assertCanCopy(infile, outfile)
                shutil.copyfile(infile, outfile)
            elif mode == "link":
                os.symlink(os.path.abspath(infile), outfile)
            elif mode == "move":
                assertCanCopy(infile, outfile)
                # shutil.move renames when it can and falls back to copy+delete
                # when source and destination are on different filesystems,
                # where a plain os.rename would fail.
                shutil.move(infile, outfile)
            else:
                raise AssertionError("Unknown mode: %s" % mode)
            self.log.info("%s --<%s>--> %s" % (infile, mode, outfile))
        except Exception as e:
            self.log.warn("Failed to %s %s to %s: %s" % (mode, infile, outfile, e))
            if not self.config.allowError:
                raise
            return False
        return True

    def isBadFile(self, filename, badFileList):
        """Return whether the file qualifies as bad

        We match the base name of the file against the list of bad file patterns.

        @param filename  Name of file (any directory part is ignored)
        @param badFileList  List of bad file patterns (wildcards allowed); may be empty or None
        @return True if any pattern matches
        """
        if not badFileList:
            return False
        filename = os.path.basename(filename)
        return any(fnmatch(filename, badFile) for badFile in badFileList)

    def isBadId(self, info, badIdList):
        """Return whether the file information qualifies as bad

        We match against the list of bad data identifiers: the file is bad if
        every key/value pair of some identifier equals the file's property.

        @param info  File properties; must contain every key used by the identifiers
        @param badIdList  List of bad data identifiers (dicts); may be empty or None
        @return True if any identifier matches
        """
        if not badIdList:
            return False
        return any(all(info[key] == value for key, value in badId.items()) for badId in badIdList)

    def expandFiles(self, fileNameList):
        """!Expand a set of filenames and globs, returning a list of filenames

        @param fileNameList A list of files and glob patterns
        @return List of the files matched, in the order the patterns were given

        N.b. unlike Posix shell globbing, a pattern that matches nothing is not
        returned unchanged: a warning is logged and the pattern is dropped.
        """
        filenameList = []
        for globPattern in fileNameList:
            files = glob(globPattern)

            if not files:
                # A shell would pass the pattern through unchanged; we drop it instead
                self.log.warn("%s doesn't match any file" % globPattern)
                continue

            filenameList.extend(files)

        return filenameList

    def runFile(self, infile, registry, args):
        """!Examine and ingest a single file

        @param infile: File to process
        @param registry: Registry database connection, or None (e.g. for a dry run)
        @param args: Parsed command-line arguments
        @return parsed information from FITS HDUs or None
        """
        if self.isBadFile(infile, args.badFile):
            self.log.info("Skipping declared bad file %s" % infile)
            return None
        try:
            fileInfo, hduInfoList = self.parse.getInfo(infile)
        except Exception as e:
            if not self.config.allowError:
                raise
            self.log.warn("Error parsing %s (%s); skipping" % (infile, e))
            return None
        if self.isBadId(fileInfo, args.badId.idList):
            self.log.info("Skipping declared bad file %s: %s" % (infile, fileInfo))
            return None
        if registry is not None and self.register.check(registry, fileInfo):
            if args.ignoreIngested:
                return None
            # Not skipping: warn, then carry on and ingest the file again
            self.log.warn("%s: already ingested: %s" % (infile, fileInfo))
        outfile = self.parse.getDestination(args.butler, fileInfo, infile)
        if not self.ingest(infile, outfile, mode=args.mode, dryrun=args.dryrun):
            return None
        return hduInfoList

    def run(self, args):
        """Ingest all specified files and add them to the registry

        A file that fails to ingest is logged and skipped; the remaining files
        are still processed.

        @param args  Parsed command-line arguments
        """
        filenameList = self.expandFiles(args.files)
        root = args.input
        context = self.register.openRegistry(root, create=args.create, dryrun=args.dryrun)
        with context as registry:
            for infile in filenameList:
                try:
                    hduInfoList = self.runFile(infile, registry, args)
                except Exception as exc:
                    self.log.warn("Failed to ingest file %s: %s", infile, exc)
                    continue
                if hduInfoList is None:
                    continue
                for info in hduInfoList:
                    self.register.addRow(registry, info, dryrun=args.dryrun, create=args.create)
            # Build the visit table once, after all file rows have been added
            self.register.addVisits(registry, dryrun=args.dryrun)
541 
542 
def assertCanCopy(fromPath, toPath):
    """Can I copy a file?  Raise an exception if space constraints not met.

    @param fromPath    Path from which the file is being copied
    @param toPath      Path to which the file is being copied
    @throw RuntimeError if the filesystem holding the destination directory has
           fewer available bytes than the size of the source file
    """
    req = os.stat(fromPath).st_size
    # A bare file name has an empty directory part, which statvfs rejects;
    # it refers to the current directory.
    st = os.statvfs(os.path.dirname(toPath) or os.curdir)
    avail = st.f_bavail * st.f_frsize
    if avail < req:
        raise RuntimeError("Insufficient space: %d vs %d" % (req, avail))
def ingest(self, infile, outfile, mode="move", dryrun=False)
Definition: ingest.py:404
def createTable(self, conn, table=None)
Definition: ingest.py:287
def expandFiles(self, fileNameList)
Expand a set of filenames and globs, returning a list of filenames.
Definition: ingest.py:476
def __exit__(self, excType, excValue, traceback)
Definition: ingest.py:244
def getInfo(self, filename)
Definition: ingest.py:58
def getInfoFromMetadata(self, md, info=None)
Definition: ingest.py:105
def getDestination(self, butler, info, filename)
Definition: ingest.py:167
def runFile(self, infile, registry, args)
Examine and ingest a single file.
Definition: ingest.py:495
def isBadFile(self, filename, badFileList)
Definition: ingest.py:451
def __init__(self, registryName, createTableFunc, forceCreateTables, permissions)
Definition: ingest.py:213
def assertCanCopy(fromPath, toPath)
Definition: ingest.py:543
def check(self, conn, info, table=None)
Definition: ingest.py:313
def __init__(self, args, kwargs)
Definition: ingest.py:26
def addVisits(self, conn, dryrun=False, table=None)
Definition: ingest.py:356
def openRegistry(self, directory, create=False, dryrun=False, name="registry.sqlite3")
Definition: ingest.py:271
def __init__(self, args, kwargs)
Definition: ingest.py:390
def addRow(self, conn, info, dryrun=False, create=False, table=None)
Definition: ingest.py:332
def isBadId(self, info, badIdList)
Definition: ingest.py:464