lsst.pipe.tasks  14.0-33-gee53dea0+1
ingest.py
Go to the documentation of this file.
1 from __future__ import absolute_import, division, print_function
2 from past.builtins import basestring
3 from builtins import object
4 import os
5 import shutil
6 import tempfile
7 try:
8  import sqlite3
9 except ImportError:
10  # try external pysqlite package; deprecated
11  import sqlite as sqlite3
12 from fnmatch import fnmatch
13 from glob import glob
14 from contextlib import contextmanager
15 
16 from lsst.pex.config import Config, Field, DictField, ListField, ConfigurableField
18 from lsst.pipe.base import Task, InputOnlyArgumentParser
19 import lsst.afw.image as afwImage
20 from lsst.afw.fits import DEFAULT_HDU
21 
22 
class IngestArgumentParser(InputOnlyArgumentParser):
    """Argument parser to support ingesting images into the image repository

    On top of whatever the base parser provides, this registers the
    options controlling how files are delivered and registered, plus the
    positional list of files to ingest.
    """

    def __init__(self, *args, **kwargs):
        super(IngestArgumentParser, self).__init__(*args, **kwargs)
        addArg = self.add_argument

        # How (and whether) to act on the files
        addArg("-n", "--dry-run",
               dest="dryrun", action="store_true", default=False,
               help="Don't perform any action?")
        addArg("--mode",
               choices=["move", "copy", "link", "skip"], default="link",
               help="Mode of delivering the files to their destination")

        # Registry handling
        addArg("--create",
               action="store_true",
               help="Create new registry (clobber old)?")
        addArg("--ignore-ingested",
               dest="ignoreIngested", action="store_true",
               help="Don't register files that have already been registered")

        # Data to be excluded, by data identifier or by file name
        self.add_id_argument("--badId", "raw", "Data identifier for bad data", doMakeDataRefList=False)
        addArg("--badFile",
               nargs="*", default=[],
               help="Names of bad files (no path; wildcards allowed)")

        # The files to ingest
        addArg("files", nargs="+", help="Names of file")
39 
40 
class ParseConfig(Config):
    """Configuration for ParseTask"""
    # property name --> header keyword to copy the value from
    translation = DictField(
        keytype=str,
        itemtype=str,
        default={},
        doc="Translation table for property --> header",
    )
    # property name --> name of the ParseTask method that computes the value
    translators = DictField(
        keytype=str,
        itemtype=str,
        default={},
        doc="Properties and name of translator method",
    )
    # property name --> fallback used when the translation keyword is absent
    defaults = DictField(
        keytype=str,
        itemtype=str,
        default={},
        doc="Default values if header is not present",
    )
    hdu = Field(
        dtype=int,
        default=DEFAULT_HDU,
        doc="HDU to read for metadata",
    )
    extnames = ListField(
        dtype=str,
        default=[],
        doc="Extension names to search for",
    )
51 
52 
class ParseTask(Task):
    """Task that will parse the filename and/or its contents to get the required information
    for putting the file in the correct location and populating the registry."""
    ConfigClass = ParseConfig

    def getInfo(self, filename):
        """Get information about the image from the filename and its contents

        Here, we open the image and parse the header, but one could also look at the filename itself
        and derive information from that, or set values from the configuration.

        @param filename    Name of file to inspect
        @return File properties; list of file properties for each extension
        """
        md = afwImage.readMetadata(filename, self.config.hdu)
        phuInfo = self.getInfoFromMetadata(md)
        if len(self.config.extnames) == 0:
            # No extensions to worry about
            return phuInfo, [phuInfo]
        # Look in the provided extensions; walk the HDUs in order until every
        # requested extension name has been seen or reading fails.
        extnames = set(self.config.extnames)
        extnum = 0
        infoList = []
        while len(extnames) > 0:
            extnum += 1
            try:
                md = afwImage.readMetadata(filename, extnum)
            except Exception:
                # A read failure ends the scan; whatever is left in extnames was not found.
                # (Not a bare except, so KeyboardInterrupt/SystemExit still propagate.)
                self.log.warn("Error reading %s extensions %s" % (filename, extnames))
                break
            ext = self.getExtensionName(md)
            if ext in extnames:
                # Start from a copy of the primary-header info so each HDU gets its own dict
                hduInfo = self.getInfoFromMetadata(md, info=phuInfo.copy())
                # We need the HDU number when registering MEF files.
                hduInfo["hdu"] = extnum
                infoList.append(hduInfo)
                extnames.discard(ext)
        return phuInfo, infoList

    @staticmethod
    def getExtensionName(md):
        """ Get the name of an extension.

        @param md: PropertySet like one obtained from afwImage.readMetadata)
        @return Name of the extension if it exists.  None otherwise.
        """
        try:
            # The original comment states that this returns a tuple, hence the
            # [1] index below -- TODO confirm against the metadata class in use.
            ext = md.get("EXTNAME")
            return ext[1]
        except Exception:
            # Missing keyword (or an un-indexable value): no usable extension name
            return None

    def getInfoFromMetadata(self, md, info=None):
        """Attempt to pull the desired information out of the header

        This is done through two mechanisms:
        * translation: a property is set directly from the relevant header keyword
        * translator: a property is set with the result of calling a method

        The translator methods receive the header metadata and should return the
        appropriate value, or None if the value cannot be determined.

        @param md      FITS header
        @param info    File properties, to be supplemented (modified in place);
                       a new dict is created when omitted
        @return info
        """
        if info is None:
            # A fresh dict per call: a mutable default would be shared between
            # calls and leak properties from one file into the next.
            info = {}
        for p, h in self.config.translation.items():
            if md.exists(h):
                value = md.get(h)
                if isinstance(value, basestring):
                    value = value.strip()
                info[p] = value
            elif p in self.config.defaults:
                info[p] = self.config.defaults[p]
            else:
                self.log.warn("Unable to find value for %s (derived from %s)" % (p, h))
        for p, t in self.config.translators.items():
            func = getattr(self, t)
            try:
                value = func(md)
            except Exception as e:
                # A failing translator is reported but does not abort parsing
                self.log.warn("%s failed to translate %s: %s", t, p, e)
                value = None
            if value is not None:
                info[p] = value
        return info

    def translate_date(self, md):
        """Convert a full DATE-OBS to a mere date

        Besides being an example of a translator, this is also generally useful.
        It will only be used if listed as a translator in the configuration.

        @param md    FITS header
        @return DATE-OBS with everything from the first "T" onwards removed
        """
        date = md.get("DATE-OBS").strip()
        c = date.find("T")
        if c > 0:
            date = date[:c]
        return date

    def translate_filter(self, md):
        """Translate a full filter description into a mere filter name

        Besides being an example of a translator, this is also generally useful.
        It will only be used if listed as a translator in the configuration.

        @param md    FITS header
        @return FILTER value truncated at its first space
        """
        filterName = md.get("FILTER").strip()
        c = filterName.find(" ")
        if c > 0:
            filterName = filterName[:c]
        return filterName

    def getDestination(self, butler, info, filename):
        """Get destination for the file

        @param butler      Data butler
        @param info        File properties, used as dataId for the butler
        @param filename    Input filename (not used by this implementation)
        @return Destination filename
        """
        raw = butler.get("raw_filename", info)[0]
        # Ensure filename is devoid of cfitsio directions about HDUs
        c = raw.find("[")
        if c > 0:
            raw = raw[:c]
        return raw
179 
180 
class RegisterConfig(Config):
    """Configuration for the RegisterTask"""
    table = Field(
        dtype=str,
        default="raw",
        doc="Name of table",
    )
    # column name --> database type; the type must be one RegisterTask.typemap knows
    columns = DictField(
        keytype=str,
        itemtype=str,
        doc="List of columns for raw table, with their types",
        itemCheck=lambda x: x in ("text", "int", "double"),
        default={
            'object': 'text',
            'visit': 'int',
            'ccd': 'int',
            'filter': 'text',
            'date': 'text',
            'taiObs': 'text',
            'expTime': 'double',
        },
    )
    unique = ListField(
        dtype=str,
        doc="List of columns to be declared unique for the table",
        default=["visit", "ccd"],
    )
    visit = ListField(
        dtype=str,
        default=["visit", "object", "date", "filter"],
        doc="List of columns for raw_visit table",
    )
    ignore = Field(
        dtype=bool,
        default=False,
        doc="Ignore duplicates in the table?",
    )
    permissions = Field(
        dtype=int,
        default=0o664,  # octal 664 = rw-rw-r--
        doc="Permissions mode for registry",
    )
201 
202 
class RegistryContext(object):
    """Context manager to provide a registry

    An existing registry is copied, so that it may continue
    to be used while we add to this new registry.  Finally,
    the new registry is moved into the right place.
    """

    def __init__(self, registryName, createTableFunc, forceCreateTables, permissions):
        """Construct a context manager

        @param registryName: Name of registry file
        @param createTableFunc: Function to create tables; called with the database connection
        @param forceCreateTables: Force the (re-)creation of tables?
        @param permissions: Permissions to set on database file
        """
        self.registryName = registryName
        self.permissions = permissions
        self.conn = None

        # The working copy lives in the registry's own directory (presumably so the
        # final os.rename stays within one filesystem).  Only the base name is used
        # as the prefix: a prefix containing a directory part would be joined onto
        # 'dir' again and point at a directory that does not exist.
        registryDir, registryBase = os.path.split(registryName)
        with tempfile.NamedTemporaryFile(prefix=registryBase, dir=registryDir or os.curdir,
                                         delete=False) as updateFile:
            # Only the name is needed; the handle is closed on leaving the 'with'
            self.updateName = updateFile.name

        try:
            haveTable = False
            if os.path.exists(registryName):
                assertCanCopy(registryName, self.updateName)
                os.chmod(self.updateName, os.stat(registryName).st_mode)
                shutil.copyfile(registryName, self.updateName)
                haveTable = True

            self.conn = sqlite3.connect(self.updateName)
            if not haveTable or forceCreateTables:
                createTableFunc(self.conn)
            os.chmod(self.updateName, self.permissions)
        except Exception:
            # Construction failed, so __exit__ will never run: release the
            # connection and remove the working copy instead of orphaning it.
            if self.conn is not None:
                self.conn.close()
            try:
                os.unlink(self.updateName)
            except OSError:
                pass
            raise

    def __enter__(self):
        """Provide the 'as' value"""
        return self.conn

    def __exit__(self, excType, excValue, traceback):
        """Commit and close the working copy; install it as the registry on success

        If the 'with' body raised, the working copy is committed and closed but
        not moved into place (it is left on disk under its temporary name).
        """
        self.conn.commit()
        self.conn.close()
        if excType is None:
            if os.path.exists(self.registryName):
                os.unlink(self.registryName)
            os.rename(self.updateName, self.registryName)
            os.chmod(self.registryName, self.permissions)
        return False  # Don't suppress any exceptions
252 
253 
@contextmanager
def fakeContext():
    """A context manager that doesn't provide any context

    Useful for dry runs where we don't want to actually do anything real.
    The 'as' target of the 'with' statement is bound to None.
    """
    yield
261 
262 
class RegisterTask(Task):
    """Task that will generate the registry for the Mapper"""
    ConfigClass = RegisterConfig
    placeHolder = '?'  # Placeholder for parameter substitution; this value suitable for sqlite3
    typemap = {'text': str, 'int': int, 'double': float}  # Mapping database type --> python type

    def openRegistry(self, directory, create=False, dryrun=False, name="registry.sqlite3"):
        """Open the registry and return the connection handle.

        @param directory  Directory in which the registry file will be placed
        @param create  Clobber any existing registry and create a new one?
        @param dryrun  Don't do anything permanent?
        @param name    Filename of the registry
        @return Context manager; its 'as' value is the database connection
                (None for a dry run)
        """
        if dryrun:
            return fakeContext()

        registryName = os.path.join(directory, name)
        context = RegistryContext(registryName, self.createTable, create, self.config.permissions)
        return context

    def createTable(self, conn, table=None):
        """Create the registry tables

        One table (typically 'raw') contains information on all files, and the
        other (typically 'raw_visit') contains information on all visits.
        Any existing tables of the same names are dropped first, so that
        re-creating tables in a copy of an existing registry succeeds.

        @param conn    Database connection
        @param table   Name of table to create in database
        """
        if table is None:
            table = self.config.table
        cursor = conn.cursor()

        # Without this, "create table" fails with "table already exists" when
        # the registry being (re-)created was copied from an existing one.
        cursor.execute("drop table if exists %s" % table)
        cursor.execute("drop table if exists %s_visit" % table)

        cmd = "create table %s (id integer primary key autoincrement, " % table
        cmd += ",".join([("%s %s" % (col, colType)) for col, colType in self.config.columns.items()])
        if len(self.config.unique) > 0:
            cmd += ", unique(" + ",".join(self.config.unique) + ")"
        cmd += ")"
        cursor.execute(cmd)

        cmd = "create table %s_visit (" % table
        cmd += ",".join([("%s %s" % (col, self.config.columns[col])) for col in self.config.visit])
        # Only the unique columns that are also visit columns can be constrained
        # here; an empty "unique()" clause would be invalid SQL, so omit it then.
        uniqueCols = set(self.config.unique)
        visitUnique = [col for col in self.config.visit if col in uniqueCols]
        if visitUnique:
            cmd += ", unique(" + ",".join(visitUnique) + ")"
        cmd += ")"
        cursor.execute(cmd)

        conn.commit()

    def check(self, conn, info, table=None):
        """Check for the presence of a row already

        Not sure this is required, given the 'ignore' configuration option.

        @param conn    Database connection (None in a dry run)
        @param info    File properties; must contain the configured unique columns
        @param table   Name of table in database
        @return True if a row matching the unique columns already exists
        """
        if table is None:
            table = self.config.table
        if self.config.ignore or len(self.config.unique) == 0:
            return False  # Our entry could already be there, but we don't care
        if conn is None:
            # Dry run: openRegistry supplied no connection, so there is nothing to query
            return False
        cursor = conn.cursor()
        sql = "SELECT COUNT(*) FROM %s WHERE " % table
        sql += " AND ".join(["%s = %s" % (col, self.placeHolder) for col in self.config.unique])
        values = [self.typemap[self.config.columns[col]](info[col]) for col in self.config.unique]

        cursor.execute(sql, values)
        return cursor.fetchone()[0] > 0

    def addRow(self, conn, info, dryrun=False, create=False, table=None):
        """Add a row to the file table (typically 'raw').

        @param conn    Database connection
        @param info    File properties to add to database
        @param dryrun  Only print the statement that would be executed?
        @param create  Unused by this implementation; kept for interface compatibility
        @param table   Name of table in database
        """
        if table is None:
            table = self.config.table
        sql = "INSERT INTO %s (%s) SELECT " % (table, ",".join(self.config.columns))
        sql += ",".join([self.placeHolder] * len(self.config.columns))
        values = [self.typemap[tt](info[col]) for col, tt in self.config.columns.items()]

        if self.config.ignore:
            # Insert only if no row with the same unique-column values exists
            sql += " WHERE NOT EXISTS (SELECT 1 FROM %s WHERE " % table
            sql += " AND ".join(["%s=%s" % (col, self.placeHolder) for col in self.config.unique])
            sql += ")"
            # Convert with the same type map used for the inserted values and in check()
            values += [self.typemap[self.config.columns[col]](info[col]) for col in self.config.unique]

        if dryrun:
            print("Would execute: '%s' with %s" % (sql, ",".join([str(value) for value in values])))
        else:
            conn.cursor().execute(sql, values)

    def addVisits(self, conn, dryrun=False, table=None):
        """Generate the visits table (typically 'raw_visits') from the
        file table (typically 'raw').

        Only visits not yet present in the visits table are added.

        @param conn    Database connection
        @param dryrun  Only print the statement that would be executed?
        @param table   Name of table in database
        """
        if table is None:
            table = self.config.table
        sql = "INSERT INTO %s_visit SELECT DISTINCT " % table
        sql += ",".join(self.config.visit)
        sql += " FROM %s AS vv1" % table
        sql += " WHERE NOT EXISTS "
        sql += "(SELECT vv2.visit FROM %s_visit AS vv2 WHERE vv1.visit = vv2.visit)" % (table,)
        if dryrun:
            print("Would execute: %s" % sql)
        else:
            conn.cursor().execute(sql)
372 
373 
class IngestConfig(Config):
    """Configuration for IngestTask"""
    # Sub-tasks: one extracts the file properties, the other records them
    parse = ConfigurableField(
        target=ParseTask,
        doc="File parsing",
    )
    register = ConfigurableField(
        target=RegisterTask,
        doc="Registry entry",
    )
    # Behaviour switches
    allowError = Field(
        dtype=bool,
        default=False,
        doc="Allow error in ingestion?",
    )
    clobber = Field(
        dtype=bool,
        default=False,
        doc="Clobber existing file?",
    )
380 
381 
class IngestTask(Task):
    """Task that will ingest images into the data repository"""
    ConfigClass = IngestConfig
    ArgumentParser = IngestArgumentParser
    _DefaultName = "ingest"

    def __init__(self, *args, **kwargs):
        super(IngestTask, self).__init__(*args, **kwargs)
        self.makeSubtask("parse")
        self.makeSubtask("register")

    @classmethod
    def parseAndRun(cls):
        """Parse the command-line arguments and run the Task"""
        config = cls.ConfigClass()
        parser = cls.ArgumentParser(name=cls._DefaultName)
        args = parser.parse_args(config)
        task = cls(config=args.config)
        task.run(args)

    def ingest(self, infile, outfile, mode="move", dryrun=False):
        """Ingest a file into the image repository.

        @param infile  Name of input file
        @param outfile  Name of output file (file in repository)
        @param mode  Mode of ingest (copy/link/move/skip)
        @param dryrun  Only report what would occur?
        @return Success boolean
        """
        if mode == "skip":
            return True
        if dryrun:
            self.log.info("Would %s from %s to %s" % (mode, infile, outfile))
            return True
        try:
            outdir = os.path.dirname(outfile)
            # An empty outdir means "current directory": nothing to create
            if outdir and not os.path.isdir(outdir):
                try:
                    os.makedirs(outdir)
                except OSError:
                    # Silently ignore mkdir failures due to race conditions
                    if not os.path.isdir(outdir):
                        raise
            if os.path.lexists(outfile):
                if self.config.clobber:
                    os.unlink(outfile)
                else:
                    raise RuntimeError("File %s already exists; consider --config clobber=True" % outfile)

            if mode == "copy":
                assertCanCopy(infile, outfile)
                shutil.copyfile(infile, outfile)
            elif mode == "link":
                os.symlink(os.path.abspath(infile), outfile)
            elif mode == "move":
                assertCanCopy(infile, outfile)
                # shutil.move renames when possible and falls back to copy+delete
                # when source and destination are on different filesystems
                shutil.move(infile, outfile)
            else:
                raise AssertionError("Unknown mode: %s" % mode)
            self.log.info("%s --<%s>--> %s" % (infile, mode, outfile))
        except Exception as e:
            self.log.warn("Failed to %s %s to %s: %s" % (mode, infile, outfile, e))
            if not self.config.allowError:
                raise
            return False
        return True

    def isBadFile(self, filename, badFileList):
        """Return whether the file qualifies as bad

        We match the file's base name against the list of bad file patterns.

        @param filename  Name of file (any directory part is ignored)
        @param badFileList  List of file name patterns (fnmatch-style wildcards); may be empty or None
        @return True if any pattern matches
        """
        if not badFileList:
            return False
        filename = os.path.basename(filename)
        return any(fnmatch(filename, badFile) for badFile in badFileList)

    def isBadId(self, info, badIdList):
        """Return whether the file information qualifies as bad

        We match against the list of bad data identifiers: the file is bad if,
        for some identifier, every key/value pair equals the file's property.

        @param info  File properties
        @param badIdList  List of data identifiers (dicts); may be empty or None
        @return True if any identifier matches
        """
        if not badIdList:
            return False
        return any(all(info[key] == value for key, value in badId.items()) for badId in badIdList)

    def expandFiles(self, fileNameList):
        """!Expand a set of filenames and globs, returning a list of filenames

        @param fileNameList A list of files and glob patterns

        N.b. unlike Posix shell semantics, a pattern that matches nothing is not
        returned unchanged: it is dropped with a warning.
        """
        filenameList = []
        for globPattern in fileNameList:
            files = glob(globPattern)

            if not files:
                self.log.warn("%s doesn't match any file" % globPattern)
                continue

            filenameList.extend(files)

        return filenameList

    def run(self, args):
        """Ingest all specified files and add them to the registry

        @param args  Parsed command-line arguments (as produced by the ArgumentParser)
        """
        filenameList = self.expandFiles(args.files)
        root = args.input
        context = self.register.openRegistry(root, create=args.create, dryrun=args.dryrun)
        with context as registry:
            for infile in filenameList:
                try:
                    if self.isBadFile(infile, args.badFile):
                        self.log.info("Skipping declared bad file %s" % infile)
                        continue
                    try:
                        fileInfo, hduInfoList = self.parse.getInfo(infile)
                    except Exception as e:
                        if not self.config.allowError:
                            raise
                        self.log.warn("Error parsing %s (%s); skipping" % (infile, e))
                        continue
                    if self.isBadId(fileInfo, args.badId.idList):
                        self.log.info("Skipping declared bad file %s: %s" % (infile, fileInfo))
                        continue
                    # With the stock RegisterTask.openRegistry a dry run yields no
                    # connection (registry is None), so there is nothing to check against.
                    if registry is not None and self.register.check(registry, fileInfo):
                        if args.ignoreIngested:
                            continue

                        self.log.warn("%s: already ingested: %s" % (infile, fileInfo))
                    outfile = self.parse.getDestination(args.butler, fileInfo, infile)
                    ingested = self.ingest(infile, outfile, mode=args.mode, dryrun=args.dryrun)
                    if not ingested:
                        continue
                    for info in hduInfoList:
                        self.register.addRow(registry, info, dryrun=args.dryrun, create=args.create)
                except Exception as exc:
                    # Any failure for one file (including errors re-raised above) is
                    # reported and the loop moves on, so the registry entries for the
                    # files already delivered still get written when the context exits.
                    self.log.warn("Failed to ingest file %s: %s", infile, exc)
            self.register.addVisits(registry, dryrun=args.dryrun)
528 
529 
def assertCanCopy(fromPath, toPath):
    """Can I copy a file?  Raise an exception if space constraints not met.

    The size of the source file is compared with the space available to
    unprivileged users on the filesystem holding the destination directory.

    @param fromPath    Path from which the file is being copied
    @param toPath      Path to which the file is being copied
    @throw RuntimeError if the available space is smaller than the file
    """
    req = os.stat(fromPath).st_size
    # A bare file name has an empty dirname, which statvfs rejects: use the current directory
    st = os.statvfs(os.path.dirname(toPath) or os.curdir)
    avail = st.f_bavail * st.f_frsize
    if avail < req:
        raise RuntimeError("Insufficient space: %d vs %d" % (req, avail))
def ingest(self, infile, outfile, mode="move", dryrun=False)
Definition: ingest.py:402
def createTable(self, conn, table=None)
Definition: ingest.py:285
def expandFiles(self, fileNameList)
Expand a set of filenames and globs, returning a list of filenames.
Definition: ingest.py:474
def __exit__(self, excType, excValue, traceback)
Definition: ingest.py:242
def getInfo(self, filename)
Definition: ingest.py:58
def getDestination(self, butler, info, filename)
Definition: ingest.py:165
def isBadFile(self, filename, badFileList)
Definition: ingest.py:449
def __init__(self, registryName, createTableFunc, forceCreateTables, permissions)
Definition: ingest.py:211
def getInfoFromMetadata(self, md, info={})
Definition: ingest.py:105
def assertCanCopy(fromPath, toPath)
Definition: ingest.py:530
def check(self, conn, info, table=None)
Definition: ingest.py:311
def __init__(self, *args, **kwargs)
Definition: ingest.py:26
def addVisits(self, conn, dryrun=False, table=None)
Definition: ingest.py:354
def openRegistry(self, directory, create=False, dryrun=False, name="registry.sqlite3")
Definition: ingest.py:269
def __init__(self, *args, **kwargs)
Definition: ingest.py:388
def addRow(self, conn, info, dryrun=False, create=False, table=None)
Definition: ingest.py:330
def isBadId(self, info, badIdList)
Definition: ingest.py:462