lsst.pipe.tasks  13.0-29-g7046ce1+1
 All Classes Namespaces Files Functions Variables Groups Pages
ingest.py
Go to the documentation of this file.
1 from __future__ import absolute_import, division, print_function
2 from past.builtins import basestring
3 from builtins import object
4 import os
5 import shutil
6 import tempfile
7 try:
8  import sqlite3
9 except ImportError:
10  # try external pysqlite package; deprecated
11  import sqlite as sqlite3
12 from fnmatch import fnmatch
13 from glob import glob
14 from contextlib import contextmanager
15 
16 from lsst.pex.config import Config, Field, DictField, ListField, ConfigurableField
17 import lsst.pex.exceptions
18 from lsst.pipe.base import Task, InputOnlyArgumentParser
19 import lsst.afw.image as afwImage
20 
21 
class IngestArgumentParser(InputOnlyArgumentParser):
    """Command-line parser for ingesting images into the image repository.

    On top of the arguments provided by the base parser, this adds the
    options that select how files are delivered, whether the registry is
    re-created, and which files or data identifiers are to be treated as bad.
    """

    def __init__(self, *args, **kwargs):
        super(IngestArgumentParser, self).__init__(*args, **kwargs)

        # Options controlling what is done with each file
        deliveryModes = ["move", "copy", "link", "skip"]
        self.add_argument(
            "-n", "--dry-run",
            dest="dryrun",
            action="store_true",
            default=False,
            help="Don't perform any action?",
        )
        self.add_argument(
            "--mode",
            choices=deliveryModes,
            default="link",
            help="Mode of delivering the files to their destination",
        )
        self.add_argument(
            "--create",
            action="store_true",
            help="Create new registry (clobber old)?",
        )

        # Options identifying data that must not be ingested
        self.add_id_argument("--badId", "raw", "Data identifier for bad data", doMakeDataRefList=False)
        self.add_argument(
            "--badFile",
            nargs="*",
            default=[],
            help="Names of bad files (no path; wildcards allowed)",
        )

        # Positional argument: the files to ingest
        self.add_argument("files", nargs="+", help="Names of file")
36 
37 
class ParseConfig(Config):
    """Configuration for ParseTask"""
    # property name --> header keyword copied directly into the property
    translation = DictField(
        doc="Translation table for property --> header",
        keytype=str,
        itemtype=str,
        default={},
    )
    # property name --> name of the ParseTask method that computes the property
    translators = DictField(
        doc="Properties and name of translator method",
        keytype=str,
        itemtype=str,
        default={},
    )
    # property name --> fallback used when the translation header is absent
    defaults = DictField(
        doc="Default values if header is not present",
        keytype=str,
        itemtype=str,
        default={},
    )
    hdu = Field(doc="HDU to read for metadata", dtype=int, default=0)
    extnames = ListField(doc="Extension names to search for", dtype=str, default=[])
48 
49 
class ParseTask(Task):
    """Task that will parse the filename and/or its contents to get the required information
    for putting the file in the correct location and populating the registry."""
    ConfigClass = ParseConfig

    def getInfo(self, filename):
        """Get information about the image from the filename and its contents

        Here, we open the image and parse the header, but one could also look at the filename itself
        and derive information from that, or set values from the configuration.

        When no extension names are configured, the properties of the configured
        HDU are returned both as the file properties and as the only element of
        the list. Otherwise the extensions are read in order, starting at HDU 1,
        until every configured extension name has been found or a read fails.

        @param filename    Name of file to inspect
        @return File properties; list of file properties for each extension
        """
        md = afwImage.readMetadata(filename, self.config.hdu)
        phuInfo = self.getInfoFromMetadata(md)
        if len(self.config.extnames) == 0:
            # No extensions to worry about
            return phuInfo, [phuInfo]
        # Look in the provided extensions
        extnames = set(self.config.extnames)
        extnum = 0
        infoList = []
        while len(extnames) > 0:
            extnum += 1
            try:
                md = afwImage.readMetadata(filename, extnum)
            except Exception:
                # Stop searching on the first unreadable HDU; KeyboardInterrupt
                # and SystemExit are deliberately allowed to propagate.
                self.log.warn("Error reading %s extensions %s" % (filename, extnames))
                break
            ext = self.getExtensionName(md)
            if ext in extnames:
                # Start from a copy so that each extension has its own dict
                hduInfo = self.getInfoFromMetadata(md, info=phuInfo.copy())
                # We need the HDU number when registering MEF files.
                hduInfo["hdu"] = extnum
                infoList.append(hduInfo)
                extnames.discard(ext)
        return phuInfo, infoList

    @staticmethod
    def getExtensionName(md):
        """ Get the name of an extension.
        @param md: PropertySet like one obtained from afwImage.readMetadata)
        @return Name of the extension if it exists.  None otherwise.
        """
        try:
            # This returns a tuple
            ext = md.get("EXTNAME")
            return ext[1]
        except lsst.pex.exceptions.Exception:
            return None

    def getInfoFromMetadata(self, md, info=None):
        """Attempt to pull the desired information out of the header

        This is done through two mechanisms:
        * translation: a property is set directly from the relevant header keyword
        * translator: a property is set with the result of calling a method

        The translator methods receive the header metadata and should return the
        appropriate value, or None if the value cannot be determined.

        @param md      FITS header
        @param info    File properties, to be supplemented; it is modified in place.
                       A new dict is created when this is None.
        @return info
        """
        if info is None:
            # A fresh dict per call: a mutable default would be shared between
            # calls and leak properties from one file into the next.
            info = {}
        for p, h in self.config.translation.items():
            if md.exists(h):
                value = md.get(h)
                if isinstance(value, basestring):
                    value = value.strip()
                info[p] = value
            elif p in self.config.defaults:
                info[p] = self.config.defaults[p]
            else:
                self.log.warn("Unable to find value for %s (derived from %s)" % (p, h))
        for p, t in self.config.translators.items():
            func = getattr(self, t)
            try:
                value = func(md)
            except Exception as e:
                # A failing translator only costs us this one property
                self.log.warn("%s failed to translate %s: %s", t, p, e)
                value = None
            if value is not None:
                info[p] = value
        return info

    def translate_date(self, md):
        """Convert a full DATE-OBS to a mere date

        Besides being an example of a translator, this is also generally useful.
        It will only be used if listed as a translator in the configuration.

        @param md      FITS header
        @return The DATE-OBS value truncated at the first "T", if there is one
        """
        date = md.get("DATE-OBS").strip()
        c = date.find("T")
        if c > 0:
            date = date[:c]
        return date

    def translate_filter(self, md):
        """Translate a full filter description into a mere filter name

        Besides being an example of a translator, this is also generally useful.
        It will only be used if listed as a translator in the configuration.

        @param md      FITS header
        @return The FILTER value truncated at the first space, if there is one
        """
        filterName = md.get("FILTER").strip()
        c = filterName.find(" ")
        if c > 0:
            filterName = filterName[:c]
        return filterName

    def getDestination(self, butler, info, filename):
        """Get destination for the file

        @param butler      Data butler
        @param info        File properties, used as dataId for the butler
        @param filename    Input filename (not used by this implementation)
        @return Destination filename
        """
        raw = butler.get("raw_filename", info)[0]
        # Ensure filename is devoid of cfitsio directions about HDUs
        c = raw.find("[")
        if c > 0:
            raw = raw[:c]
        return raw
176 
177 
class RegisterConfig(Config):
    """Configuration for the RegisterTask"""
    table = Field(doc="Name of table", dtype=str, default="raw")
    columns = DictField(
        doc="List of columns for raw table, with their types",
        keytype=str,
        itemtype=str,
        # Only these column types are accepted
        itemCheck=lambda x: x in ("text", "int", "double"),
        default={
            'object': 'text',
            'visit': 'int',
            'ccd': 'int',
            'filter': 'text',
            'date': 'text',
            'taiObs': 'text',
            'expTime': 'double',
        },
    )
    unique = ListField(
        doc="List of columns to be declared unique for the table",
        dtype=str,
        default=["visit", "ccd"],
    )
    visit = ListField(
        doc="List of columns for raw_visit table",
        dtype=str,
        default=["visit", "object", "date", "filter"],
    )
    ignore = Field(doc="Ignore duplicates in the table?", dtype=bool, default=False)
    # octal 664 = rw-rw-r--
    permissions = Field(doc="Permissions mode for registry", dtype=int, default=0o664)
198 
199 
class RegistryContext(object):
    """Context manager to provide a registry

    An existing registry is copied, so that it may continue
    to be used while we add to this new registry.  Finally,
    the new registry is moved into the right place.
    """

    def __init__(self, registryName, createTableFunc, forceCreateTables, permissions):
        """Construct a context manager

        @param registryName: Name of registry file
        @param createTableFunc: Function to create tables
        @param forceCreateTables: Force the (re-)creation of tables?
        @param permissions: Permissions to set on database file
        """
        self.registryName = registryName
        self.permissions = permissions

        # The working copy lives next to the registry.  Only the base name is
        # used as the prefix: the prefix is appended to 'dir', so passing the
        # full (relative) path would repeat the directory component.
        updateFile = tempfile.NamedTemporaryFile(prefix=os.path.basename(registryName),
                                                 dir=os.path.dirname(registryName),
                                                 delete=False)
        self.updateName = updateFile.name
        # Only the name is needed; release the handle straight away
        updateFile.close()

        haveTable = False
        if os.path.exists(registryName):
            assertCanCopy(registryName, self.updateName)
            os.chmod(self.updateName, os.stat(registryName).st_mode)
            shutil.copyfile(registryName, self.updateName)
            haveTable = True

        self.conn = sqlite3.connect(self.updateName)
        if not haveTable or forceCreateTables:
            createTableFunc(self.conn)
        os.chmod(self.updateName, self.permissions)

    def __enter__(self):
        """Provide the 'as' value"""
        return self.conn

    def __exit__(self, excType, excValue, traceback):
        """Commit and close the working copy, installing it on success

        The working copy replaces the registry only when the body of the
        'with' statement completed without raising.
        """
        self.conn.commit()
        self.conn.close()
        if excType is None:
            if os.path.exists(self.registryName):
                os.unlink(self.registryName)
            os.rename(self.updateName, self.registryName)
            os.chmod(self.registryName, self.permissions)
        return False  # Don't suppress any exceptions
249 
250 
@contextmanager
def fakeContext():
    """A context manager that doesn't provide any context

    Useful for dry runs where we don't want to actually do anything real.
    The value bound by 'as' is None.
    """
    yield
258 
259 
class RegisterTask(Task):
    """Task that will generate the registry for the Mapper"""
    ConfigClass = RegisterConfig
    placeHolder = '?'  # Placeholder for parameter substitution; this value suitable for sqlite3
    typemap = {'text': str, 'int': int, 'double': float}  # Mapping database type --> python type

    def openRegistry(self, directory, create=False, dryrun=False, name="registry.sqlite3"):
        """Open the registry and return the connection handle.

        The returned object is a context manager; the database connection is
        the value bound by 'as'.  For a dry run that value is None.

        @param directory  Directory in which the registry file will be placed
        @param create  Clobber any existing registry and create a new one?
        @param dryrun  Don't do anything permanent?
        @param name    Filename of the registry
        @return Database connection
        """
        if dryrun:
            return fakeContext()

        registryName = os.path.join(directory, name)
        context = RegistryContext(registryName, self.createTable, create, self.config.permissions)
        return context

    def createTable(self, conn, table=None):
        """Create the registry tables

        One table (typically 'raw') contains information on all files, and the
        other (typically 'raw_visit') contains information on all visits.

        @param conn    Database connection
        @param table   Name of table to create in database
        """
        if table is None:
            table = self.config.table
        cmd = "create table %s (id integer primary key autoincrement, " % table
        cmd += ",".join([("%s %s" % (col, colType)) for col, colType in self.config.columns.items()])
        if len(self.config.unique) > 0:
            cmd += ", unique(" + ",".join(self.config.unique) + ")"
        cmd += ")"
        conn.cursor().execute(cmd)

        # Unique columns that also appear in the visit table.  An empty
        # "unique()" clause is not valid SQL, so it is only added when there is
        # something to put in it.
        visitUnique = [col for col in self.config.visit if col in self.config.unique]
        cmd = "create table %s_visit (" % table
        cmd += ",".join([("%s %s" % (col, self.config.columns[col])) for col in self.config.visit])
        if visitUnique:
            cmd += ", unique(" + ",".join(visitUnique) + ")"
        cmd += ")"
        conn.cursor().execute(cmd)

        conn.commit()

    def check(self, conn, info, table=None):
        """Check for the presence of a row already

        Not sure this is required, given the 'ignore' configuration option.

        @param conn    Database connection
        @param info    File properties; must contain every 'unique' column
        @param table   Name of table in database
        @return True if a row matching all the 'unique' columns exists; always
                False when duplicates are ignored or no column is unique
        """
        if table is None:
            table = self.config.table
        if self.config.ignore or len(self.config.unique) == 0:
            return False  # Our entry could already be there, but we don't care
        cursor = conn.cursor()
        sql = "SELECT COUNT(*) FROM %s WHERE " % table
        sql += " AND ".join(["%s = %s" % (col, self.placeHolder) for col in self.config.unique])
        # Convert each value to the python type matching its column type
        values = [self.typemap[self.config.columns[col]](info[col]) for col in self.config.unique]

        cursor.execute(sql, values)
        return cursor.fetchone()[0] > 0

    def addRow(self, conn, info, dryrun=False, create=False, table=None):
        """Add a row to the file table (typically 'raw').

        When the 'ignore' configuration option is set, the row is only
        inserted if no row with the same 'unique' columns is present.

        @param conn    Database connection
        @param info    File properties to add to database
        @param dryrun  Only print the statement instead of executing it?
        @param create  Not used by this implementation
        @param table   Name of table in database
        """
        if table is None:
            table = self.config.table
        sql = "INSERT INTO %s (%s) SELECT " % (table, ",".join(self.config.columns))
        sql += ",".join([self.placeHolder] * len(self.config.columns))
        values = [self.typemap[tt](info[col]) for col, tt in self.config.columns.items()]

        if self.config.ignore:
            # Look for duplicates in the table being inserted into, which is
            # not necessarily the configured default table.
            sql += " WHERE NOT EXISTS (SELECT 1 FROM %s WHERE " % table
            sql += " AND ".join(["%s=%s" % (col, self.placeHolder) for col in self.config.unique])
            sql += ")"
            values += [info[col] for col in self.config.unique]

        if dryrun:
            print("Would execute: '%s' with %s" % (sql, ",".join([str(value) for value in values])))
        else:
            conn.cursor().execute(sql, values)

    def addVisits(self, conn, dryrun=False, table=None):
        """Generate the visits table (typically 'raw_visit') from the
        file table (typically 'raw').

        Only visits that are not yet present in the visits table are added.

        @param conn    Database connection
        @param dryrun  Only print the statement instead of executing it?
        @param table   Name of table in database
        """
        if table is None:
            table = self.config.table
        sql = "INSERT INTO %s_visit SELECT DISTINCT " % table
        sql += ",".join(self.config.visit)
        sql += " FROM %s AS vv1" % table
        sql += " WHERE NOT EXISTS "
        sql += "(SELECT vv2.visit FROM %s_visit AS vv2 WHERE vv1.visit = vv2.visit)" % (table,)
        if dryrun:
            print("Would execute: %s" % sql)
        else:
            conn.cursor().execute(sql)
369 
370 
class IngestConfig(Config):
    """Configuration for IngestTask"""
    # Sub-tasks
    parse = ConfigurableField(doc="File parsing", target=ParseTask)
    register = ConfigurableField(doc="Registry entry", target=RegisterTask)
    # Behaviour switches
    allowError = Field(doc="Allow error in ingestion?", dtype=bool, default=False)
    clobber = Field(doc="Clobber existing file?", dtype=bool, default=False)
377 
378 
class IngestTask(Task):
    """Task that will ingest images into the data repository"""
    ConfigClass = IngestConfig
    ArgumentParser = IngestArgumentParser
    _DefaultName = "ingest"

    def __init__(self, *args, **kwargs):
        super(IngestTask, self).__init__(*args, **kwargs)
        self.makeSubtask("parse")
        self.makeSubtask("register")

    @classmethod
    def parseAndRun(cls):
        """Parse the command-line arguments and run the Task"""
        config = cls.ConfigClass()
        parser = cls.ArgumentParser(name=cls._DefaultName)
        args = parser.parse_args(config)
        task = cls(config=args.config)
        task.run(args)

    def ingest(self, infile, outfile, mode="move", dryrun=False):
        """Ingest a file into the image repository.

        Failures are logged; they are re-raised unless the 'allowError'
        configuration option is set.

        @param infile  Name of input file
        @param outfile  Name of output file (file in repository)
        @param mode  Mode of ingest (copy/link/move/skip)
        @param dryrun  Only report what would occur?
        @return Success boolean
        """
        if mode == "skip":
            return True
        if dryrun:
            self.log.info("Would %s from %s to %s" % (mode, infile, outfile))
            return True
        try:
            outdir = os.path.dirname(outfile)
            if not os.path.isdir(outdir):
                try:
                    os.makedirs(outdir)
                except OSError:
                    # Silently ignore mkdir failures due to race conditions
                    if not os.path.isdir(outdir):
                        raise
            if self.config.clobber and os.path.lexists(outfile):
                os.unlink(outfile)
            if mode == "copy":
                assertCanCopy(infile, outfile)
                shutil.copyfile(infile, outfile)
            elif mode == "link":
                os.symlink(os.path.abspath(infile), outfile)
            elif mode == "move":
                assertCanCopy(infile, outfile)
                # shutil.move renames when it can and falls back to
                # copy-and-delete when the destination is on another filesystem
                shutil.move(infile, outfile)
            else:
                raise AssertionError("Unknown mode: %s" % mode)
            self.log.info("%s --<%s>--> %s" % (infile, mode, outfile))
        except Exception as e:
            self.log.warn("Failed to %s %s to %s: %s" % (mode, infile, outfile, e))
            if not self.config.allowError:
                raise
            return False
        return True

    def isBadFile(self, filename, badFileList):
        """Return whether the file qualifies as bad

        We match against the list of bad file patterns.  Only the base name of
        the file is compared with the patterns.

        @param filename     Name of file to test
        @param badFileList  List of wildcard patterns (may be empty or None)
        @return True if the file name matches any of the patterns
        """
        if not badFileList:
            return False
        filename = os.path.basename(filename)
        return any(fnmatch(filename, badFile) for badFile in badFileList)

    def isBadId(self, info, badIdList):
        """Return whether the file information qualifies as bad

        We match against the list of bad data identifiers.

        @param info       File properties
        @param badIdList  List of data identifiers (may be empty or None)
        @return True if every key/value pair of some identifier matches info
        """
        if not badIdList:
            return False
        for badId in badIdList:
            if all(info[key] == value for key, value in badId.items()):
                return True
        return False

    def run(self, args):
        """Ingest all specified files and add them to the registry

        @param args  Parsed command-line arguments; this reads the 'files',
                     'input', 'create', 'dryrun', 'mode', 'badFile', 'badId'
                     and 'butler' attributes
        """
        # Expand wildcards, keeping the order in which the files were given
        filenameList = [filename for pattern in args.files for filename in glob(pattern)]
        root = args.input
        context = self.register.openRegistry(root, create=args.create, dryrun=args.dryrun)
        with context as registry:
            for infile in filenameList:
                if self.isBadFile(infile, args.badFile):
                    self.log.info("Skipping declared bad file %s" % infile)
                    continue
                try:
                    fileInfo, hduInfoList = self.parse.getInfo(infile)
                except Exception as e:
                    if not self.config.allowError:
                        raise
                    self.log.warn("Error parsing %s (%s); skipping" % (infile, e))
                    continue
                if self.isBadId(fileInfo, args.badId.idList):
                    self.log.info("Skipping declared bad file %s: %s" % (infile, fileInfo))
                    continue
                # There is no registry to query in a dry run (it is None)
                if registry is not None and self.register.check(registry, fileInfo):
                    self.log.warn("%s: already ingested: %s" % (infile, fileInfo))
                outfile = self.parse.getDestination(args.butler, fileInfo, infile)
                ingested = self.ingest(infile, outfile, mode=args.mode, dryrun=args.dryrun)
                if not ingested:
                    continue
                for info in hduInfoList:
                    self.register.addRow(registry, info, dryrun=args.dryrun, create=args.create)
            self.register.addVisits(registry, dryrun=args.dryrun)
496 
497 
def assertCanCopy(fromPath, toPath):
    """Can I copy a file?  Raise an exception if space constraints not met.

    The size of the source file is compared with the space available on the
    filesystem holding the destination directory.

    @param fromPath    Path from which the file is being copied
    @param toPath      Path to which the file is being copied
    @throw RuntimeError if the destination filesystem has less space available
           than the size of the source file
    """
    req = os.stat(fromPath).st_size
    # A bare file name has an empty dirname, which statvfs rejects; such a
    # destination lives in the current directory.
    st = os.statvfs(os.path.dirname(toPath) or os.curdir)
    avail = st.f_bavail * st.f_frsize
    if avail < req:
        raise RuntimeError("Insufficient space: %d vs %d" % (req, avail))
508  raise RuntimeError("Insufficient space: %d vs %d" % (req, avail))