lsst.obs.base  16.0-15-gd0383af
ingest.py

# This file is part of obs_base.
#
# Developed for the LSST Data Management System.
# This product includes software developed by the LSST Project
# (http://www.lsst.org).
# See the COPYRIGHT file at the top-level directory of this distribution
# for details of code ownership.
#
# This program is free software: you can redistribute it and/or modify
# it under the terms of the GNU General Public License as published by
# the Free Software Foundation, either version 3 of the License, or
# (at your option) any later version.
#
# This program is distributed in the hope that it will be useful,
# but WITHOUT ANY WARRANTY; without even the implied warranty of
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
# GNU General Public License for more details.
#
# You should have received a copy of the GNU General Public License
# along with this program. If not, see <http://www.gnu.org/licenses/>.


__all__ = ("RawIngestTask", "RawIngestConfig", "VisitInfoRawIngestTask")

import os.path
from abc import ABCMeta, abstractmethod

from lsst.afw.image import readMetadata
from lsst.daf.butler import DatasetType, StorageClassFactory, Run
from lsst.daf.butler.instrument import makeExposureEntryFromVisitInfo, makeVisitEntryFromVisitInfo
from lsst.pex.config import Config, Field, ChoiceField
from lsst.pipe.base import Task


class IngestConflictError(RuntimeError):
    pass


class RawIngestConfig(Config):
    transfer = ChoiceField(
        "How to transfer files (None for no transfer).",
        dtype=str,
        allowed={"move": "move",
                 "copy": "copy",
                 "hardlink": "hard link",
                 "symlink": "symbolic (soft) link"},
        optional=True,
    )
    conflict = ChoiceField(
        ("What to do if a raw Dataset with the same data ID as an "
         "ingested file already exists in the Butler's Collection."),
        dtype=str,
        allowed={"ignore": ("Do not add the new file to the Collection. If "
                            "'stash' is not None, the new file will be "
                            "ingested into the stash Collection instead."),
                 "fail": ("Raise RuntimeError if a conflict is encountered "
                          "(which may then be caught if onError == 'continue')."),
                 },
        optional=False,
        default="ignore",
    )
    stash = Field(
        "Name of an alternate Collection to hold Datasets that lose conflicts.",
        dtype=str,
        default=None,
    )
    onError = ChoiceField(
        "What to do if an error (including fatal conflicts) occurs.",
        dtype=str,
        allowed={"continue": "Warn and continue with the next file.",
                 "break": ("Stop processing immediately, but leave "
                           "already-ingested datasets in the repository."),
                 "rollback": ("Stop processing and attempt to remove already-"
                              "ingested datasets from the repository."),
                 },
        optional=False,
        default="continue",
    )
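

# A minimal configuration sketch (the values below are hypothetical, not
# defaults): symlink the files into the repository, keep the existing
# Dataset on conflicts while stashing losers in a separate Collection, and
# roll back everything already ingested if a fatal error occurs.
#
#     config = RawIngestConfig()
#     config.transfer = "symlink"
#     config.conflict = "ignore"
#     config.stash = "raw/stash"
#     config.onError = "rollback"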


class RawIngestTask(Task, metaclass=ABCMeta):
    """Driver Task for ingesting raw data into Gen3 Butler repositories.

    This Task is intended to be runnable from the command-line, but it
    doesn't meet the other requirements of CmdLineTask or PipelineTask and
    wouldn't gain much from being one. It also wouldn't really be
    appropriate as a subtask of a CmdLineTask or PipelineTask; it is a Task
    essentially just to leverage the logging and configurability
    functionality that Task provides.

    Each instance of `RawIngestTask` writes to the same Butler and maintains
    a cache of DataUnit entries that have already been added to or extracted
    from its Registry. Each invocation of `RawIngestTask.run` ingests a list
    of files (possibly semi-atomically; see `RawIngestConfig.onError`).

    RawIngestTask should be subclassed to specialize ingest for the actual
    structure of raw data files produced by a particular camera. Subclasses
    must provide implementations of the abstract `extractDataId`,
    `extractVisitEntry`, and `extractExposureEntry` methods. Subclasses may
    also wish to override `getFormatter` and/or (rarely) `getDatasetType`.
    We do not anticipate overriding `run`, `ensureDataUnits`, `ingestFile`,
    or `processFile` to ever be necessary.

    Parameters
    ----------
    config : `RawIngestConfig`
        Configuration for whether/how to transfer files and how to handle
        conflicts and errors.
    butler : `~lsst.daf.butler.Butler`
        Butler instance. Ingested Datasets will be created as part of
        ``butler.run`` and associated with its Collection.

    Other keyword arguments are forwarded to the Task base class
    constructor.
    """

    ConfigClass = RawIngestConfig

    _DefaultName = "ingest"

    @classmethod
    def getDatasetType(cls):
        """Return the DatasetType of the Datasets ingested by this Task.
        """
        return DatasetType("raw", ("Camera", "Sensor", "Exposure"),
                           StorageClassFactory().getStorageClass("Exposure"))

    def __init__(self, config=None, *, butler, **kwds):
        super().__init__(config, **kwds)
        self.butler = butler
        self.datasetType = self.getDatasetType()
        self.units = tuple(butler.registry.getDataUnitDefinition(k)
                           for k in ("Camera", "Sensor", "PhysicalFilter", "Visit", "Exposure"))
        # Nested dictionary of form {<unit-name>: {<primary-key-tuple>: {<field>: <value>}}}, where:
        #   - <unit-name> is a DataUnit name (e.g. Camera, Exposure)
        #   - <primary-key-tuple> is a tuple of values that correspond to the
        #     [compound] primary key for that DataUnit
        #     (TODO: make these DataId objects on DM-15034).
        #   - <field> is the name of a column in the table for this DataUnit.
        #   - <value> is the value of that field.
        # The {<field>: <value>} dict is called an "entry" in this class and
        # in Registry methods.
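        # For example (hypothetical camera "MyCam"):
        #     {"Camera": {("MyCam",): {"camera": "MyCam", ...}},
        #      "Sensor": {("MyCam", 12): {"camera": "MyCam", "sensor": 12, ...}},
        #      ...}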
        self.unitEntryCache = {k.name: {} for k in self.units}
        # (Possibly) create a Run object for the "stash": where we put
        # datasets that lose conflicts. Note that this doesn't actually add
        # this Run to the Registry; we only do that on first use.
        self.stashRun = Run(self.config.stash) if self.config.stash is not None else None

    def run(self, files):
        """Ingest files into a Butler data repository.

        This creates any new Exposure or Visit DataUnit entries needed to
        identify the ingested files, creates new Dataset entries in the
        Registry and finally ingests the files themselves into the
        Datastore. Any needed Camera, Sensor, and PhysicalFilter DataUnit
        entries must exist in the Registry before `run` is called.

        Parameters
        ----------
        files : iterable over `str` or path-like objects
            Paths to the files to be ingested. Will be made absolute
            if they are not already.
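
        Examples
        --------
        A minimal sketch; assumes ``butler`` is an existing Gen3 Butler and
        ``MyCamRawIngestTask`` is a hypothetical concrete subclass::

            task = MyCamRawIngestTask(butler=butler)
            task.run(["night1/raw_0001.fits", "night1/raw_0002.fits"])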
        """
        self.butler.registry.registerDatasetType(self.getDatasetType())
        if self.config.onError == "rollback":
            with self.butler.transaction():
                for file in files:
                    self.processFile(os.path.abspath(file))
        elif self.config.onError == "break":
            for file in files:
                self.processFile(os.path.abspath(file))
        elif self.config.onError == "continue":
            for file in files:
                try:
                    self.processFile(os.path.abspath(file))
                except Exception as err:
                    self.log.warnf("Error processing '{}': {}", file, err)

    def readHeaders(self, file):
        """Read and return any relevant headers from the given file.

        The default implementation simply reads the header of the first
        non-empty HDU, so it always returns a single-element list.

        Parameters
        ----------
        file : `str` or path-like object
            Absolute path to the file to be ingested.

        Returns
        -------
        headers : `list` of `~lsst.daf.base.PropertyList`
            Single-element list containing the header of the first
            non-empty HDU.
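
        Examples
        --------
        A subclass whose metadata is spread over several HDUs might
        override this along these lines (the two-HDU layout here is
        hypothetical)::

            def readHeaders(self, file):
                return [readMetadata(file, hdu=n) for n in (0, 1)]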
        """
        return [readMetadata(file)]

    def ensureDataUnits(self, file):
        """Extract metadata from a raw file and add Exposure and Visit
        DataUnit entries.

        Any needed Camera, Sensor, and PhysicalFilter DataUnit entries must
        exist in the Registry before `run` is called.

        Parameters
        ----------
        file : `str` or path-like object
            Absolute path to the file to be ingested.

        Returns
        -------
        headers : `list` of `~lsst.daf.base.PropertyList`
            Result of calling `readHeaders`.
        dataId : `dict`
            Data ID dictionary, as returned by `extractDataId`.
        """
        headers = self.readHeaders(file)

        # Extract a dictionary with structure {<link-name>: <value>}, where:
        #   - <link-name> is the name of a DataUnit link to the Dataset
        #     table, usually a DataUnit primary key field (e.g. 'camera' or
        #     'visit').
        #   - <value> is the value of that field.
        dataId = self.extractDataId(file, headers)
        dataId.setdefault("physical_filter", None)
        dataId.setdefault("visit", None)

        # Locate or extract additional DataUnit metadata, producing a nested
        # dict with structure {<unit-name>: {<field>: <value>}}. This is the
        # same content as self.unitEntryCache, but without the middle layer,
        # because this contains only the entries associated with this
        # particular file.
        associatedUnitEntries = {}
        for unit in self.units:
            # Start by looking in the Task's cache of unit entries, which is
            # keyed by a tuple.
            unitPrimaryKeyTuple = tuple(dataId[f] for f in unit.primaryKey)
            if any(v is None for v in unitPrimaryKeyTuple):
                # This DataUnit isn't actually applicable for this file;
                # move on. Could be a calibration Exposure that doesn't have
                # a Visit, for example.
                associatedUnitEntries[unit.name] = None
                continue
            unitEntryDict = self.unitEntryCache[unit.name].get(unitPrimaryKeyTuple, None)
            if unitEntryDict is None:
                # Next look in the Registry, which is keyed by a dataId-like
                # dict.
                unitPrimaryKeyDict = {f: dataId[f] for f in unit.primaryKey}
                unitEntryDict = self.butler.registry.findDataUnitEntry(unit.name, unitPrimaryKeyDict)
                if unitEntryDict is None:
                    # If we haven't found it, either raise an exception or
                    # extract that information from the headers (and
                    # possibly the filename).
                    if unit.name == "Visit":
                        extractMethod = self.extractVisitEntry
                    elif unit.name == "Exposure":
                        extractMethod = self.extractExposureEntry
                    else:
                        raise LookupError("{} with keys {} not found; must be present in Registry prior "
                                          "to ingest.".format(unit.name, unitPrimaryKeyDict))
                    unitEntryDict = extractMethod(file, headers, dataId=dataId.copy(),
                                                  associated=associatedUnitEntries)
                    # Add the entry into the Registry.
                    self.butler.registry.addDataUnitEntry(unit.name, unitEntryDict)
                # Add the entry into the cache.
                self.unitEntryCache[unit.name][unitPrimaryKeyTuple] = unitEntryDict
            associatedUnitEntries[unit.name] = unitEntryDict

        return headers, dataId

    def ingestFile(self, file, headers, dataId, run=None):
        """Ingest a single raw file into the repository.

        All necessary DataUnit entries must already be present.

        This method is not transactional; it must be wrapped in a
        ``with self.butler.transaction()`` block to make per-file ingest
        atomic.

        Parameters
        ----------
        file : `str` or path-like object
            Absolute path to the file to be ingested.
        headers : `list` of `~lsst.daf.base.PropertyList`
            Result of calling `readHeaders`.
        dataId : `dict`
            Data ID dictionary, as returned by `extractDataId`.
        run : `~lsst.daf.butler.Run`, optional
            Run to add the Dataset to; defaults to ``self.butler.run``.
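
        Examples
        --------
        Callers are responsible for atomicity; `processFile` does this::

            with self.butler.transaction():
                self.ingestFile(file, headers, dataId)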
        """
        if run is None:
            run = self.butler.run

        # Add a Dataset entry to the Registry.
        try:
            # We use transactional=False here (a kwarg added by the
            # @transactional decorator) to keep the conflict exception from
            # starting a higher-level rollback - if we catch this exception,
            # we don't want to have already started rolling back the ingest
            # of *previous* files when config.onError == 'rollback' but
            # config.conflict == 'ignore'.
            ref = self.butler.registry.addDataset(self.datasetType, dataId, run=run,
                                                  transactional=False, recursive=True)
        except ValueError:
            raise IngestConflictError("Ingest conflict on {} {}".format(file, dataId))

        # Ingest it into the Datastore.
        self.butler.datastore.ingest(file, ref, formatter=self.getFormatter(file, headers, dataId),
                                     transfer=self.config.transfer)
        return None

    def processFile(self, file):
        """Ingest a single raw data file after extracting metadata.

        This creates any new Exposure or Visit DataUnit entries needed to
        identify the ingested file, creates a new Dataset entry in the
        Registry and finally ingests the file itself into the Datastore.
        Any needed Camera, Sensor, and PhysicalFilter DataUnit entries must
        exist in the Registry before `run` is called.

        Parameters
        ----------
        file : `str` or path-like object
            Absolute path to the file to be ingested.
        """
        headers, dataId = self.ensureDataUnits(file)
        # We want ingesting a single file to be atomic even if we are
        # not trying to ingest the list of files atomically.
        with self.butler.transaction():
            try:
                self.ingestFile(file, headers, dataId)
                return
            except IngestConflictError:
                if self.config.conflict == "fail":
                    raise
        if self.config.conflict == "ignore":
            if self.stashRun is not None:
                if self.stashRun.id is None:
                    self.butler.registry.ensureRun(self.stashRun)
                self.log.infof("Conflict on {} ({}); ingesting to stash '{}' instead.",
                               dataId, file, self.config.stash)
                with self.butler.transaction():
                    self.ingestFile(file, headers, dataId, run=self.stashRun)
            else:
                self.log.infof("Conflict on {} ({}); ignoring.", dataId, file)

    @abstractmethod
    def extractDataId(self, file, headers):
        """Return the Data ID dictionary that should be used to label a
        file.

        Parameters
        ----------
        file : `str` or path-like object
            Absolute path to the file being ingested (prior to any
            transfers).
        headers : `list` of `~lsst.daf.base.PropertyList`
            All headers returned by `readHeaders()`.

        Returns
        -------
        dataId : `dict`
            Must include "camera", "sensor", and "exposure" keys. If the
            Exposure is associated with a PhysicalFilter and/or Visit,
            "physical_filter" and "visit" keys should be provided as well
            (respectively).
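            For example (all values hypothetical)::

                {"camera": "MyCam", "sensor": 12, "exposure": 42,
                 "visit": 42, "physical_filter": "g"}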
        """
        raise NotImplementedError("Must be implemented by subclasses.")

    @abstractmethod
    def extractVisitEntry(self, file, headers, dataId, associated):
        """Create a Visit DataUnit entry from raw file metadata.

        Parameters
        ----------
        file : `str` or path-like object
            Absolute path to the file being ingested (prior to any
            transfers).
        headers : `list` of `~lsst.daf.base.PropertyList`
            All headers returned by `readHeaders()`.
        dataId : `dict`
            The data ID for this file. Implementations are permitted to
            modify this dictionary (generally by stripping off "sensor" and
            "exposure" and adding new metadata key-value pairs) and return
            it.
        associated : `dict`
            A dictionary containing other associated DataUnit entries.
            Guaranteed to have "Camera", "Sensor", and "PhysicalFilter"
            keys, but the last may map to ``None`` if `extractDataId`
            either did not contain a "physical_filter" key or mapped it to
            ``None``. Subclasses may add new keys to this dict to pass
            arbitrary data to `extractExposureEntry` (`extractVisitEntry`
            is always called first), but note that when a Visit comprises
            multiple Exposures, `extractVisitEntry` may not be called at
            all.

        Returns
        -------
        entry : `dict`
            Dictionary corresponding to a Visit database table row.
            Must have all non-null columns in the Visit table as keys.
        """
        raise NotImplementedError("Must be implemented by subclasses.")

    @abstractmethod
    def extractExposureEntry(self, file, headers, dataId, associated):
        """Create an Exposure DataUnit entry from raw file metadata.

        Parameters
        ----------
        file : `str` or path-like object
            Absolute path to the file being ingested (prior to any
            transfers).
        headers : `list` of `~lsst.daf.base.PropertyList`
            All headers returned by `readHeaders()`.
        dataId : `dict`
            The data ID for this file. Implementations are permitted to
            modify this dictionary (generally by stripping off "sensor" and
            adding new metadata key-value pairs) and return it.
        associated : `dict`
            A dictionary containing other associated DataUnit entries.
            Guaranteed to have "Camera", "Sensor", "PhysicalFilter", and
            "Visit" keys, but the latter two may map to ``None`` if
            `extractDataId` did not contain keys for these or mapped them
            to ``None``. May also contain additional keys added by
            `extractVisitEntry`.

        Returns
        -------
        entry : `dict`
            Dictionary corresponding to an Exposure database table row.
            Must have all non-null columns in the Exposure table as keys.
        """
        raise NotImplementedError("Must be implemented by subclasses.")

    def getFormatter(self, file, headers, dataId):
        """Return the Formatter that should be used to read this file after
        ingestion.

        The default implementation returns None, which uses the formatter
        configured for this DatasetType/StorageClass in the Butler.
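
        Examples
        --------
        A subclass might select a formatter by file extension, e.g.
        (``MyCompressedRawFormatter`` is hypothetical)::

            def getFormatter(self, file, headers, dataId):
                if file.endswith(".fz"):
                    return MyCompressedRawFormatter()
                return None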
        """
        return None


class VisitInfoRawIngestTask(RawIngestTask):
    """An intermediate base class of RawIngestTask for cameras that already
    implement constructing an `afw.image.VisitInfo` object from raw data.

    Subclasses must provide (at least) implementations of `extractDataId`
    and the new `makeVisitInfo` method; the latter is used to provide
    concrete implementations of `extractVisitEntry` and
    `extractExposureEntry`.
    """

    @abstractmethod
    def makeVisitInfo(self, headers, exposureId):
        """Return an `afw.image.VisitInfo` object from the given headers
        and ID.

        Parameters
        ----------
        headers : `list` of `~lsst.daf.base.PropertyList`
            All headers returned by `readHeaders()`.
        exposureId : `int`
            Integer ID to pass to the `VisitInfo` constructor.
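
        Examples
        --------
        Concrete obs packages often have a ``MakeRawVisitInfo`` helper that
        builds a `VisitInfo` from a header; delegating to one (the
        ``MakeMyCamRawVisitInfo`` name here is hypothetical) might look
        like::

            def makeVisitInfo(self, headers, exposureId):
                maker = MakeMyCamRawVisitInfo(log=self.log)
                return maker(headers[0], exposureId)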
        """
        raise NotImplementedError("Must be implemented by subclasses.")

    def extractVisitEntry(self, file, headers, dataId, associated):
        """Create a Visit DataUnit entry from raw file metadata.

        Parameters
        ----------
        file : `str` or path-like object
            Absolute path to the file being ingested (prior to any
            transfers).
        headers : `list` of `~lsst.daf.base.PropertyList`
            All headers returned by `readHeaders()`.
        dataId : `dict`
            The data ID for this file. Implementations are permitted to
            modify this dictionary (generally by stripping off "sensor" and
            "exposure" and adding new metadata key-value pairs) and return
            it.
        associated : `dict`
            A dictionary containing other associated DataUnit entries.
            Guaranteed to have "Camera", "Sensor", and "PhysicalFilter"
            keys, but the last may map to ``None`` if `extractDataId`
            either did not contain a "physical_filter" key or mapped it to
            ``None``. This method also adds a "VisitInfo" key containing an
            `afw.image.VisitInfo` object for use by `extractExposureEntry`.

        Returns
        -------
        entry : `dict`
            Dictionary corresponding to a Visit database table row.
            Must have all non-null columns in the Visit table as keys.
        """
        visitInfo = self.makeVisitInfo(headers, exposureId=dataId["exposure"])
        associated["VisitInfo"] = visitInfo
        del dataId["sensor"]
        del dataId["exposure"]
        return makeVisitEntryFromVisitInfo(dataId, visitInfo)

    def extractExposureEntry(self, file, headers, dataId, associated):
        """Create an Exposure DataUnit entry from raw file metadata.

        Parameters
        ----------
        file : `str` or path-like object
            Absolute path to the file being ingested (prior to any
            transfers).
        headers : `list` of `~lsst.daf.base.PropertyList`
            All headers returned by `readHeaders()`.
        dataId : `dict`
            The data ID for this file. Implementations are permitted to
            modify this dictionary (generally by stripping off "sensor" and
            adding new metadata key-value pairs) and return it.
        associated : `dict`
            A dictionary containing other associated DataUnit entries.
            Guaranteed to have "Camera", "Sensor", "PhysicalFilter", and
            "Visit" keys, but the latter two may map to ``None`` if
            `extractDataId` did not contain keys for these or mapped them
            to ``None``. May also contain additional keys added by
            `extractVisitEntry`.

        Returns
        -------
        entry : `dict`
            Dictionary corresponding to an Exposure database table row.
            Must have all non-null columns in the Exposure table as keys.
        """
        try:
            visitInfo = associated["VisitInfo"]
        except KeyError:
            visitInfo = self.makeVisitInfo(headers, exposureId=dataId["exposure"])
        del dataId["sensor"]
        return makeExposureEntryFromVisitInfo(dataId, visitInfo)
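

# A minimal concrete subclass sketch. Everything camera-specific below
# (the class name, the header keys, and the "MyCam" camera) is
# hypothetical; a real obs package would supply its own values and its own
# makeVisitInfo implementation (see the example in that method's
# docstring).
#
#     class MyCamRawIngestTask(VisitInfoRawIngestTask):
#
#         def extractDataId(self, file, headers):
#             header = headers[0]
#             exposure = int(header.get("EXPID"))
#             return {"camera": "MyCam",
#                     "sensor": int(header.get("CCDNUM")),
#                     "exposure": exposure,
#                     "visit": exposure,
#                     "physical_filter": header.get("FILTER")}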