lsst.obs.base  16.0-19-g302af01
ingest.py
# This file is part of obs_base.
#
# Developed for the LSST Data Management System.
# This product includes software developed by the LSST Project
# (http://www.lsst.org).
# See the COPYRIGHT file at the top-level directory of this distribution
# for details of code ownership.
#
# This program is free software: you can redistribute it and/or modify
# it under the terms of the GNU General Public License as published by
# the Free Software Foundation, either version 3 of the License, or
# (at your option) any later version.
#
# This program is distributed in the hope that it will be useful,
# but WITHOUT ANY WARRANTY; without even the implied warranty of
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
# GNU General Public License for more details.
#
# You should have received a copy of the GNU General Public License
# along with this program. If not, see <http://www.gnu.org/licenses/>.

__all__ = ("RawIngestTask", "RawIngestConfig")

import os.path
from abc import ABCMeta

from astro_metadata_translator import ObservationInfo
from lsst.afw.image import readMetadata
from lsst.daf.butler import DatasetType, StorageClassFactory, Run
from lsst.daf.butler.instrument import makeExposureEntryFromObsInfo, makeVisitEntryFromObsInfo
from lsst.pex.config import Config, Field, ChoiceField
from lsst.pipe.base import Task

class IngestConflictError(RuntimeError):
    pass

class RawIngestConfig(Config):
    transfer = ChoiceField(
        "How to transfer files (None for no transfer).",
        dtype=str,
        allowed={"move": "move",
                 "copy": "copy",
                 "hardlink": "hard link",
                 "symlink": "symbolic (soft) link"},
        optional=True,
    )
    conflict = ChoiceField(
        ("What to do if a raw Dataset with the same data ID as an "
         "ingested file already exists in the Butler's Collection."),
        dtype=str,
        allowed={"ignore": ("Do not add the new file to the Collection. If "
                            "'stash' is not None, the new file will be "
                            "ingested into the stash Collection instead."),
                 "fail": ("Raise RuntimeError if a conflict is encountered "
                          "(which may then be caught if onError == 'continue')."),
                 },
        optional=False,
        default="ignore",
    )
    stash = Field(
        "Name of an alternate Collection to hold Datasets that lose conflicts.",
        dtype=str,
        default=None,
    )
    onError = ChoiceField(
        "What to do if an error (including fatal conflicts) occurs.",
        dtype=str,
        allowed={"continue": "Warn and continue with the next file.",
                 "break": ("Stop processing immediately, but leave "
                           "already-ingested datasets in the repository."),
                 "rollback": ("Stop processing and attempt to remove already-"
                              "ingested datasets from the repository."),
                 },
        optional=False,
        default="continue",
    )

class RawIngestTask(Task, metaclass=ABCMeta):
    """Driver Task for ingesting raw data into Gen3 Butler repositories.

    This Task is intended to be runnable from the command-line, but it doesn't
    meet the other requirements of CmdLineTask or PipelineTask, and wouldn't
    gain much from being one. It also wouldn't really be appropriate as a
    subtask of a CmdLineTask or PipelineTask; it's a Task essentially just to
    leverage the logging and configurability functionality that the Task
    framework provides.

    Each instance of `RawIngestTask` writes to the same Butler and maintains a
    cache of DataUnit entries that have already been added to or extracted
    from its Registry. Each invocation of `RawIngestTask.run` ingests a list
    of files (possibly semi-atomically; see `RawIngestConfig.onError`).

    RawIngestTask should be subclassed to specialize ingest for the actual
    structure of raw data files produced by a particular camera. Subclasses
    may override the `extractDataId`, `extractVisitEntry`, and
    `extractExposureEntry` methods to customize how data IDs and DataUnit
    entries are extracted from file metadata. Subclasses may also wish to
    override `getFormatter` and/or (rarely) `getDatasetType`. We do not
    anticipate overriding `run`, `ensureDataUnits`, `ingestFile`, or
    `processFile` to ever be necessary.

    Parameters
    ----------
    config : `RawIngestConfig`
        Configuration for whether/how to transfer files and how to handle
        conflicts and errors.
    butler : `~lsst.daf.butler.Butler`
        Butler instance. Ingested Datasets will be created as part of
        ``butler.run`` and associated with its Collection.

    Other keyword arguments are forwarded to the Task base class constructor.
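
    Examples
    --------
    A minimal usage sketch, assuming a hypothetical camera-specific subclass
    ``MyCameraRawIngestTask`` and a Butler already constructed for the target
    repository and Run (the exact Butler construction depends on how that
    repository was configured)::

        task = MyCameraRawIngestTask(butler=butler)
        task.run(["raw1.fits", "raw2.fits"])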
    """

    ConfigClass = RawIngestConfig

    _DefaultName = "ingest"

    @classmethod
    def getDatasetType(cls):
        """Return the DatasetType of the Datasets ingested by this Task.
        """
        return DatasetType("raw", ("Camera", "Sensor", "Exposure"),
                           StorageClassFactory().getStorageClass("Exposure"))

    def __init__(self, config=None, *, butler, **kwds):
        super().__init__(config, **kwds)
        self.butler = butler
        self.datasetType = self.getDatasetType()
        self.units = tuple(butler.registry.getDataUnitDefinition(k)
                           for k in ("Camera", "Sensor", "PhysicalFilter", "Visit", "Exposure", ))
        # Nested dictionary of form {<unit-name>: {<primary-key-tuple>: {<field>: <value>}}}, where:
        # - <unit-name> is a DataUnit name (e.g. Camera, Exposure)
        # - <primary-key-tuple> is a tuple of values that correspond to the [compound] primary
        #   key for that DataUnit. (TODO: make these DataId objects on DM-15034.)
        # - <field> is the name of a column in the table for this DataUnit.
        # - <value> is the value of that field.
        # The {<field>: <value>} dict is called an "entry" in this class and in Registry methods.
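        # For example (hypothetical entry, with field values abbreviated):
        #   {"Exposure": {("HSC", 90334): {"camera": "HSC", "exposure": 90334, ...}}}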
        self.unitEntryCache = {k.name: {} for k in self.units}
        # (Possibly) create a Run object for the "stash": where we put datasets
        # that lose conflicts. Note that this doesn't actually add this Run
        # to the Registry; we only do that on first use.
        self.stashRun = Run(self.config.stash) if self.config.stash is not None else None

    def run(self, files):
        """Ingest files into a Butler data repository.

        This creates any new Exposure or Visit DataUnit entries needed to
        identify the ingested files, creates new Dataset entries in the
        Registry and finally ingests the files themselves into the Datastore.
        Any needed Camera, Sensor, and PhysicalFilter DataUnit entries must
        exist in the Registry before `run` is called.

        Parameters
        ----------
        files : iterable over `str` or path-like objects
            Paths to the files to be ingested. Will be made absolute
            if they are not already.
        """
        self.butler.registry.registerDatasetType(self.getDatasetType())
        if self.config.onError == "rollback":
            with self.butler.transaction():
                for file in files:
                    self.processFile(os.path.abspath(file))
        elif self.config.onError == "break":
            for file in files:
                self.processFile(os.path.abspath(file))
        elif self.config.onError == "continue":
            for file in files:
                try:
                    self.processFile(os.path.abspath(file))
                except Exception as err:
                    self.log.warnf("Error processing '{}': {}", file, err)

    def readHeaders(self, file):
        """Read and return any relevant headers from the given file.

        The default implementation simply reads the header of the first
        non-empty HDU, so it always returns a single-element list.

        Parameters
        ----------
        file : `str` or path-like object
            Absolute path to the file to be ingested.

        Returns
        -------
        headers : `list` of `~lsst.daf.base.PropertyList`
            Single-element list containing the header of the first
            non-empty HDU.
        """
        return [readMetadata(file)]

    def ensureDataUnits(self, file):
        """Extract metadata from a raw file and add Exposure and Visit
        DataUnit entries.

        Any needed Camera, Sensor, and PhysicalFilter DataUnit entries must
        exist in the Registry before `run` is called.

        Parameters
        ----------
        file : `str` or path-like object
            Absolute path to the file to be ingested.

        Returns
        -------
        headers : `list` of `~lsst.daf.base.PropertyList`
            Result of calling `readHeaders`.
        dataId : `dict`
            Data ID dictionary, as returned by `extractDataId`.
        """
        headers = self.readHeaders(file)

        # Extract a dictionary with structure {<link-name>: <value>}, where:
        # - <link-name> is the name of a DataUnit link to the Dataset table,
        #   usually a DataUnit primary key field (e.g. 'camera' or 'visit').
        # - <value> is the value of that field.
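        # For example (hypothetical values):
        #   {"camera": "HSC", "exposure": 90334, "visit": 90334,
        #    "sensor": 50, "physical_filter": "HSC-I"}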
        dataId = self.extractDataId(file, headers)
        dataId.setdefault("physical_filter", None)
        dataId.setdefault("visit", None)

        # Locate or extract additional DataUnit metadata, producing a nested
        # dict with structure {<unit-name>: {<field>: <value>}}. This is the
        # same content as self.unitEntryCache, but without the middle layer,
        # because this contains only the entries associated with this
        # particular file.
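        # For example (hypothetical; a calibration frame that has no Visit):
        #   {"Camera": {...}, "Sensor": {...}, "PhysicalFilter": {...},
        #    "Visit": None, "Exposure": {...}}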
        associatedUnitEntries = {}
        for unit in self.units:
            # Start by looking in the Task's cache of unit entries, which is keyed by a tuple.
            unitPrimaryKeyTuple = tuple(dataId[f] for f in unit.primaryKey)
            if any(v is None for v in unitPrimaryKeyTuple):
                # This DataUnit isn't actually applicable for this file; move
                # on. Could be a calibration Exposure that doesn't have a
                # Visit, for example.
                associatedUnitEntries[unit.name] = None
                continue
            unitEntryDict = self.unitEntryCache[unit.name].get(unitPrimaryKeyTuple, None)
            if unitEntryDict is None:
                # Next look in the Registry, which is keyed by a dataId-like dict.
                unitPrimaryKeyDict = {f: dataId[f] for f in unit.primaryKey}
                unitEntryDict = self.butler.registry.findDataUnitEntry(unit.name, unitPrimaryKeyDict)
                if unitEntryDict is None:
                    # If we haven't found it, either raise an exception or extract that information
                    # from the headers (and possibly the filename).
                    if unit.name == "Visit":
                        extractMethod = self.extractVisitEntry
                    elif unit.name == "Exposure":
                        extractMethod = self.extractExposureEntry
                    else:
                        raise LookupError("{} with keys {} not found; must be present in Registry prior "
                                          "to ingest.".format(unit.name, unitPrimaryKeyDict))
                    unitEntryDict = extractMethod(file, headers, dataId=dataId.copy(),
                                                  associated=associatedUnitEntries)
                    # Add the entry into the Registry.
                    self.butler.registry.addDataUnitEntry(unit.name, unitEntryDict)
                # Add the entry into the cache.
                self.unitEntryCache[unit.name][unitPrimaryKeyTuple] = unitEntryDict
            associatedUnitEntries[unit.name] = unitEntryDict

        return headers, dataId

    def ingestFile(self, file, headers, dataId, run=None):
        """Ingest a single raw file into the repository.

        All necessary DataUnit entries must already be present.

        This method is not transactional; it must be wrapped in a
        ``with self.butler.transaction():`` block to make per-file ingest
        atomic.

        Parameters
        ----------
        file : `str` or path-like object
            Absolute path to the file to be ingested.
        headers : `list` of `~lsst.daf.base.PropertyList`
            Result of calling `readHeaders`.
        dataId : `dict`
            Data ID dictionary, as returned by `extractDataId`.
        run : `~lsst.daf.butler.Run`, optional
            Run to add the Dataset to; defaults to ``self.butler.run``.
        """
        if run is None:
            run = self.butler.run

        # Add a Dataset entry to the Registry.
        try:
            # We use transactional=False here (a kwarg added by the
            # @transactional decorator) to keep the conflict exception from
            # starting a higher-level rollback - if we catch this exception,
            # we don't want to have already started rolling back the ingest of
            # *previous* files when config.onError == 'rollback' but
            # config.conflict == 'ignore'.
            ref = self.butler.registry.addDataset(self.datasetType, dataId, run=run,
                                                  transactional=False, recursive=True)
        except ValueError:
            raise IngestConflictError("Ingest conflict on {} {}".format(file, dataId))

        # Ingest it into the Datastore.
        self.butler.datastore.ingest(file, ref, formatter=self.getFormatter(file, headers, dataId),
                                     transfer=self.config.transfer)
        return None

    def processFile(self, file):
        """Ingest a single raw data file after extracting metadata.

        This creates any new Exposure or Visit DataUnit entries needed to
        identify the ingested file, creates a new Dataset entry in the
        Registry and finally ingests the file itself into the Datastore.
        Any needed Camera, Sensor, and PhysicalFilter DataUnit entries must
        exist in the Registry before `run` is called.

        Parameters
        ----------
        file : `str` or path-like object
            Absolute path to the file to be ingested.
        """
        headers, dataId = self.ensureDataUnits(file)
        # We want ingesting a single file to be atomic even if we are
        # not trying to ingest the list of files atomically.
        with self.butler.transaction():
            try:
                self.ingestFile(file, headers, dataId)
                return
            except IngestConflictError:
                if self.config.conflict == "fail":
                    raise
        if self.config.conflict == "ignore":
            if self.stashRun is not None:
                if self.stashRun.id is None:
                    self.butler.registry.ensureRun(self.stashRun)
                self.log.infof("Conflict on {} ({}); ingesting to stash '{}' instead.",
                               dataId, file, self.config.stash)
                with self.butler.transaction():
                    self.ingestFile(file, headers, dataId, run=self.stashRun)
            else:
                self.log.infof("Conflict on {} ({}); ignoring.", dataId, file)

    def extractDataId(self, file, headers):
        """Return the Data ID dictionary that should be used to label a file.

        Parameters
        ----------
        file : `str` or path-like object
            Absolute path to the file being ingested (prior to any transfers).
        headers : `list` of `~lsst.daf.base.PropertyList`
            All headers returned by `readHeaders()`.

        Returns
        -------
        dataId : `dict`
            Must include "camera", "sensor", and "exposure" keys. If the
            Exposure is associated with a PhysicalFilter and/or Visit,
            "physical_filter" and "visit" keys should be provided as well
            (respectively).
        """
        obsInfo = ObservationInfo(headers[0])
        return {
            "camera": obsInfo.instrument,
            "exposure": obsInfo.exposure_id,
            "visit": obsInfo.visit_id,
            "sensor": obsInfo.detector_num,
            "physical_filter": obsInfo.physical_filter,
        }

    def extractVisitEntry(self, file, headers, dataId, associated):
        """Create a Visit DataUnit entry from raw file metadata.

        Parameters
        ----------
        file : `str` or path-like object
            Absolute path to the file being ingested (prior to any transfers).
        headers : `list` of `~lsst.daf.base.PropertyList`
            All headers returned by `readHeaders()`.
        dataId : `dict`
            The data ID for this file. Implementations are permitted to
            modify this dictionary (generally by stripping off "sensor" and
            "exposure" and adding new metadata key-value pairs) and return it.
        associated : `dict`
            A dictionary containing other associated DataUnit entries.
            Guaranteed to have "Camera", "Sensor", and "PhysicalFilter" keys,
            but the last may map to ``None`` if `extractDataId` either did not
            contain a "physical_filter" key or mapped it to ``None``.
            This method also adds an "ObsInfo" key containing an
            `~astro_metadata_translator.ObservationInfo` object for use by
            `extractExposureEntry`.

        Returns
        -------
        entry : `dict`
            Dictionary corresponding to a Visit database table row.
            Must have all non-null columns in the Visit table as keys.
        """
        obsInfo = ObservationInfo(headers[0])
        associated["ObsInfo"] = obsInfo
        del dataId["sensor"]
        del dataId["exposure"]
        return makeVisitEntryFromObsInfo(dataId, obsInfo)

    def extractExposureEntry(self, file, headers, dataId, associated):
        """Create an Exposure DataUnit entry from raw file metadata.

        Parameters
        ----------
        file : `str` or path-like object
            Absolute path to the file being ingested (prior to any transfers).
        headers : `list` of `~lsst.daf.base.PropertyList`
            All headers returned by `readHeaders()`.
        dataId : `dict`
            The data ID for this file. Implementations are permitted to
            modify this dictionary (generally by stripping off "sensor" and
            adding new metadata key-value pairs) and return it.
        associated : `dict`
            A dictionary containing other associated DataUnit entries.
            Guaranteed to have "Camera", "Sensor", "PhysicalFilter", and
            "Visit" keys, but the latter two may map to ``None`` if
            `extractDataId` did not contain keys for these or mapped them to
            ``None``. May also contain additional keys added by
            `extractVisitEntry`.

        Returns
        -------
        entry : `dict`
            Dictionary corresponding to an Exposure database table row.
            Must have all non-null columns in the Exposure table as keys.
        """
        try:
            obsInfo = associated["ObsInfo"]
        except KeyError:
            obsInfo = ObservationInfo(headers[0])
        del dataId["sensor"]
        return makeExposureEntryFromObsInfo(dataId, obsInfo)

    def getFormatter(self, file, headers, dataId):
        """Return the Formatter that should be used to read this file after
        ingestion.

        The default implementation returns None, which uses the formatter
        configured for this DatasetType/StorageClass in the Butler.
        """
        return None