lsst.obs.base  16.0-24-gc1c7f52+9
ingest.py
# This file is part of obs_base.
#
# Developed for the LSST Data Management System.
# This product includes software developed by the LSST Project
# (http://www.lsst.org).
# See the COPYRIGHT file at the top-level directory of this distribution
# for details of code ownership.
#
# This program is free software: you can redistribute it and/or modify
# it under the terms of the GNU General Public License as published by
# the Free Software Foundation, either version 3 of the License, or
# (at your option) any later version.
#
# This program is distributed in the hope that it will be useful,
# but WITHOUT ANY WARRANTY; without even the implied warranty of
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
# GNU General Public License for more details.
#
# You should have received a copy of the GNU General Public License
# along with this program.  If not, see <http://www.gnu.org/licenses/>.


__all__ = ("RawIngestTask", "RawIngestConfig")

import os.path
from abc import ABCMeta

from astro_metadata_translator import ObservationInfo
from lsst.afw.image import readMetadata
from lsst.daf.butler import DatasetType, StorageClassFactory, Run
from lsst.daf.butler.instrument import makeExposureEntryFromObsInfo, makeVisitEntryFromObsInfo
from lsst.pex.config import Config, Field, ChoiceField
from lsst.pipe.base import Task


class IngestConflictError(RuntimeError):
    pass


class RawIngestConfig(Config):
    transfer = ChoiceField(
        "How to transfer files (None for no transfer).",
        dtype=str,
        allowed={"move": "move",
                 "copy": "copy",
                 "hardlink": "hard link",
                 "symlink": "symbolic (soft) link"},
        optional=True,
    )
    conflict = ChoiceField(
        ("What to do if a raw Dataset with the same data ID as an "
         "ingested file already exists in the Butler's Collection."),
        dtype=str,
        allowed={"ignore": ("Do not add the new file to the Collection.  If "
                            "'stash' is not None, the new file will be "
                            "ingested into the stash Collection instead."),
                 "fail": ("Raise RuntimeError if a conflict is encountered "
                          "(which may then be caught if onError == 'continue')."),
                 },
        optional=False,
        default="ignore",
    )
    stash = Field(
        "Name of an alternate Collection to hold Datasets that lose conflicts.",
        dtype=str,
        default=None,
    )
    onError = ChoiceField(
        "What to do if an error (including fatal conflicts) occurs.",
        dtype=str,
        allowed={"continue": "Warn and continue with the next file.",
                 "break": ("Stop processing immediately, but leave "
                           "already-ingested datasets in the repository."),
                 "rollback": ("Stop processing and attempt to remove already-"
                              "ingested datasets from the repository."),
                 },
        optional=False,
        default="continue",
    )
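

# A configuration sketch (values are illustrative only, not recommendations):
# an instrument obs package or a command-line override file might set, e.g.,
#
#     config.transfer = "symlink"      # link raw files instead of copying them
#     config.conflict = "ignore"
#     config.stash = "raw/conflicts"   # hypothetical stash Collection name
#     config.onError = "rollback"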


class RawIngestTask(Task, metaclass=ABCMeta):
    """Driver Task for ingesting raw data into Gen3 Butler repositories.

    This Task is intended to be runnable from the command-line, but it doesn't
    meet the other requirements of CmdLineTask or PipelineTask, and wouldn't
    gain much from being one.  It also wouldn't really be appropriate as a
    subtask of a CmdLineTask or PipelineTask; it's a Task essentially just to
    leverage the logging and configurability functionality that Task provides.

    Each instance of `RawIngestTask` writes to the same Butler and maintains a
    cache of DataUnit entries that have already been added to or extracted
    from its Registry.  Each invocation of `RawIngestTask.run` ingests a list
    of files (possibly semi-atomically; see `RawIngestConfig.onError`).

    RawIngestTask should be subclassed to specialize ingest for the actual
    structure of raw data files produced by a particular instrument (a sketch
    of such a subclass appears at the end of this file).  Subclasses must
    either provide populated `MetadataReader` instances in the `dataIdReader`,
    `visitReader`, and `exposureReader` class attributes, or alternate
    implementations of the `extractDataId`, `extractVisit`, and
    `extractExposure` methods that do not use those attributes (each
    attribute-method pair may be handled differently).  Subclasses may also
    wish to override `getFormatter` and/or (rarely) `getDatasetType`.  We do
    not anticipate that overriding `run`, `ensureDataUnits`, `ingestFile`, or
    `processFile` will ever be necessary.

    Parameters
    ----------
    config : `RawIngestConfig`
        Configuration for whether/how to transfer files and how to handle
        conflicts and errors.
    butler : `~lsst.daf.butler.Butler`
        Butler instance.  Ingested Datasets will be created as part of
        ``butler.run`` and associated with its Collection.

    Other keyword arguments are forwarded to the Task base class constructor.
    """

    ConfigClass = RawIngestConfig

    _DefaultName = "ingest"

    @classmethod
    def getDatasetType(cls):
        """Return the DatasetType of the Datasets ingested by this Task.
        """
        return DatasetType("raw", ("Instrument", "Detector", "Exposure"),
                           StorageClassFactory().getStorageClass("Exposure"))

    def __init__(self, config=None, *, butler, **kwds):
        super().__init__(config, **kwds)
        self.butler = butler
        self.datasetType = self.getDatasetType()
        self.units = tuple(butler.registry.getDataUnitDefinition(k)
                           for k in ("Instrument", "Detector", "PhysicalFilter", "Visit", "Exposure"))
        # Nested dictionary of form {<unit-name>: {<primary-key-tuple>: {<field>: <value>}}}, where:
        #  - <unit-name> is a DataUnit name (e.g. Instrument, Exposure)
        #  - <primary-key-tuple> is a tuple of values that correspond to the [compound] primary
        #    key for that DataUnit (TODO: make these DataId objects on DM-15034).
        #  - <field> is the name of a column in the table for this DataUnit.
        #  - <value> is the value of that field.
        # The {<field>: <value>} dict is called an "entry" in this class and in Registry methods.
        self.unitEntryCache = {k.name: {} for k in self.units}
        # (Possibly) create a Run object for the "stash": where we put datasets
        # that lose conflicts.  Note that this doesn't actually add this Run
        # to the Registry; we only do that on first use.
        self.stashRun = Run(self.config.stash) if self.config.stash is not None else None

    def run(self, files):
        """Ingest files into a Butler data repository.

        This creates any new Exposure or Visit DataUnit entries needed to
        identify the ingested files, creates new Dataset entries in the
        Registry and finally ingests the files themselves into the Datastore.
        Any needed Instrument, Detector, and PhysicalFilter DataUnit entries
        must exist in the Registry before `run` is called.

        Parameters
        ----------
        files : iterable over `str` or path-like objects
            Paths to the files to be ingested.  Will be made absolute
            if they are not already.
        """
        self.butler.registry.registerDatasetType(self.getDatasetType())
        if self.config.onError == "rollback":
            with self.butler.transaction():
                for file in files:
                    self.processFile(os.path.abspath(file))
        elif self.config.onError == "break":
            for file in files:
                self.processFile(os.path.abspath(file))
        elif self.config.onError == "continue":
            for file in files:
                try:
                    self.processFile(os.path.abspath(file))
                except Exception as err:
                    self.log.warnf("Error processing '{}': {}", file, err)

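    # A minimal driver sketch (names here are hypothetical, not obs_base API
    # guarantees): given a Gen3 Butler for the target repository and an
    # instrument-specific subclass of this Task, ingest might be driven as
    #
    #     task = MyCamRawIngestTask(config=RawIngestConfig(), butler=butler)
    #     task.run(["/data/raw_0001.fits", "/data/raw_0002.fits"])
    #
    # with error and conflict handling controlled entirely by the config.
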
    def readHeaders(self, file):
        """Read and return any relevant headers from the given file.

        The default implementation simply reads the header of the first
        non-empty HDU, so it always returns a single-element list.

        Parameters
        ----------
        file : `str` or path-like object
            Absolute path to the file to be ingested.

        Returns
        -------
        headers : `list` of `~lsst.daf.base.PropertyList`
            Single-element list containing the header of the first
            non-empty HDU.
        """
        return [readMetadata(file)]

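    # Override sketch (an assumption, not part of the base class): an
    # instrument whose useful metadata lives in a later HDU could return that
    # header instead, assuming readMetadata accepts an ``hdu`` argument here:
    #
    #     def readHeaders(self, file):
    #         return [readMetadata(file, hdu=1)]
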
    def ensureDataUnits(self, file):
        """Extract metadata from a raw file and add Exposure and Visit
        DataUnit entries.

        Any needed Instrument, Detector, and PhysicalFilter DataUnit entries
        must exist in the Registry before `run` is called.

        Parameters
        ----------
        file : `str` or path-like object
            Absolute path to the file to be ingested.

        Returns
        -------
        headers : `list` of `~lsst.daf.base.PropertyList`
            Result of calling `readHeaders`.
        dataId : `dict`
            Data ID dictionary, as returned by `extractDataId`.
        """
        headers = self.readHeaders(file)

        # Extract a dictionary with structure {<link-name>: <value>}, where:
        #  - <link-name> is the name of a DataUnit link to the Dataset table,
        #    usually a DataUnit primary key field (e.g. 'instrument' or
        #    'visit').
        #  - <value> is the value of that field.
        dataId = self.extractDataId(file, headers)
        dataId.setdefault("physical_filter", None)
        dataId.setdefault("visit", None)

        # Locate or extract additional DataUnit metadata, producing a nested
        # dict with structure {<unit-name>: {<field>: <value>}}.  This is the
        # same content as self.unitEntryCache, but without the middle layer,
        # because this contains only the entries associated with this
        # particular file.
        associatedUnitEntries = {}
        for unit in self.units:
            # Start by looking in the Task's cache of unit entries, which is keyed by a tuple.
            unitPrimaryKeyTuple = tuple(dataId[f] for f in unit.primaryKey)
            if any(v is None for v in unitPrimaryKeyTuple):
                # This DataUnit isn't actually applicable for this file; move
                # on.  Could be a calibration Exposure that doesn't have a
                # Visit, for example.
                associatedUnitEntries[unit.name] = None
                continue
            unitEntryDict = self.unitEntryCache[unit.name].get(unitPrimaryKeyTuple, None)
            if unitEntryDict is None:
                # Next look in the Registry, which is keyed by a dataId-like dict.
                unitPrimaryKeyDict = {f: dataId[f] for f in unit.primaryKey}
                unitEntryDict = self.butler.registry.findDataUnitEntry(unit.name, unitPrimaryKeyDict)
                if unitEntryDict is None:
                    # If we haven't found it, either raise an exception or
                    # extract that information from the headers (and possibly
                    # the filename).
                    if unit.name == "Visit":
                        extractMethod = self.extractVisitEntry
                    elif unit.name == "Exposure":
                        extractMethod = self.extractExposureEntry
                    else:
                        raise LookupError("{} with keys {} not found; must be present in Registry prior "
                                          "to ingest.".format(unit.name, unitPrimaryKeyDict))
                    unitEntryDict = extractMethod(file, headers, dataId=dataId.copy(),
                                                  associated=associatedUnitEntries)
                    # Add the entry into the Registry.
                    self.butler.registry.addDataUnitEntry(unit.name, unitEntryDict)
                # Add the entry into the cache.
                self.unitEntryCache[unit.name][unitPrimaryKeyTuple] = unitEntryDict
            associatedUnitEntries[unit.name] = unitEntryDict

        return headers, dataId

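    # For concreteness (hypothetical instrument and values): after one file
    # has been processed, the cache populated above might look like
    #
    #     self.unitEntryCache == {
    #         "Instrument": {("MyCam",): {"instrument": "MyCam", ...}},
    #         "Detector": {("MyCam", 12): {"instrument": "MyCam", "detector": 12, ...}},
    #         "PhysicalFilter": {("MyCam", "r"): {...}},
    #         "Visit": {("MyCam", 403): {...}},
    #         "Exposure": {("MyCam", 403): {...}},
    #     }
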
    def ingestFile(self, file, headers, dataId, run=None):
        """Ingest a single raw file into the repository.

        All necessary DataUnit entries must already be present.

        This method is not transactional; it must be wrapped in a
        ``with self.butler.transaction():`` block to make per-file ingest
        atomic.

        Parameters
        ----------
        file : `str` or path-like object
            Absolute path to the file to be ingested.
        headers : `list` of `~lsst.daf.base.PropertyList`
            Result of calling `readHeaders`.
        dataId : `dict`
            Data ID dictionary, as returned by `extractDataId`.
        run : `~lsst.daf.butler.Run`, optional
            Run to add the Dataset to; defaults to ``self.butler.run``.
        """
        if run is None:
            run = self.butler.run

        # Add a Dataset entry to the Registry.
        try:
            # We use transactional=False here (a kwarg added by the
            # @transactional decorator) to keep the conflict exception from
            # starting a higher-level rollback - if we catch this exception,
            # we don't want to have already started rolling back the ingest of
            # *previous* files when config.onError == 'rollback' but
            # config.conflict == 'ignore'.
            ref = self.butler.registry.addDataset(self.datasetType, dataId, run=run,
                                                  transactional=False, recursive=True)
        except ValueError:
            raise IngestConflictError("Ingest conflict on {} {}".format(file, dataId))

        # Ingest it into the Datastore.
        self.butler.datastore.ingest(file, ref, formatter=self.getFormatter(file, headers, dataId),
                                     transfer=self.config.transfer)
        return None

    def processFile(self, file):
        """Ingest a single raw data file after extracting metadata.

        This creates any new Exposure or Visit DataUnit entries needed to
        identify the ingested file, creates a new Dataset entry in the
        Registry and finally ingests the file itself into the Datastore.
        Any needed Instrument, Detector, and PhysicalFilter DataUnit entries
        must exist in the Registry before `run` is called.

        Parameters
        ----------
        file : `str` or path-like object
            Absolute path to the file to be ingested.
        """
        headers, dataId = self.ensureDataUnits(file)
        # We want ingesting a single file to be atomic even if we are
        # not trying to ingest the list of files atomically.
        with self.butler.transaction():
            try:
                self.ingestFile(file, headers, dataId)
                return
            except IngestConflictError:
                if self.config.conflict == "fail":
                    raise
        if self.config.conflict == "ignore":
            if self.stashRun is not None:
                if self.stashRun.id is None:
                    self.butler.registry.ensureRun(self.stashRun)
                self.log.infof("Conflict on {} ({}); ingesting to stash '{}' instead.",
                               dataId, file, self.config.stash)
                with self.butler.transaction():
                    self.ingestFile(file, headers, dataId, run=self.stashRun)
            else:
                self.log.infof("Conflict on {} ({}); ignoring.", dataId, file)

    def extractDataId(self, file, headers):
        """Return the Data ID dictionary that should be used to label a file.

        Parameters
        ----------
        file : `str` or path-like object
            Absolute path to the file being ingested (prior to any transfers).
        headers : `list` of `~lsst.daf.base.PropertyList`
            All headers returned by `readHeaders()`.

        Returns
        -------
        dataId : `dict`
            Must include "instrument", "detector", and "exposure" keys.  If
            the Exposure is associated with a PhysicalFilter and/or Visit,
            "physical_filter" and "visit" keys should be provided as well
            (respectively).
        """
        obsInfo = ObservationInfo(headers[0])
        return {
            "instrument": obsInfo.instrument,
            "exposure": obsInfo.exposure_id,
            "visit": obsInfo.visit_id,
            "detector": obsInfo.detector_num,
            "physical_filter": obsInfo.physical_filter,
        }

    def extractVisitEntry(self, file, headers, dataId, associated):
        """Create a Visit DataUnit entry from raw file metadata.

        Parameters
        ----------
        file : `str` or path-like object
            Absolute path to the file being ingested (prior to any transfers).
        headers : `list` of `~lsst.daf.base.PropertyList`
            All headers returned by `readHeaders()`.
        dataId : `dict`
            The data ID for this file.  Implementations are permitted to
            modify this dictionary (generally by stripping off "detector" and
            "exposure" and adding new metadata key-value pairs) and return it.
        associated : `dict`
            A dictionary containing other associated DataUnit entries.
            Guaranteed to have "Instrument", "Detector", and "PhysicalFilter"
            keys, but the last may map to ``None`` if `extractDataId` either
            did not contain a "physical_filter" key or mapped it to ``None``.
            This method also adds an "ObsInfo" key containing an
            `~astro_metadata_translator.ObservationInfo` object for use by
            `extractExposureEntry`.

        Returns
        -------
        entry : `dict`
            Dictionary corresponding to a Visit database table row.
            Must have all non-null columns in the Visit table as keys.
        """
        obsInfo = ObservationInfo(headers[0])
        associated["ObsInfo"] = obsInfo
        del dataId["detector"]
        del dataId["exposure"]
        return makeVisitEntryFromObsInfo(dataId, obsInfo)

    def extractExposureEntry(self, file, headers, dataId, associated):
        """Create an Exposure DataUnit entry from raw file metadata.

        Parameters
        ----------
        file : `str` or path-like object
            Absolute path to the file being ingested (prior to any transfers).
        headers : `list` of `~lsst.daf.base.PropertyList`
            All headers returned by `readHeaders()`.
        dataId : `dict`
            The data ID for this file.  Implementations are permitted to
            modify this dictionary (generally by stripping off "detector" and
            adding new metadata key-value pairs) and return it.
        associated : `dict`
            A dictionary containing other associated DataUnit entries.
            Guaranteed to have "Instrument", "Detector", "PhysicalFilter", and
            "Visit" keys, but the latter two may map to ``None`` if
            `extractDataId` did not contain keys for these or mapped them to
            ``None``.  May also contain additional keys added by
            `extractVisitEntry`.

        Returns
        -------
        entry : `dict`
            Dictionary corresponding to an Exposure database table row.
            Must have all non-null columns in the Exposure table as keys.
        """
        try:
            obsInfo = associated["ObsInfo"]
        except KeyError:
            obsInfo = ObservationInfo(headers[0])
        del dataId["detector"]
        return makeExposureEntryFromObsInfo(dataId, obsInfo)

    def getFormatter(self, file, headers, dataId):
        """Return the Formatter that should be used to read this file after
        ingestion.

        The default implementation returns `None`, which uses the Formatter
        configured for this DatasetType/StorageClass in the Butler.
        """
        return None
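

# A minimal subclassing sketch (assumptions, not a real instrument package):
# "MyCamRawIngestTask" and "MyCamRawFormatter" are hypothetical names, and the
# Butler construction below assumes a Gen3 Butler can be built from a
# repository path with a ``run`` keyword.
#
#     class MyCamRawIngestTask(RawIngestTask):
#         """Ingest raw frames for the hypothetical MyCam instrument."""
#
#         def getFormatter(self, file, headers, dataId):
#             # Read raws with a dedicated Formatter instead of the
#             # Butler-configured default.
#             return MyCamRawFormatter()
#
# A driver script might then do:
#
#     from lsst.daf.butler import Butler
#
#     butler = Butler("/path/to/repo", run="raw/mycam")
#     task = MyCamRawIngestTask(config=RawIngestConfig(), butler=butler)
#     task.run(raw_files)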