lsst.obs.base  16.0-26-gc0e79ff+5
ingest.py
# This file is part of obs_base.
#
# Developed for the LSST Data Management System.
# This product includes software developed by the LSST Project
# (http://www.lsst.org).
# See the COPYRIGHT file at the top-level directory of this distribution
# for details of code ownership.
#
# This program is free software: you can redistribute it and/or modify
# it under the terms of the GNU General Public License as published by
# the Free Software Foundation, either version 3 of the License, or
# (at your option) any later version.
#
# This program is distributed in the hope that it will be useful,
# but WITHOUT ANY WARRANTY; without even the implied warranty of
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
# GNU General Public License for more details.
#
# You should have received a copy of the GNU General Public License
# along with this program.  If not, see <http://www.gnu.org/licenses/>.


__all__ = ("RawIngestTask", "RawIngestConfig")

import os.path
from abc import ABCMeta

from astro_metadata_translator import ObservationInfo
from lsst.afw.image import readMetadata
from lsst.daf.butler import DatasetType, StorageClassFactory, Run, DataId
from lsst.daf.butler.instrument import (Instrument, updateExposureEntryFromObsInfo,
                                        updateVisitEntryFromObsInfo)
from lsst.pex.config import Config, Field, ChoiceField
from lsst.pipe.base import Task


class IngestConflictError(RuntimeError):
    pass


class RawIngestConfig(Config):
    transfer = ChoiceField(
        ("How to transfer files (None for no transfer)."),
        dtype=str,
        allowed={"move": "move",
                 "copy": "copy",
                 "hardlink": "hard link",
                 "symlink": "symbolic (soft) link"},
        optional=True,
    )
    conflict = ChoiceField(
        ("What to do if a raw Dataset with the same data ID as an "
         "ingested file already exists in the Butler's Collection."),
        dtype=str,
        allowed={"ignore": ("Do not add the new file to the Collection.  If "
                            "'stash' is not None, the new file will be "
                            "ingested into the stash Collection instead."),
                 "fail": ("Raise RuntimeError if a conflict is encountered "
                          "(which may then be caught if onError == 'continue')."),
                 },
        optional=False,
        default="ignore",
    )
    stash = Field(
        "Name of an alternate Collection to hold Datasets that lose conflicts.",
        dtype=str,
        default=None,
    )
    onError = ChoiceField(
        "What to do if an error (including fatal conflicts) occurs.",
        dtype=str,
        allowed={"continue": "Warn and continue with the next file.",
                 "break": ("Stop processing immediately, but leave "
                           "already-ingested datasets in the repository."),
                 "rollback": ("Stop processing and attempt to remove already-"
                              "ingested datasets from the repository."),
                 },
        optional=False,
        default="continue",
    )
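
# A minimal configuration sketch.  The values below are illustrative
# assumptions, not defaults; the allowed choices are documented in the
# fields above.
#
#     config = RawIngestConfig()
#     config.transfer = "symlink"   # ingest via symbolic links
#     config.conflict = "ignore"    # keep going when a data ID already exists
#     config.stash = "raw/stash"    # hypothetical Collection for conflict losers
#     config.onError = "rollback"   # make the whole run() call atomic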


class RawIngestTask(Task, metaclass=ABCMeta):
    """Driver Task for ingesting raw data into Gen3 Butler repositories.

    This Task is intended to be runnable from the command-line, but it
    doesn't meet the other requirements of CmdLineTask or PipelineTask, and
    wouldn't gain much from being one.  It also wouldn't really be
    appropriate as a subtask of a CmdLineTask or PipelineTask; it's a Task
    essentially just to leverage the logging and configurability
    functionality that the Task base class provides.

    Each instance of `RawIngestTask` writes to the same Butler and maintains
    a cache of Dimension entries that have already been added to or extracted
    from its Registry.  Each invocation of `RawIngestTask.run` ingests a list
    of files (possibly semi-atomically; see `RawIngestConfig.onError`).

    RawIngestTask should be subclassed to specialize ingest for the actual
    structure of raw data files produced by a particular instrument.
    Subclasses may provide alternate implementations of `readHeaders` and
    `extractDataId`, and may also wish to override `getFormatter` and/or
    (rarely) `getDatasetType`.  We do not anticipate overriding `run`,
    `ensureDimensions`, `ingestFile`, or `processFile` to ever be necessary.

    Parameters
    ----------
    config : `RawIngestConfig`
        Configuration for whether/how to transfer files and how to handle
        conflicts and errors.
    butler : `~lsst.daf.butler.Butler`
        Butler instance.  Ingested Datasets will be created as part of
        ``butler.run`` and associated with its Collection.

    Other keyword arguments are forwarded to the Task base class constructor.
    """

    ConfigClass = RawIngestConfig

    _DefaultName = "ingest"

    @classmethod
    def getDatasetType(cls):
        """Return the DatasetType of the Datasets ingested by this Task.
        """
        return DatasetType("raw", ("Instrument", "Detector", "Exposure"),
                           StorageClassFactory().getStorageClass("Exposure"))
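
    # Datasets ingested with this DatasetType can later be retrieved with a
    # data ID naming those dimensions; a sketch with hypothetical values:
    #
    #     raw = butler.get("raw", {"instrument": "MyCamera",
    #                              "exposure": 42, "detector": 7})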

    def __init__(self, config=None, *, butler, **kwds):
        super().__init__(config, **kwds)
        self.butler = butler
        self.datasetType = self.getDatasetType()
        self.dimensions = butler.registry.dimensions.extract(["Instrument", "Detector", "PhysicalFilter",
                                                              "Visit", "Exposure"])
        # Dictionary of {Dimension: set(DataId)} indicating Dimension entries
        # we know are in the Registry.
        self.dimensionEntriesDone = {k: set() for k in self.dimensions}
        # Cache of Instrument instances retrieved from Registry; needed to
        # look up formatters.
        self.instrumentCache = {}
        # (Possibly) create a Run object for the "stash": where we put
        # datasets that lose conflicts.  Note that this doesn't actually add
        # this Run to the Registry; we only do that on first use.
        self.stashRun = Run(self.config.stash) if self.config.stash is not None else None

    def run(self, files):
        """Ingest files into a Butler data repository.

        This creates any new Exposure or Visit Dimension entries needed to
        identify the ingested files, creates new Dataset entries in the
        Registry and finally ingests the files themselves into the Datastore.
        Any needed Instrument, Detector, and PhysicalFilter Dimension entries
        must exist in the Registry before `run` is called.

        Parameters
        ----------
        files : iterable over `str` or path-like objects
            Paths to the files to be ingested.  Will be made absolute
            if they are not already.
        """
        self.butler.registry.registerDatasetType(self.getDatasetType())
        if self.config.onError == "rollback":
            with self.butler.transaction():
                for file in files:
                    self.processFile(os.path.abspath(file))
        elif self.config.onError == "break":
            for file in files:
                self.processFile(os.path.abspath(file))
        elif self.config.onError == "continue":
            for file in files:
                try:
                    self.processFile(os.path.abspath(file))
                except Exception as err:
                    self.log.warnf("Error processing '{}': {}", file, err)
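
    # A minimal usage sketch.  The repository path, run name, file names,
    # and the instrument-specific subclass are hypothetical:
    #
    #     from lsst.daf.butler import Butler
    #
    #     butler = Butler("/path/to/repo", run="raw")
    #     task = MyCameraRawIngestTask(config=config, butler=butler)
    #     task.run(["raw_0001.fits", "raw_0002.fits"])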

    def readHeaders(self, file):
        """Read and return any relevant headers from the given file.

        The default implementation simply reads the header of the first
        non-empty HDU, so it always returns a single-element list.

        Parameters
        ----------
        file : `str` or path-like object
            Absolute path to the file to be ingested.

        Returns
        -------
        headers : `list` of `~lsst.daf.base.PropertyList`
            Single-element list containing the header of the first
            non-empty HDU.
        """
        return [readMetadata(file)]
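
    # A sketch of a subclass override for multi-extension raw files; the
    # HDU range here is an illustrative assumption (readMetadata accepts an
    # ``hdu`` index argument):
    #
    #     def readHeaders(self, file):
    #         # Assume the interesting headers live in the first three
    #         # extension HDUs.
    #         return [readMetadata(file, hdu=n) for n in range(1, 4)]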

    def ensureDimensions(self, file):
        """Extract metadata from a raw file and add Exposure and Visit
        Dimension entries.

        Any needed Instrument, Detector, and PhysicalFilter Dimension
        entries must exist in the Registry before `run` is called.

        Parameters
        ----------
        file : `str` or path-like object
            Absolute path to the file to be ingested.

        Returns
        -------
        headers : `list` of `~lsst.daf.base.PropertyList`
            Result of calling `readHeaders`.
        dataId : `DataId`
            Data ID dictionary, as returned by `extractDataId`.
        """
        headers = self.readHeaders(file)
        obsInfo = ObservationInfo(headers[0])

        # Extract a DataId that covers all of self.dimensions.
        fullDataId = self.extractDataId(file, headers, obsInfo=obsInfo)

        for dimension in self.dimensions:
            dimensionDataId = DataId(fullDataId, dimension=dimension)
            if dimensionDataId not in self.dimensionEntriesDone[dimension]:
                # Next look in the Registry.
                dimensionEntryDict = self.butler.registry.findDimensionEntry(dimension, dimensionDataId)
                if dimensionEntryDict is None:
                    if dimension.name in ("Visit", "Exposure"):
                        # Add the entry into the Registry.
                        self.butler.registry.addDimensionEntry(dimension, dimensionDataId)
                    else:
                        raise LookupError(
                            f"Entry for {dimension.name} with ID {dimensionDataId} not found; must be "
                            f"present in Registry prior to ingest."
                        )
                # Record that we've handled this entry.
                self.dimensionEntriesDone[dimension].add(dimensionDataId)

        return headers, fullDataId

    def ingestFile(self, file, headers, dataId, run=None):
        """Ingest a single raw file into the repository.

        All necessary Dimension entries must already be present.

        This method is not transactional; it must be wrapped in a
        ``with self.butler.transaction()`` block to make per-file ingest
        atomic.

        Parameters
        ----------
        file : `str` or path-like object
            Absolute path to the file to be ingested.
        headers : `list` of `~lsst.daf.base.PropertyList`
            Result of calling `readHeaders`.
        dataId : `dict`
            Data ID dictionary, as returned by `extractDataId`.
        run : `~lsst.daf.butler.Run`, optional
            Run to add the Dataset to; defaults to ``self.butler.run``.
        """
        if run is None:
            run = self.butler.run

        # Add a Dataset entry to the Registry.
        try:
            # We use transactional=False here (a kwarg added by the
            # @transactional decorator) to keep the conflict exception from
            # starting a higher-level rollback - if we catch this exception,
            # we don't want to have already started rolling back the ingest
            # of *previous* files when config.onError == 'rollback' but
            # config.conflict == 'ignore'.
            ref = self.butler.registry.addDataset(self.datasetType, dataId, run=run,
                                                  transactional=False, recursive=True)
        except ValueError:
            raise IngestConflictError("Ingest conflict on {} {}".format(file, dataId))

        # Ingest it into the Datastore.
        self.butler.datastore.ingest(file, ref, formatter=self.getFormatter(file, headers, dataId),
                                     transfer=self.config.transfer)
        return None

    def processFile(self, file):
        """Ingest a single raw data file after extracting metadata.

        This creates any new Exposure or Visit Dimension entries needed to
        identify the ingested file, creates a new Dataset entry in the
        Registry and finally ingests the file itself into the Datastore.
        Any needed Instrument, Detector, and PhysicalFilter Dimension
        entries must exist in the Registry before `run` is called.

        Parameters
        ----------
        file : `str` or path-like object
            Absolute path to the file to be ingested.
        """
        headers, dataId = self.ensureDimensions(file)
        # We want ingesting a single file to be atomic even if we are
        # not trying to ingest the list of files atomically.
        with self.butler.transaction():
            try:
                self.ingestFile(file, headers, dataId)
                return
            except IngestConflictError:
                if self.config.conflict == "fail":
                    raise
        # A conflict was encountered and config.conflict == "ignore":
        # either stash the file in the alternate Collection or skip it.
        if self.config.conflict == "ignore":
            if self.stashRun is not None:
                if self.stashRun.id is None:
                    self.butler.registry.ensureRun(self.stashRun)
                self.log.infof("Conflict on {} ({}); ingesting to stash '{}' instead.",
                               dataId, file, self.config.stash)
                with self.butler.transaction():
                    self.ingestFile(file, headers, dataId, run=self.stashRun)
            else:
                self.log.infof("Conflict on {} ({}); ignoring.", dataId, file)

    def extractDataId(self, file, headers, obsInfo):
        """Return the Data ID dictionary that should be used to label a file.

        Parameters
        ----------
        file : `str` or path-like object
            Absolute path to the file being ingested (prior to any transfers).
        headers : `list` of `~lsst.daf.base.PropertyList`
            All headers returned by `readHeaders()`.
        obsInfo : `astro_metadata_translator.ObservationInfo`
            Observational metadata extracted from the headers.

        Returns
        -------
        dataId : `DataId`
            A mapping whose key-value pairs uniquely identify raw datasets.
            Must have ``dataId.dimensions <= self.dimensions``, with at least
            Instrument, Exposure, and Detector present.
        """
        toRemove = set()
        if obsInfo.visit_id is None:
            toRemove.add("Visit")
        if obsInfo.physical_filter is None:
            toRemove.add("PhysicalFilter")
        if toRemove:
            dimensions = self.dimensions.difference(toRemove)
        else:
            dimensions = self.dimensions
        dataId = DataId(
            dimensions=dimensions,
            instrument=obsInfo.instrument,
            exposure=obsInfo.exposure_id,
            visit=obsInfo.visit_id,
            detector=obsInfo.detector_num,
            physical_filter=obsInfo.physical_filter,
        )
        updateExposureEntryFromObsInfo(dataId, obsInfo)
        if obsInfo.visit_id is not None:
            updateVisitEntryFromObsInfo(dataId, obsInfo)
        return dataId
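
    # A sketch of a subclass specialization; the subclass name and the
    # "CCDNUM" header card are illustrative assumptions:
    #
    #     class MyCameraRawIngestTask(RawIngestTask):
    #
    #         def extractDataId(self, file, headers, obsInfo):
    #             dataId = super().extractDataId(file, headers, obsInfo)
    #             # Override the detector entry with a value parsed from a
    #             # camera-specific header card.
    #             dataId["detector"] = int(headers[0].getScalar("CCDNUM"))
    #             return dataId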

    def getFormatter(self, file, headers, dataId):
        """Return the Formatter that should be used to read this file after
        ingestion.

        The default implementation obtains the formatter from the Instrument
        class for the given data ID.
        """
        instrument = self.instrumentCache.get(dataId["instrument"])
        if instrument is None:
            instrument = Instrument.factories[dataId["instrument"]]()
            self.instrumentCache[dataId["instrument"]] = instrument
        return instrument.getRawFormatter(dataId)
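
    # A sketch of an override that skips the Instrument lookup and always
    # returns one formatter (MyCameraRawFormatter is a hypothetical
    # Formatter subclass, not part of this module):
    #
    #     def getFormatter(self, file, headers, dataId):
    #         return MyCameraRawFormatter()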