lsst.obs.base  16.0-25-gaa04350
ingest.py
# This file is part of obs_base.
#
# Developed for the LSST Data Management System.
# This product includes software developed by the LSST Project
# (http://www.lsst.org).
# See the COPYRIGHT file at the top-level directory of this distribution
# for details of code ownership.
#
# This program is free software: you can redistribute it and/or modify
# it under the terms of the GNU General Public License as published by
# the Free Software Foundation, either version 3 of the License, or
# (at your option) any later version.
#
# This program is distributed in the hope that it will be useful,
# but WITHOUT ANY WARRANTY; without even the implied warranty of
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
# GNU General Public License for more details.
#
# You should have received a copy of the GNU General Public License
# along with this program.  If not, see <http://www.gnu.org/licenses/>.


__all__ = ("RawIngestTask", "RawIngestConfig")

import os.path
from abc import ABCMeta

from astro_metadata_translator import ObservationInfo
from lsst.afw.image import readMetadata
from lsst.daf.butler import DatasetType, StorageClassFactory, Run, DataId
from lsst.daf.butler.instrument import updateExposureEntryFromObsInfo, updateVisitEntryFromObsInfo
from lsst.pex.config import Config, Field, ChoiceField
from lsst.pipe.base import Task


class IngestConflictError(RuntimeError):
    pass


class RawIngestConfig(Config):
    transfer = ChoiceField(
        "How to transfer files (None for no transfer).",
        dtype=str,
        allowed={"move": "move",
                 "copy": "copy",
                 "hardlink": "hard link",
                 "symlink": "symbolic (soft) link"},
        optional=True,
    )
    conflict = ChoiceField(
        ("What to do if a raw Dataset with the same data ID as an "
         "ingested file already exists in the Butler's Collection."),
        dtype=str,
        allowed={"ignore": ("Do not add the new file to the Collection.  If "
                            "'stash' is not None, the new file will be "
                            "ingested into the stash Collection instead."),
                 "fail": ("Raise RuntimeError if a conflict is encountered "
                          "(which may then be caught if onError == 'continue')."),
                 },
        optional=False,
        default="ignore",
    )
    stash = Field(
        "Name of an alternate Collection to hold Datasets that lose conflicts.",
        dtype=str,
        default=None,
    )
    onError = ChoiceField(
        "What to do if an error (including fatal conflicts) occurs.",
        dtype=str,
        allowed={"continue": "Warn and continue with the next file.",
                 "break": ("Stop processing immediately, but leave "
                           "already-ingested datasets in the repository."),
                 "rollback": ("Stop processing and attempt to remove already-"
                              "ingested datasets from the repository."),
                 },
        optional=False,
        default="continue",
    )

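# A minimal configuration sketch (illustrative, not part of the original
# file): these options would typically be set on a config instance before
# constructing the task.  The values below are examples, not defaults.
#
#     config = RawIngestConfig()
#     config.transfer = "symlink"   # link rather than copy the raw files
#     config.conflict = "ignore"    # skip (or stash) duplicate data IDs
#     config.stash = "raw/stash"    # hypothetical Collection name
#     config.onError = "rollback"   # make a whole run() invocation atomic
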
class RawIngestTask(Task, metaclass=ABCMeta):
    """Driver Task for ingesting raw data into Gen3 Butler repositories.

    This Task is intended to be runnable from the command-line, but it doesn't
    meet the other requirements of CmdLineTask or PipelineTask, and wouldn't
    gain much from being one.  It also wouldn't really be appropriate as a
    subtask of a CmdLineTask or PipelineTask; it's a Task essentially just to
    leverage the logging and configurability functionality that Task provides.

    Each instance of `RawIngestTask` writes to the same Butler and maintains a
    cache of Dimension entries that have already been added to or extracted
    from its Registry.  Each invocation of `RawIngestTask.run` ingests a list
    of files (possibly semi-atomically; see `RawIngestConfig.onError`).

    RawIngestTask should be subclassed to specialize ingest for the actual
    structure of raw data files produced by a particular instrument.
    Subclasses must either provide populated `MetadataReader` instances in the
    `dataIdReader`, `visitReader`, and `exposureReader` class attributes, or
    alternate implementations of the `extractDataId`, `extractVisit`, and
    `extractExposure` methods that do not use those attributes (each
    attribute-method pair may be handled differently).  Subclasses may also
    wish to override `getFormatter` and/or (rarely) `getDatasetType`.  We do
    not anticipate that overriding `run`, `ensureDimensions`, `ingestFile`, or
    `processFile` will ever be necessary.

    Parameters
    ----------
    config : `RawIngestConfig`
        Configuration for whether/how to transfer files and how to handle
        conflicts and errors.
    butler : `~lsst.daf.butler.Butler`
        Butler instance.  Ingested Datasets will be created as part of
        ``butler.run`` and associated with its Collection.

    Other keyword arguments are forwarded to the Task base class constructor.
    """

    ConfigClass = RawIngestConfig

    _DefaultName = "ingest"

    @classmethod
    def getDatasetType(cls):
        """Return the DatasetType of the Datasets ingested by this Task.
        """
        return DatasetType("raw", ("Instrument", "Detector", "Exposure"),
                           StorageClassFactory().getStorageClass("Exposure"))

    def __init__(self, config=None, *, butler, **kwds):
        super().__init__(config, **kwds)
        self.butler = butler
        self.datasetType = self.getDatasetType()
        self.dimensions = butler.registry.dimensions.extract(["Instrument", "Detector", "PhysicalFilter",
                                                              "Visit", "Exposure"])
        # Dictionary of {Dimension: set(DataId)} indicating Dimension entries
        # we know are in the Registry.
        self.dimensionEntriesDone = {k: set() for k in self.dimensions}
        # (Possibly) create a Run object for the "stash": where we put
        # datasets that lose conflicts.  Note that this doesn't actually add
        # this Run to the Registry; we only do that on first use.
        self.stashRun = Run(self.config.stash) if self.config.stash is not None else None

    def run(self, files):
        """Ingest files into a Butler data repository.

        This creates any new Exposure or Visit Dimension entries needed to
        identify the ingested files, creates new Dataset entries in the
        Registry and finally ingests the files themselves into the Datastore.
        Any needed Instrument, Detector, and PhysicalFilter Dimension entries
        must exist in the Registry before `run` is called.

        Parameters
        ----------
        files : iterable over `str` or path-like objects
            Paths to the files to be ingested.  Will be made absolute
            if they are not already.
        """
        self.butler.registry.registerDatasetType(self.getDatasetType())
        if self.config.onError == "rollback":
            with self.butler.transaction():
                for file in files:
                    self.processFile(os.path.abspath(file))
        elif self.config.onError == "break":
            for file in files:
                self.processFile(os.path.abspath(file))
        elif self.config.onError == "continue":
            for file in files:
                try:
                    self.processFile(os.path.abspath(file))
                except Exception as err:
                    self.log.warnf("Error processing '{}': {}", file, err)

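    # A hedged usage sketch (not part of the original file): driving an
    # ingest from Python.  ``MyCameraRawIngestTask`` stands in for a
    # hypothetical instrument-specific subclass, the repository path is
    # made up, and the Butler construction details are assumptions here.
    #
    #     from lsst.daf.butler import Butler
    #
    #     butler = Butler("/path/to/gen3/repo", run="raw")
    #     task = MyCameraRawIngestTask(butler=butler)
    #     task.run(["raw_0001.fits", "raw_0002.fits"])
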
    def readHeaders(self, file):
        """Read and return any relevant headers from the given file.

        The default implementation simply reads the header of the first
        non-empty HDU, so it always returns a single-element list.

        Parameters
        ----------
        file : `str` or path-like object
            Absolute path to the file to be ingested.

        Returns
        -------
        headers : `list` of `~lsst.daf.base.PropertyList`
            Single-element list containing the header of the first
            non-empty HDU.
        """
        return [readMetadata(file)]

    def ensureDimensions(self, file):
        """Extract metadata from a raw file and add Exposure and Visit
        Dimension entries.

        Any needed Instrument, Detector, and PhysicalFilter Dimension entries
        must exist in the Registry before `run` is called.

        Parameters
        ----------
        file : `str` or path-like object
            Absolute path to the file to be ingested.

        Returns
        -------
        headers : `list` of `~lsst.daf.base.PropertyList`
            Result of calling `readHeaders`.
        dataId : `DataId`
            Data ID dictionary, as returned by `extractDataId`.
        """
        headers = self.readHeaders(file)
        obsInfo = ObservationInfo(headers[0])

        # Extract a DataId that covers all of self.dimensions.
        fullDataId = self.extractDataId(file, headers, obsInfo=obsInfo)

        for dimension in self.dimensions:
            dimensionDataId = DataId(fullDataId, dimension=dimension)
            if dimensionDataId not in self.dimensionEntriesDone[dimension]:
                # Next look in the Registry.
                dimensionEntryDict = self.butler.registry.findDimensionEntry(dimension, dimensionDataId)
                if dimensionEntryDict is None:
                    if dimension.name in ("Visit", "Exposure"):
                        # Add the entry into the Registry.
                        self.butler.registry.addDimensionEntry(dimension, dimensionDataId)
                    else:
                        raise LookupError(
                            f"Entry for {dimension.name} with ID {dimensionDataId} not found; must be "
                            f"present in Registry prior to ingest."
                        )
                # Record that we've handled this entry.
                self.dimensionEntriesDone[dimension].add(dimensionDataId)

        return headers, fullDataId

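    # For orientation (an illustrative note, not from the original file):
    # the ``fullDataId`` returned above is a `DataId` whose keys match
    # ``self.dimensions``, populated by `extractDataId` below, e.g.
    #
    #     {"instrument": "HSC", "detector": 50, "physical_filter": "HSC-I",
    #      "visit": 904010, "exposure": 904010}
    #
    # where "HSC" and the numbers are made-up example values.
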
    def ingestFile(self, file, headers, dataId, run=None):
        """Ingest a single raw file into the repository.

        All necessary Dimension entries must already be present.

        This method is not transactional; it must be wrapped in a
        ``with self.butler.transaction()`` block to make per-file ingest
        atomic.

        Parameters
        ----------
        file : `str` or path-like object
            Absolute path to the file to be ingested.
        headers : `list` of `~lsst.daf.base.PropertyList`
            Result of calling `readHeaders`.
        dataId : `dict`
            Data ID dictionary, as returned by `extractDataId`.
        run : `~lsst.daf.butler.Run`, optional
            Run to add the Dataset to; defaults to ``self.butler.run``.
        """
        if run is None:
            run = self.butler.run

        # Add a Dataset entry to the Registry.
        try:
            # We use transactional=False here (a kwarg added by the
            # @transactional decorator) to keep the conflict exception from
            # starting a higher-level rollback - if we catch this exception,
            # we don't want to have already started rolling back the ingest
            # of *previous* files when config.onError == 'rollback' but
            # config.conflict == 'ignore'.
            ref = self.butler.registry.addDataset(self.datasetType, dataId, run=run,
                                                  transactional=False, recursive=True)
        except ValueError:
            raise IngestConflictError("Ingest conflict on {} {}".format(file, dataId))

        # Ingest it into the Datastore.
        self.butler.datastore.ingest(file, ref, formatter=self.getFormatter(file, headers, dataId),
                                     transfer=self.config.transfer)
        return None

    def processFile(self, file):
        """Ingest a single raw data file after extracting metadata.

        This creates any new Exposure or Visit Dimension entries needed to
        identify the ingested file, creates a new Dataset entry in the
        Registry and finally ingests the file itself into the Datastore.
        Any needed Instrument, Detector, and PhysicalFilter Dimension entries
        must exist in the Registry before `run` is called.

        Parameters
        ----------
        file : `str` or path-like object
            Absolute path to the file to be ingested.
        """
        headers, dataId = self.ensureDimensions(file)
        # We want ingesting a single file to be atomic even if we are
        # not trying to ingest the list of files atomically.
        with self.butler.transaction():
            try:
                self.ingestFile(file, headers, dataId)
                return
            except IngestConflictError:
                if self.config.conflict == "fail":
                    raise
        if self.config.conflict == "ignore":
            if self.stashRun is not None:
                if self.stashRun.id is None:
                    self.butler.registry.ensureRun(self.stashRun)
                self.log.infof("Conflict on {} ({}); ingesting to stash '{}' instead.",
                               dataId, file, self.config.stash)
                with self.butler.transaction():
                    self.ingestFile(file, headers, dataId, run=self.stashRun)
            else:
                self.log.infof("Conflict on {} ({}); ignoring.", dataId, file)

    def extractDataId(self, file, headers, obsInfo):
        """Return the Data ID dictionary that should be used to label a file.

        Parameters
        ----------
        file : `str` or path-like object
            Absolute path to the file being ingested (prior to any transfers).
        headers : `list` of `~lsst.daf.base.PropertyList`
            All headers returned by `readHeaders()`.
        obsInfo : `~astro_metadata_translator.ObservationInfo`
            Observational metadata constructed from ``headers[0]``.

        Returns
        -------
        dataId : `DataId`
            A mapping whose key-value pairs uniquely identify raw datasets.
            Must have ``dimensions`` equal to ``self.dimensions``.
        """
        toRemove = set()
        if obsInfo.visit_id is None:
            toRemove.add("Visit")
        if obsInfo.physical_filter is None:
            toRemove.add("PhysicalFilter")
        if toRemove:
            dimensions = self.dimensions.difference(toRemove)
        else:
            dimensions = self.dimensions
        dataId = DataId(
            dimensions=dimensions,
            instrument=obsInfo.instrument,
            exposure=obsInfo.exposure_id,
            visit=obsInfo.visit_id,
            detector=obsInfo.detector_num,
            physical_filter=obsInfo.physical_filter,
        )
        updateExposureEntryFromObsInfo(dataId, obsInfo)
        if obsInfo.visit_id is not None:
            updateVisitEntryFromObsInfo(dataId, obsInfo)
        return dataId

    def getFormatter(self, file, headers, dataId):
        """Return the Formatter that should be used to read this file after
        ingestion.

        The default implementation returns None, which uses the formatter
        configured for this DatasetType/StorageClass in the Butler.
        """
        return None
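

# A minimal subclass sketch (illustrative, not part of the original file):
# an instrument-specific ingest task typically only needs to specialize how
# data IDs and formatters are derived.  ``MyCameraRawIngestTask`` and
# ``MyCameraRawFormatter`` are hypothetical names.
#
#     class MyCameraRawIngestTask(RawIngestTask):
#
#         def extractDataId(self, file, headers, obsInfo):
#             # Fall back to the metadata-translator-based default.
#             return super().extractDataId(file, headers, obsInfo)
#
#         def getFormatter(self, file, headers, dataId):
#             return MyCameraRawFormatter()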