lsst.obs.base  17.0.1-11-g20c7f65+4
ingest.py
# This file is part of obs_base.
#
# Developed for the LSST Data Management System.
# This product includes software developed by the LSST Project
# (http://www.lsst.org).
# See the COPYRIGHT file at the top-level directory of this distribution
# for details of code ownership.
#
# This program is free software: you can redistribute it and/or modify
# it under the terms of the GNU General Public License as published by
# the Free Software Foundation, either version 3 of the License, or
# (at your option) any later version.
#
# This program is distributed in the hope that it will be useful,
# but WITHOUT ANY WARRANTY; without even the implied warranty of
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
# GNU General Public License for more details.
#
# You should have received a copy of the GNU General Public License
# along with this program. If not, see <http://www.gnu.org/licenses/>.


__all__ = ("RawIngestTask", "RawIngestConfig")

import os.path
from abc import ABCMeta

# This should really be an error that is caught in daf butler and rethrown
# with our own, but it is not, so this exists here pending some error
# refactoring in daf butler.
from sqlalchemy.exc import IntegrityError

from astro_metadata_translator import ObservationInfo
from lsst.afw.image import readMetadata, bboxFromMetadata
from lsst.afw.geom import SkyWcs
from lsst.daf.butler import DatasetType, StorageClassFactory, Run, DataId, ConflictingDefinitionError
from lsst.daf.butler.instrument import (Instrument, updateExposureEntryFromObsInfo,
                                        updateVisitEntryFromObsInfo)
from lsst.geom import Box2D
from lsst.pex.config import Config, Field, ChoiceField
from lsst.pipe.base import Task
from lsst.sphgeom import ConvexPolygon


class IngestConflictError(ConflictingDefinitionError):
    pass


class RawIngestConfig(Config):
    transfer = ChoiceField(
        ("How to transfer files (None for no transfer)."),
        dtype=str,
        allowed={"move": "move",
                 "copy": "copy",
                 "hardlink": "hard link",
                 "symlink": "symbolic (soft) link"},
        optional=True,
    )
    conflict = ChoiceField(
        ("What to do if a raw Dataset with the same data ID as an "
         "ingested file already exists in the Butler's Collection."),
        dtype=str,
        allowed={"ignore": ("Do not add the new file to the Collection. If "
                            "'stash' is not None, the new file will be "
                            "ingested into the stash Collection instead."),
                 "fail": ("Raise RuntimeError if a conflict is encountered "
                          "(which may then be caught if onError == 'continue')."),
                 },
        optional=False,
        default="ignore",
    )
    stash = Field(
        "Name of an alternate Collection to hold Datasets that lose conflicts.",
        dtype=str,
        default=None,
    )
    onError = ChoiceField(
        "What to do if an error (including fatal conflicts) occurs.",
        dtype=str,
        allowed={"continue": "Warn and continue with the next file.",
                 "break": ("Stop processing immediately, but leave "
                           "already-ingested datasets in the repository."),
                 "rollback": ("Stop processing and attempt to remove already-"
                              "ingested datasets from the repository."),
                 },
        optional=False,
        default="continue",
    )
    doAddRegions = Field(
        dtype=bool,
        default=True,
        doc="Add visit/detector regions to the Registry when ingesting files."
    )
    padRegionAmount = Field(
        dtype=int,
        default=0,
        doc="Pad the image bounding box by this many pixels before computing its region."
    )
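

# A minimal sketch of adjusting this config from Python; the values shown
# below are illustrative, not defaults:
#
#     config = RawIngestConfig()
#     config.transfer = "symlink"      # ingest via symbolic links
#     config.conflict = "ignore"       # skip files whose data ID already exists
#     config.stash = "raw-conflicts"   # hypothetical stash Collection name
#     config.onError = "rollback"      # undo everything if any file fails
#     config.padRegionAmount = 100     # pad regions by 100 pixels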
class RawIngestTask(Task, metaclass=ABCMeta):
    """Driver Task for ingesting raw data into Gen3 Butler repositories.

    This Task is intended to be runnable from the command-line, but it
    doesn't meet the other requirements of CmdLineTask or PipelineTask, and
    wouldn't gain much from being one. It also wouldn't really be
    appropriate as a subtask of a CmdLineTask or PipelineTask; it's a Task
    essentially just to leverage the logging and configurability
    functionality that Task provides.

    Each instance of `RawIngestTask` writes to the same Butler and maintains
    a cache of Dimension entries that have already been added to or
    extracted from its Registry. Each invocation of `RawIngestTask.run`
    ingests a list of files (possibly semi-atomically; see
    `RawIngestConfig.onError`).

    RawIngestTask should be subclassed to specialize ingest for the actual
    structure of raw data files produced by a particular instrument.
    Subclasses must either provide populated `MetadataReader` instances in
    the `dataIdReader`, `visitReader`, and `exposureReader` class
    attributes, or alternate implementations of the `extractDataId`,
    `extractVisit`, and `extractExposure` methods that do not use those
    attributes (each attribute-method pair may be handled differently).
    Subclasses may also wish to override `getFormatter` and/or (rarely)
    `getDatasetType`. We do not anticipate overriding `run`,
    `ensureDimensions`, `ingestFile`, or `processFile` to ever be necessary.

    Parameters
    ----------
    config : `RawIngestConfig`
        Configuration for whether/how to transfer files and how to handle
        conflicts and errors.
    butler : `~lsst.daf.butler.Butler`
        Butler instance. Ingested Datasets will be created as part of
        ``butler.run`` and associated with its Collection.

    Other keyword arguments are forwarded to the Task base class
    constructor.
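
    Examples
    --------
    A minimal, hypothetical invocation, assuming a concrete subclass
    ``MyCamRawIngestTask`` (a placeholder name; see the sketch at the end
    of this module) and an existing Gen3 repository at ``/path/to/repo``::

        from lsst.daf.butler import Butler

        butler = Butler("/path/to/repo", run="raw")
        task = MyCamRawIngestTask(butler=butler)
        task.run(["raw_0001.fits", "raw_0002.fits"])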
    """

    ConfigClass = RawIngestConfig

    _DefaultName = "ingest"

    @classmethod
    def getDatasetType(cls):
        """Return the DatasetType of the Datasets ingested by this Task.
        """
        return DatasetType("raw", ("instrument", "detector", "exposure"),
                           StorageClassFactory().getStorageClass("Exposure"))

    def __init__(self, config=None, *, butler, **kwds):
        super().__init__(config, **kwds)
        self.butler = butler
        self.datasetType = self.getDatasetType()
        self.dimensions = butler.registry.dimensions.extract(["instrument", "detector", "physical_filter",
                                                              "visit", "exposure"])
        # Dictionary of {Dimension: set(DataId)} indicating Dimension entries
        # we know are in the Registry.
        self.dimensionEntriesDone = {k: set() for k in self.dimensions}
        # Cache of instrument instances retrieved from Registry; needed to
        # look up formatters.
        self.instrumentCache = {}
        # (Possibly) create a Run object for the "stash": where we put
        # datasets that lose conflicts. Note that this doesn't actually add
        # this Run to the Registry; we only do that on first use.
        self.stashRun = Run(self.config.stash) if self.config.stash is not None else None
        self.visitRegions = {}

    def _addVisitRegions(self):
        """Add a region associated with a Visit to the Registry.

        Visits will be created using regions for the individual ccds that
        are defined in the ``visitRegions`` dict attribute on ``self``,
        joined against an existing region if one exists. The dict is keyed
        by ``(instrument, visit)`` tuples, with values that are lists of
        region vertices for the detectors associated with the visit.
        """
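        # For illustration (hypothetical values): after ingesting two CCDs
        # from visit 42 of instrument "MyCam", self.visitRegions would hold
        # {("MyCam", 42): [v1, v2, ...]}, where each element is a vertex
        # (unit vector) from a detector region's getVertices().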
        for (instrument, visit), vertices in self.visitRegions.items():
            # If there is an existing region it should be updated.
            existingRegion = self.butler.registry.expandDataId({"instrument": instrument, "visit": visit},
                                                               region=True).region
            if existingRegion is not None:
                vertices = list(existingRegion.getVertices()) + vertices
            region = ConvexPolygon(vertices)
            self.butler.registry.setDimensionRegion(instrument=instrument, visit=visit, region=region)

    def run(self, files):
        """Ingest files into a Butler data repository.

        This creates any new exposure or visit Dimension entries needed to
        identify the ingested files, creates new Dataset entries in the
        Registry and finally ingests the files themselves into the
        Datastore. Any needed instrument, detector, and physical_filter
        Dimension entries must exist in the Registry before `run` is called.

        Parameters
        ----------
        files : iterable over `str` or path-like objects
            Paths to the files to be ingested. Will be made absolute
            if they are not already.
        """
        self.butler.registry.registerDatasetType(self.getDatasetType())
        if self.config.onError == "rollback":
            with self.butler.transaction():
                for file in files:
                    self.processFile(os.path.abspath(file))
                if self.config.doAddRegions:
                    self._addVisitRegions()
        elif self.config.onError == "break":
            for file in files:
                self.processFile(os.path.abspath(file))
            if self.config.doAddRegions:
                self._addVisitRegions()
        elif self.config.onError == "continue":
            for file in files:
                try:
                    self.processFile(os.path.abspath(file))
                except Exception as err:
                    self.log.warnf("Error processing '{}': {}", file, err)
            if self.config.doAddRegions:
                self._addVisitRegions()

    def readHeaders(self, file):
        """Read and return any relevant headers from the given file.

        The default implementation simply reads the header of the first
        non-empty HDU, so it always returns a single-element list.

        Parameters
        ----------
        file : `str` or path-like object
            Absolute path to the file to be ingested.

        Returns
        -------
        headers : `list` of `~lsst.daf.base.PropertyList`
            Single-element list containing the header of the first
            non-empty HDU.
        """
        return [readMetadata(file)]

    def buildRegion(self, headers):
        """Build a region from information contained in a header.

        Parameters
        ----------
        headers : `lsst.daf.base.PropertyList`
            Property list containing the information from the header of
            one file.

        Returns
        -------
        region : `lsst.sphgeom.ConvexPolygon`
            Region corresponding to the (possibly padded) bounding box of
            the image, projected onto the sky.

        Raises
        ------
        ValueError
            Raised if the header keys required to construct the region
            cannot be found.
        """
        # The default implementation expects headers to be a one-element
        # list.
        header = headers[0]
        wcs = SkyWcs(header)
        bbox = Box2D(bboxFromMetadata(header))
        if self.config.padRegionAmount > 0:
            bbox.grow(self.config.padRegionAmount)
        corners = bbox.getCorners()
        sphCorners = [wcs.pixelToSky(point).getVector() for point in corners]
        return ConvexPolygon(sphCorners)

    def ensureDimensions(self, file):
        """Extract metadata from a raw file and add exposure and visit
        Dimension entries.

        Any needed instrument, detector, and physical_filter Dimension
        entries must exist in the Registry before `run` is called.

        Parameters
        ----------
        file : `str` or path-like object
            Absolute path to the file to be ingested.

        Returns
        -------
        headers : `list` of `~lsst.daf.base.PropertyList`
            Result of calling `readHeaders`.
        dataId : `DataId`
            Data ID dictionary, as returned by `extractDataId`.
        """
        headers = self.readHeaders(file)
        obsInfo = ObservationInfo(headers[0])

        # Extract a DataId that covers all of self.dimensions.
        fullDataId = self.extractDataId(file, headers, obsInfo=obsInfo)

        for dimension in self.dimensions:
            dimensionDataId = DataId(fullDataId, dimension=dimension)
            if dimensionDataId not in self.dimensionEntriesDone[dimension]:
                # Next look in the Registry.
                dimensionEntryDict = self.butler.registry.findDimensionEntry(dimension, dimensionDataId)
                if dimensionEntryDict is None:
                    if dimension.name in ("visit", "exposure"):
                        # Add the entry into the Registry.
                        self.butler.registry.addDimensionEntry(dimension, dimensionDataId)
                    else:
                        raise LookupError(
                            f"Entry for {dimension.name} with ID {dimensionDataId} not found; must be "
                            f"present in Registry prior to ingest."
                        )
                # Record that we've handled this entry.
                self.dimensionEntriesDone[dimension].add(dimensionDataId)
        # Do this after the loop to ensure all the dimensions are added.
        if self.config.doAddRegions:
            region = self.buildRegion(headers)
            try:
                self.butler.registry.setDimensionRegion(DataId(fullDataId,
                                                               dimensions=['visit', 'detector', 'instrument'],
                                                               region=region),
                                                        update=False)
                self.visitRegions.setdefault((fullDataId['instrument'], fullDataId['visit']),
                                             []).extend(region.getVertices())
            except IntegrityError:
                # This means that there were already regions for the
                # dimensions in the database, and nothing should be done.
                pass

        return headers, fullDataId

    def ingestFile(self, file, headers, dataId, run=None):
        """Ingest a single raw file into the repository.

        All necessary Dimension entries must already be present.

        This method is not transactional; it must be wrapped in a
        ``with self.butler.transaction():`` block to make per-file ingest
        atomic.

        Parameters
        ----------
        file : `str` or path-like object
            Absolute path to the file to be ingested.
        headers : `list` of `~lsst.daf.base.PropertyList`
            Result of calling `readHeaders`.
        dataId : `dict`
            Data ID dictionary, as returned by `extractDataId`.
        run : `~lsst.daf.butler.Run`, optional
            Run to add the Dataset to; defaults to ``self.butler.run``.
        """
        if run is None:
            run = self.butler.run

        # Add a Dataset entry to the Registry.
        try:
            ref = self.butler.registry.addDataset(self.datasetType, dataId, run=run, recursive=True)
        except ConflictingDefinitionError as err:
            raise IngestConflictError("Ingest conflict on {} {}".format(file, dataId)) from err

        # Ingest it into the Datastore.
        self.butler.datastore.ingest(file, ref, formatter=self.getFormatter(file, headers, dataId),
                                     transfer=self.config.transfer)
        return None

    def processFile(self, file):
        """Ingest a single raw data file after extracting metadata.

        This creates any new exposure or visit Dimension entries needed to
        identify the ingested file, creates a new Dataset entry in the
        Registry and finally ingests the file itself into the Datastore.
        Any needed instrument, detector, and physical_filter Dimension
        entries must exist in the Registry before `run` is called.

        Parameters
        ----------
        file : `str` or path-like object
            Absolute path to the file to be ingested.
        """
        headers, dataId = self.ensureDimensions(file)
        # We want ingesting a single file to be atomic even if we are
        # not trying to ingest the list of files atomically.
        with self.butler.transaction():
            try:
                self.ingestFile(file, headers, dataId)
                return
            except IngestConflictError:
                if self.config.conflict == "fail":
                    raise
        # A non-fatal conflict was caught above; handle it per the
        # 'conflict' config option.
        if self.config.conflict == "ignore":
            if self.stashRun is not None:
                if self.stashRun.id is None:
                    self.butler.registry.ensureRun(self.stashRun)
                self.log.infof("Conflict on {} ({}); ingesting to stash '{}' instead.",
                               dataId, file, self.config.stash)
                with self.butler.transaction():
                    self.ingestFile(file, headers, dataId, run=self.stashRun)
            else:
                self.log.infof("Conflict on {} ({}); ignoring.", dataId, file)

    def extractDataId(self, file, headers, obsInfo):
        """Return the Data ID dictionary that should be used to label a file.

        Parameters
        ----------
        file : `str` or path-like object
            Absolute path to the file being ingested (prior to any
            transfers).
        headers : `list` of `~lsst.daf.base.PropertyList`
            All headers returned by `readHeaders()`.
        obsInfo : `astro_metadata_translator.ObservationInfo`
            Observational metadata extracted from the headers.

        Returns
        -------
        dataId : `DataId`
            A mapping whose key-value pairs uniquely identify raw datasets.
            Must have ``dataId.dimensions() <= self.dimensions``, with at
            least instrument, exposure, and detector present.
        """
        toRemove = set()
        if obsInfo.visit_id is None:
            toRemove.add("visit")
        if obsInfo.physical_filter is None:
            toRemove.add("physical_filter")
        if toRemove:
            dimensions = self.dimensions.difference(toRemove)
        else:
            dimensions = self.dimensions
        dataId = DataId(
            dimensions=dimensions,
            instrument=obsInfo.instrument,
            exposure=obsInfo.exposure_id,
            visit=obsInfo.visit_id,
            detector=obsInfo.detector_num,
            physical_filter=obsInfo.physical_filter,
        )
        updateExposureEntryFromObsInfo(dataId, obsInfo)
        if obsInfo.visit_id is not None:
            updateVisitEntryFromObsInfo(dataId, obsInfo)
        return dataId

    def getFormatter(self, file, headers, dataId):
        """Return the Formatter that should be used to read this file after
        ingestion.

        The default implementation obtains the formatter from the Instrument
        class for the given data ID.
        """
        instrument = self.instrumentCache.get(dataId["instrument"])
        if instrument is None:
            instrument = Instrument.factories[dataId["instrument"]]()
            self.instrumentCache[dataId["instrument"]] = instrument
        return instrument.getRawFormatter(dataId)
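

# A minimal sketch of a concrete subclass (hypothetical names throughout;
# a real instrument package would supply its own Formatter and any data ID
# logic its headers require):
#
#     class MyCamRawIngestTask(RawIngestTask):
#
#         def getFormatter(self, file, headers, dataId):
#             # MyCamRawFormatter is a placeholder, not a real class.
#             return MyCamRawFormatter()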