lsst.obs.base  17.0.1-14-g9a818eb+2
ingest.py
# This file is part of obs_base.
#
# Developed for the LSST Data Management System.
# This product includes software developed by the LSST Project
# (http://www.lsst.org).
# See the COPYRIGHT file at the top-level directory of this distribution
# for details of code ownership.
#
# This program is free software: you can redistribute it and/or modify
# it under the terms of the GNU General Public License as published by
# the Free Software Foundation, either version 3 of the License, or
# (at your option) any later version.
#
# This program is distributed in the hope that it will be useful,
# but WITHOUT ANY WARRANTY; without even the implied warranty of
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
# GNU General Public License for more details.
#
# You should have received a copy of the GNU General Public License
# along with this program. If not, see <http://www.gnu.org/licenses/>.


__all__ = ("RawIngestTask", "RawIngestConfig", "makeTransferChoiceField")

import os.path

# This should really be an error that is caught in daf_butler and rethrown
# with our own, but it is not, so this exists here pending some error
# refactoring in daf_butler.
from sqlalchemy.exc import IntegrityError

from astro_metadata_translator import ObservationInfo
from lsst.afw.image import readMetadata, bboxFromMetadata
from lsst.afw.geom import SkyWcs
from lsst.daf.butler import (DatasetType, StorageClassFactory, Run, DataId, ConflictingDefinitionError,
                             Butler)
from lsst.daf.butler.instrument import (Instrument, updateExposureEntryFromObsInfo,
                                        updateVisitEntryFromObsInfo)
from lsst.geom import Box2D
from lsst.pex.config import Config, Field, ChoiceField
from lsst.pipe.base import Task
from lsst.sphgeom import ConvexPolygon

class IngestConflictError(ConflictingDefinitionError):
    pass


def makeTransferChoiceField(doc="How to transfer files (None for no transfer).", default=None):
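    """Create a Config field for selecting how files are transferred into
    the data repository.

    The allowed values correspond to the transfer modes offered below:
    "move", "copy", "hardlink", and "symlink".  The field is optional, and
    `None` means the file is ingested at its current location (no transfer).

    Parameters
    ----------
    doc : `str`
        Documentation for the configuration field.
    default : `str`, optional
        Default transfer mode for the field.

    Returns
    -------
    field : `lsst.pex.config.ChoiceField`
        Configuration field for selecting a transfer mode.
    """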
    return ChoiceField(
        doc=doc,
        dtype=str,
        allowed={"move": "move",
                 "copy": "copy",
                 "hardlink": "hard link",
                 "symlink": "symbolic (soft) link"},
        optional=True,
        default=default
    )


class RawIngestConfig(Config):
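    """Configuration class for `RawIngestTask`.

    Controls how files are transferred into the Datastore and how data ID
    conflicts and other ingest errors are handled.
    """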
    transfer = makeTransferChoiceField()
    conflict = ChoiceField(
        ("What to do if a raw Dataset with the same data ID as an "
         "ingested file already exists in the Butler's Collection."),
        dtype=str,
        allowed={"ignore": ("Do not add the new file to the Collection. If "
                            "'stash' is not None, the new file will be "
                            "ingested into the stash Collection instead."),
                 "fail": ("Raise an IngestConflictError if a conflict is "
                          "encountered (which may then be caught if "
                          "onError == 'continue')."),
                 },
        optional=False,
        default="ignore",
    )
    stash = Field(
        "Name of an alternate Collection to hold Datasets that lose conflicts.",
        dtype=str,
        default=None,
    )
    onError = ChoiceField(
        "What to do if an error (including fatal conflicts) occurs.",
        dtype=str,
        allowed={"continue": "Warn and continue with the next file.",
                 "break": ("Stop processing immediately, but leave "
                           "already-ingested datasets in the repository."),
                 "rollback": ("Stop processing and attempt to remove already-"
                              "ingested datasets from the repository."),
                 },
        optional=False,
        default="continue",
    )
    doAddRegions = Field(
        dtype=bool,
        default=True,
        doc="Add visit/detector regions to the Registry when ingesting raw files."
    )
    padRegionAmount = Field(
        dtype=int,
        default=0,
        doc="Number of pixels by which to pad the image bounding box before calculating its region."
    )


class RawIngestTask(Task):
    """Driver Task for ingesting raw data into Gen3 Butler repositories.

    This Task is intended to be runnable from the command-line, but it doesn't
    meet the other requirements of CmdLineTask or PipelineTask, and wouldn't
    gain much from being one.  It also wouldn't really be appropriate as a
    subtask of a CmdLineTask or PipelineTask; it's a Task essentially just to
    leverage the logging and configurability functionality that Task provides.

    Each instance of `RawIngestTask` writes to the same Butler and maintains a
    cache of Dimension entries that have already been added to or extracted
    from its Registry.  Each invocation of `RawIngestTask.run` ingests a list
    of files (possibly semi-atomically; see `RawIngestConfig.onError`).

    RawIngestTask may be subclassed to specialize ingest for the actual
    structure of raw data files produced by a particular instrument, but this
    is usually unnecessary because the instrument-specific header extraction
    provided by ``astro_metadata_translator`` is typically sufficient.

    Parameters
    ----------
    config : `RawIngestConfig`
        Configuration for whether/how to transfer files and how to handle
        conflicts and errors.
    butler : `~lsst.daf.butler.Butler`
        Butler instance.  Ingested Datasets will be created as part of
        ``butler.run`` and associated with its Collection.

    Other keyword arguments are forwarded to the Task base class constructor.
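
    Examples
    --------
    A minimal usage sketch (the repository path, run name, and file names
    below are purely illustrative, and the instrument, detector, and
    physical_filter Dimension entries are assumed to already exist in the
    Registry):

    >>> from lsst.daf.butler import Butler
    >>> butler = Butler("/path/to/repo", run="raw")
    >>> task = RawIngestTask(butler=butler)
    >>> task.run(["raw_0001.fits", "raw_0002.fits"])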
    """

    ConfigClass = RawIngestConfig

    _DefaultName = "ingest"

    @classmethod
    def getDatasetType(cls):
        """Return the DatasetType of the Datasets ingested by this Task.
        """
        return DatasetType("raw", ("instrument", "detector", "exposure"),
                           StorageClassFactory().getStorageClass("Exposure"))

    def __init__(self, config=None, *, butler, **kwds):
        super().__init__(config, **kwds)
        self.butler = butler
        self.datasetType = self.getDatasetType()
        self.dimensions = butler.registry.dimensions.extract(["instrument", "detector", "physical_filter",
                                                              "visit", "exposure"])
        # Dictionary of {Dimension: set(DataId)} indicating Dimension entries
        # we know are in the Registry.
        self.dimensionEntriesDone = {k: set() for k in self.dimensions}
        # Cache of instrument instances retrieved from Registry; needed to
        # look up formatters.
        self.instrumentCache = {}
        # (Possibly) create a Run object for the "stash": where we put
        # datasets that lose conflicts.  Note that this doesn't actually add
        # this Run to the Registry; we only do that on first use.
        self.stashRun = Run(self.config.stash) if self.config.stash is not None else None
        self.visitRegions = {}

    def _addVisitRegions(self):
        """Add a region for each visit to the Registry.

        The region for each visit is built from the per-detector regions
        collected in the ``visitRegions`` dict on ``self``, joined against
        an existing visit region if one exists.  The dict is keyed by
        ``(instrument, visit)`` tuples, with values that are lists of region
        vertices for the detectors associated with that visit (for example,
        ``{("ExampleCam", 12345): [v0, v1, ...]}``, where the key values are
        illustrative).
        """
        for (instrument, visit), vertices in self.visitRegions.items():
            # If there is an existing region it should be updated.
            existingRegion = self.butler.registry.expandDataId({"instrument": instrument, "visit": visit},
                                                               region=True).region
            if existingRegion is not None:
                vertices = list(existingRegion.getVertices()) + vertices
            region = ConvexPolygon(vertices)
            self.butler.registry.setDimensionRegion(instrument=instrument, visit=visit, region=region)

    def run(self, files):
        """Ingest files into a Butler data repository.

        This creates any new exposure or visit Dimension entries needed to
        identify the ingested files, creates new Dataset entries in the
        Registry and finally ingests the files themselves into the Datastore.
        Any needed instrument, detector, and physical_filter Dimension
        entries must exist in the Registry before `run` is called.

        Parameters
        ----------
        files : iterable over `str` or path-like objects
            Paths to the files to be ingested.  Will be made absolute
            if they are not already.
        """
        self.butler.registry.registerDatasetType(self.getDatasetType())
        if self.config.onError == "rollback":
            with self.butler.transaction():
                for file in files:
                    self.processFile(os.path.abspath(file))
                if self.config.doAddRegions:
                    self._addVisitRegions()
        elif self.config.onError == "break":
            for file in files:
                self.processFile(os.path.abspath(file))
            if self.config.doAddRegions:
                self._addVisitRegions()
        elif self.config.onError == "continue":
            for file in files:
                try:
                    self.processFile(os.path.abspath(file))
                except Exception as err:
                    self.log.warnf("Error processing '{}': {}", file, err)
            if self.config.doAddRegions:
                self._addVisitRegions()

    def readHeaders(self, file):
        """Read and return any relevant headers from the given file.

        The default implementation simply reads the header of the first
        non-empty HDU, so it always returns a single-element list.

        Parameters
        ----------
        file : `str` or path-like object
            Absolute path to the file to be ingested.

        Returns
        -------
        headers : `list` of `~lsst.daf.base.PropertyList`
            Single-element list containing the header of the first
            non-empty HDU.
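
        Examples
        --------
        A subclass could, as a sketch, combine the primary header with a
        per-detector extension header (the HDU indices used here are purely
        illustrative)::

            def readHeaders(self, file):
                primary = readMetadata(file, 0)
                primary.combine(readMetadata(file, 1))
                return [primary]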
        """
        return [readMetadata(file)]

    def buildRegion(self, headers):
        """Build a region from the WCS and bounding-box information
        contained in a header.

        Parameters
        ----------
        headers : `list` of `~lsst.daf.base.PropertyList`
            Property lists containing the information from the headers of
            one file.

        Returns
        -------
        region : `lsst.sphgeom.ConvexPolygon`
            Region on the sky covered by the image, optionally padded by
            ``config.padRegionAmount`` pixels.

        Raises
        ------
        ValueError
            Raised if the header keys required to construct the region
            cannot be found.
        """
        # Default implementation is for headers to be a one-element list.
        header = headers[0]
        wcs = SkyWcs(header)
        bbox = Box2D(bboxFromMetadata(header))
        if self.config.padRegionAmount > 0:
            bbox.grow(self.config.padRegionAmount)
        corners = bbox.getCorners()
        sphCorners = [wcs.pixelToSky(point).getVector() for point in corners]
        return ConvexPolygon(sphCorners)

    def ensureDimensions(self, file):
        """Extract metadata from a raw file and add exposure and visit
        Dimension entries.

        Any needed instrument, detector, and physical_filter Dimension
        entries must exist in the Registry before `run` is called.

        Parameters
        ----------
        file : `str` or path-like object
            Absolute path to the file to be ingested.

        Returns
        -------
        headers : `list` of `~lsst.daf.base.PropertyList`
            Result of calling `readHeaders`.
        dataId : `DataId`
            Data ID dictionary, as returned by `extractDataId`.
        """
        headers = self.readHeaders(file)
        obsInfo = ObservationInfo(headers[0])

        # Extract a DataId that covers all of self.dimensions.
        fullDataId = self.extractDataId(file, headers, obsInfo=obsInfo)

        for dimension in self.dimensions:
            if fullDataId.get(dimension.name) is None:
                continue
            dimensionDataId = DataId(fullDataId, dimension=dimension)
            if dimensionDataId not in self.dimensionEntriesDone[dimension]:
                # Next look in the Registry.
                dimensionEntryDict = self.butler.registry.findDimensionEntry(dimension, dimensionDataId)
                if dimensionEntryDict is None:
                    if dimension.name in ("visit", "exposure"):
                        # Add the entry into the Registry.
                        self.butler.registry.addDimensionEntry(dimension, dimensionDataId)
                    else:
                        raise LookupError(
                            f"Entry for {dimension.name} with ID {dimensionDataId} not found; must be "
                            f"present in Registry prior to ingest."
                        )
                # Record that we've handled this entry.
                self.dimensionEntriesDone[dimension].add(dimensionDataId)
        # Do this after the loop to ensure all the dimensions are added.
        if self.config.doAddRegions:
            region = self.buildRegion(headers)
            try:
                self.butler.registry.setDimensionRegion(DataId(fullDataId,
                                                               dimensions=['visit', 'detector',
                                                                           'instrument'],
                                                               region=region),
                                                        update=False)
                self.visitRegions.setdefault((fullDataId['instrument'], fullDataId['visit']),
                                             []).extend(region.getVertices())
            except IntegrityError:
                # This means that there were already regions for the
                # dimensions in the database, and nothing should be done.
                pass

        return headers, fullDataId

    def ingestFile(self, file, headers, dataId, run=None):
        """Ingest a single raw file into the repository.

        All necessary Dimension entries must already be present.

        Parameters
        ----------
        file : `str` or path-like object
            Absolute path to the file to be ingested.
        headers : `list` of `~lsst.daf.base.PropertyList`
            Result of calling `readHeaders`.
        dataId : `dict`
            Data ID dictionary, as returned by `extractDataId`.
        run : `~lsst.daf.butler.Run`, optional
            Run to add the Dataset to; defaults to ``self.butler.run``.

        Returns
        -------
        ref : `DatasetRef`
            Reference to the ingested dataset.

        Raises
        ------
        ConflictingDefinitionError
            Raised if the dataset already exists in the registry.
        """
        if run is not None and run != self.butler.run:
            butler = Butler(butler=self.butler, run=run)
        else:
            butler = self.butler
        try:
            return butler.ingest(file, self.datasetType, dataId, transfer=self.config.transfer,
                                 formatter=self.getFormatter(file, headers, dataId))
        except ConflictingDefinitionError as err:
            raise IngestConflictError("Ingest conflict on {} {}".format(file, dataId)) from err

    def processFile(self, file):
        """Ingest a single raw data file after extracting metadata.

        This creates any new exposure or visit Dimension entries needed to
        identify the ingested file, creates a new Dataset entry in the
        Registry and finally ingests the file itself into the Datastore.
        Any needed instrument, detector, and physical_filter Dimension
        entries must exist in the Registry before `run` is called.

        Parameters
        ----------
        file : `str` or path-like object
            Absolute path to the file to be ingested.
        """
        try:
            headers, dataId = self.ensureDimensions(file)
        except Exception as err:
            raise RuntimeError(f"Unexpected error adding dimensions for {file}.") from err
        # We want ingesting a single file to be atomic even if we are
        # not trying to ingest the list of files atomically.
        with self.butler.transaction():
            try:
                self.ingestFile(file, headers, dataId)
                return
            except IngestConflictError:
                if self.config.conflict == "fail":
                    raise
        if self.config.conflict == "ignore":
            if self.stashRun is not None:
                if self.stashRun.id is None:
                    self.butler.registry.ensureRun(self.stashRun)
                self.log.infof("Conflict on {} ({}); ingesting to stash '{}' instead.",
                               dataId, file, self.config.stash)
                with self.butler.transaction():
                    self.ingestFile(file, headers, dataId, run=self.stashRun)
            else:
                self.log.infof("Conflict on {} ({}); ignoring.", dataId, file)

    def extractDataId(self, file, headers, obsInfo):
        """Return the Data ID dictionary that should be used to label a file.

        Parameters
        ----------
        file : `str` or path-like object
            Absolute path to the file being ingested (prior to any transfers).
        headers : `list` of `~lsst.daf.base.PropertyList`
            All headers returned by `readHeaders()`.
        obsInfo : `astro_metadata_translator.ObservationInfo`
            Observational metadata extracted from the headers.

        Returns
        -------
        dataId : `DataId`
            A mapping whose key-value pairs uniquely identify raw datasets.
            Must have ``dataId.dimensions() <= self.dimensions``, with at
            least instrument, exposure, and detector present.
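
        Examples
        --------
        For a visit-based instrument the returned mapping pairs dimension
        names with the extracted metadata, along the lines of (all values
        below are illustrative)::

            {"instrument": "ExampleCam", "exposure": 12345, "visit": 12345,
             "detector": 42, "physical_filter": "r_example"}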
        """
        toRemove = set()
        if obsInfo.visit_id is None:
            toRemove.add("visit")
        if obsInfo.physical_filter is None:
            toRemove.add("physical_filter")
        if toRemove:
            dimensions = self.dimensions.toSet().difference(toRemove)
        else:
            dimensions = self.dimensions
        dataId = DataId(
            dimensions=dimensions,
            instrument=obsInfo.instrument,
            exposure=obsInfo.exposure_id,
            visit=obsInfo.visit_id,
            detector=obsInfo.detector_num,
            physical_filter=obsInfo.physical_filter,
        )
        updateExposureEntryFromObsInfo(dataId, obsInfo)
        if obsInfo.visit_id is not None:
            updateVisitEntryFromObsInfo(dataId, obsInfo)
        return dataId

    def getFormatter(self, file, headers, dataId):
        """Return the Formatter that should be used to read this file after
        ingestion.

        The default implementation obtains the formatter from the Instrument
        class for the given data ID.
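
        Examples
        --------
        A subclass could, as a sketch, bypass the Instrument lookup and
        always return a fixed formatter (``MyRawFormatter`` is
        hypothetical)::

            def getFormatter(self, file, headers, dataId):
                return MyRawFormatter()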
        """
        instrument = self.instrumentCache.get(dataId["instrument"])
        if instrument is None:
            instrument = Instrument.factories[dataId["instrument"]]()
            self.instrumentCache[dataId["instrument"]] = instrument
        return instrument.getRawFormatter(dataId)