lsst.obs.base  18.0.0-5-ga900cbd
ingest.py
# This file is part of obs_base.
#
# Developed for the LSST Data Management System.
# This product includes software developed by the LSST Project
# (http://www.lsst.org).
# See the COPYRIGHT file at the top-level directory of this distribution
# for details of code ownership.
#
# This program is free software: you can redistribute it and/or modify
# it under the terms of the GNU General Public License as published by
# the Free Software Foundation, either version 3 of the License, or
# (at your option) any later version.
#
# This program is distributed in the hope that it will be useful,
# but WITHOUT ANY WARRANTY; without even the implied warranty of
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
# GNU General Public License for more details.
#
# You should have received a copy of the GNU General Public License
# along with this program. If not, see <http://www.gnu.org/licenses/>.


__all__ = ("RawIngestTask", "RawIngestConfig", "makeTransferChoiceField")

import os.path

# This should really be an error that is caught in daf_butler and re-thrown
# as one of our own, but it is not, so this import exists here pending some
# error refactoring in daf_butler.
from sqlalchemy.exc import IntegrityError

from astro_metadata_translator import ObservationInfo
from lsst.afw.image import readMetadata, bboxFromMetadata
from lsst.afw.geom import SkyWcs
from lsst.daf.butler import DatasetType, Run, DataId, ConflictingDefinitionError, Butler
from lsst.daf.butler.instrument import (Instrument, updateExposureEntryFromObsInfo,
                                        updateVisitEntryFromObsInfo)
from lsst.geom import Box2D
from lsst.pex.config import Config, Field, ChoiceField
from lsst.pipe.base import Task
from lsst.sphgeom import ConvexPolygon

class IngestConflictError(ConflictingDefinitionError):
    pass

def makeTransferChoiceField(doc="How to transfer files (None for no transfer).", default=None):
    """Create a `ChoiceField` for configuring how files are transferred
    into a Datastore.
    """
    return ChoiceField(
        doc=doc,
        dtype=str,
        allowed={"move": "move",
                 "copy": "copy",
                 "hardlink": "hard link",
                 "symlink": "symbolic (soft) link"},
        optional=True,
        default=default,
    )

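# A minimal sketch (not part of the original file) of how
# makeTransferChoiceField is typically reused in other obs_* configs; the
# class name here is hypothetical:
#
#     class MyIngestConfig(Config):
#         transfer = makeTransferChoiceField(default="symlink")
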
class RawIngestConfig(Config):
    transfer = makeTransferChoiceField()
    conflict = ChoiceField(
        ("What to do if a raw Dataset with the same data ID as an "
         "ingested file already exists in the Butler's Collection."),
        dtype=str,
        allowed={"ignore": ("Do not add the new file to the Collection. If "
                            "'stash' is not None, the new file will be "
                            "ingested into the stash Collection instead."),
                 "fail": ("Raise RuntimeError if a conflict is encountered "
                          "(which may then be caught if onError == 'continue')."),
                 },
        optional=False,
        default="ignore",
    )
    stash = Field(
        "Name of an alternate Collection to hold Datasets that lose conflicts.",
        dtype=str,
        default=None,
    )
    onError = ChoiceField(
        "What to do if an error (including fatal conflicts) occurs.",
        dtype=str,
        allowed={"continue": "Warn and continue with the next file.",
                 "break": ("Stop processing immediately, but leave "
                           "already-ingested datasets in the repository."),
                 "rollback": ("Stop processing and attempt to remove already-"
                              "ingested datasets from the repository."),
                 },
        optional=False,
        default="continue",
    )
    doAddRegions = Field(
        dtype=bool,
        default=True,
        doc="Add visit and detector regions to the Registry when ingesting files.",
    )
    padRegionAmount = Field(
        dtype=int,
        default=0,
        doc="Pad the image bounding box by this many pixels before computing its region.",
    )

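# A hedged usage sketch (not part of the original file; values are
# hypothetical): configure ingest to symlink files, pad regions, and roll
# everything back if any file fails.
#
#     config = RawIngestConfig()
#     config.transfer = "symlink"
#     config.onError = "rollback"
#     config.padRegionAmount = 50
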
class RawIngestTask(Task):
    """Driver Task for ingesting raw data into Gen3 Butler repositories.

    This Task is intended to be runnable from the command-line, but it doesn't
    meet the other requirements of CmdLineTask or PipelineTask, and wouldn't
    gain much from being one.  It also wouldn't really be appropriate as a
    subtask of a CmdLineTask or PipelineTask; it's a Task essentially just to
    leverage the logging and configurability functionality that the Task base
    class provides.

    Each instance of `RawIngestTask` writes to the same Butler and maintains a
    cache of Dimension entries that have already been added to or extracted
    from its Registry.  Each invocation of `RawIngestTask.run` ingests a list
    of files (possibly semi-atomically; see `RawIngestConfig.onError`).

    RawIngestTask may be subclassed to specialize ingest for the actual
    structure of raw data files produced by a particular instrument, but this
    is usually unnecessary because the instrument-specific header extraction
    provided by ``astro_metadata_translator`` is generally sufficient.

    Parameters
    ----------
    config : `RawIngestConfig`
        Configuration for whether/how to transfer files and how to handle
        conflicts and errors.
    butler : `~lsst.daf.butler.Butler`
        Butler instance.  Ingested Datasets will be created as part of
        ``butler.run`` and associated with its Collection.

    Other keyword arguments are forwarded to the Task base class constructor.
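
    Examples
    --------
    A minimal sketch of typical use, assuming an existing Gen3 repository;
    the repository path, run name, and file names are hypothetical, and the
    exact `Butler` construction may differ across daf_butler versions::

        butler = Butler("/path/to/repo", run="raw")
        task = RawIngestTask(config=RawIngestConfig(), butler=butler)
        task.run(["raw_0001.fits", "raw_0002.fits"])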
    """

    ConfigClass = RawIngestConfig

    _DefaultName = "ingest"

    def getDatasetType(self):
        """Return the DatasetType of the Datasets ingested by this Task.
        """
        return DatasetType("raw", ("instrument", "detector", "exposure"), "Exposure",
                           universe=self.butler.registry.dimensions)

    def __init__(self, config=None, *, butler, **kwds):
        super().__init__(config, **kwds)
        self.butler = butler
        self.datasetType = self.getDatasetType()
        self.dimensions = butler.registry.dimensions.extract(["instrument", "detector", "physical_filter",
                                                              "visit", "exposure"])
        # Dictionary of {Dimension: set(DataId)} indicating Dimension entries
        # we know are in the Registry.
        self.dimensionEntriesDone = {k: set() for k in self.dimensions}
        # Cache of instrument instances retrieved from Registry; needed to
        # look up formatters.
        self.instrumentCache = {}
        # (Possibly) create a Run object for the "stash": where we put
        # datasets that lose conflicts.  Note that this doesn't actually add
        # this Run to the Registry; we only do that on first use.
        self.stashRun = Run(self.config.stash) if self.config.stash is not None else None
        self.visitRegions = {}

    def _addVisitRegions(self):
        """Add regions associated with Visits to the Registry.

        Visit regions are computed from the per-detector regions accumulated
        in the ``visitRegions`` dict on ``self``, joined with an existing
        region if one exists.  The dict is keyed by ``(instrument, visit)``
        tuples, and each value is a list of region vertices for the detectors
        associated with that visit.
        """
        for (instrument, visit), vertices in self.visitRegions.items():
            # If there is an existing region it should be updated
            existingRegion = self.butler.registry.expandDataId({"instrument": instrument, "visit": visit},
                                                               region=True).region
            if existingRegion is not None:
                vertices = list(existingRegion.getVertices()) + vertices
            region = ConvexPolygon(vertices)
            self.butler.registry.setDimensionRegion(instrument=instrument, visit=visit, region=region)

    def run(self, files):
        """Ingest files into a Butler data repository.

        This creates any new exposure or visit Dimension entries needed to
        identify the ingested files, creates new Dataset entries in the
        Registry and finally ingests the files themselves into the Datastore.
        Any needed instrument, detector, and physical_filter Dimension
        entries must exist in the Registry before `run` is called.

        Parameters
        ----------
        files : iterable over `str` or path-like objects
            Paths to the files to be ingested.  Will be made absolute
            if they are not already.
        """
        self.butler.registry.registerDatasetType(self.getDatasetType())
        if self.config.onError == "rollback":
            with self.butler.transaction():
                for file in files:
                    self.processFile(os.path.abspath(file))
                if self.config.doAddRegions:
                    self._addVisitRegions()
        elif self.config.onError == "break":
            for file in files:
                self.processFile(os.path.abspath(file))
            if self.config.doAddRegions:
                self._addVisitRegions()
        elif self.config.onError == "continue":
            for file in files:
                try:
                    self.processFile(os.path.abspath(file))
                except Exception as err:
                    self.log.warnf("Error processing '{}': {}", file, err)
            if self.config.doAddRegions:
                self._addVisitRegions()

    def readHeaders(self, file):
        """Read and return any relevant headers from the given file.

        The default implementation simply reads the header of the first
        non-empty HDU, so it always returns a single-element list.

        Parameters
        ----------
        file : `str` or path-like object
            Absolute path to the file to be ingested.

        Returns
        -------
        headers : `list` of `~lsst.daf.base.PropertyList`
            Single-element list containing the header of the first
            non-empty HDU.
        """
        return [readMetadata(file)]

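    # A hedged sketch (not part of the original file) of overriding
    # readHeaders in a subclass for an instrument that splits metadata
    # across the primary and first extension HDUs; readMetadata's ``hdu``
    # argument is assumed to be available here:
    #
    #     def readHeaders(self, file):
    #         return [readMetadata(file, hdu=0), readMetadata(file, hdu=1)]
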
    def buildRegion(self, headers):
        """Build a region from information contained in a header.

        Parameters
        ----------
        headers : `list` of `~lsst.daf.base.PropertyList`
            Property lists containing the information from the headers of
            one file; only the first element is used by the default
            implementation.

        Returns
        -------
        region : `lsst.sphgeom.ConvexPolygon`
            Region on the sky covered by the (optionally padded) detector
            bounding box.

        Raises
        ------
        ValueError
            Raised if the header keys required to construct the region
            cannot be found.
        """
        # Default implementation is for headers to be a one element list
        header = headers[0]
        wcs = SkyWcs(header)
        bbox = Box2D(bboxFromMetadata(header))
        if self.config.padRegionAmount > 0:
            bbox.grow(self.config.padRegionAmount)
        corners = bbox.getCorners()
        sphCorners = [wcs.pixelToSky(point).getVector() for point in corners]
        return ConvexPolygon(sphCorners)

    def ensureDimensions(self, file):
        """Extract metadata from a raw file and add exposure and visit
        Dimension entries.

        Any needed instrument, detector, and physical_filter Dimension
        entries must exist in the Registry before `run` is called.

        Parameters
        ----------
        file : `str` or path-like object
            Absolute path to the file to be ingested.

        Returns
        -------
        headers : `list` of `~lsst.daf.base.PropertyList`
            Result of calling `readHeaders`.
        dataId : `DataId`
            Data ID dictionary, as returned by `extractDataId`.
        """
        headers = self.readHeaders(file)
        obsInfo = ObservationInfo(headers[0])

        # Extract a DataId that covers all of self.dimensions.
        fullDataId = self.extractDataId(file, headers, obsInfo=obsInfo)

        for dimension in self.dimensions:
            if fullDataId.get(dimension.name) is None:
                continue
            dimensionDataId = DataId(fullDataId, dimension=dimension)
            if dimensionDataId not in self.dimensionEntriesDone[dimension]:
                # Next look in the Registry
                dimensionEntryDict = self.butler.registry.findDimensionEntry(dimension, dimensionDataId)
                if dimensionEntryDict is None:
                    if dimension.name in ("visit", "exposure"):
                        # Add the entry into the Registry.
                        self.butler.registry.addDimensionEntry(dimension, dimensionDataId)
                    else:
                        raise LookupError(
                            f"Entry for {dimension.name} with ID {dimensionDataId} not found; must be "
                            f"present in Registry prior to ingest."
                        )
                # Record that we've handled this entry.
                self.dimensionEntriesDone[dimension].add(dimensionDataId)
        # Do this after the loop to ensure all the dimensions are added
        if self.config.doAddRegions:
            region = self.buildRegion(headers)
            try:
                self.butler.registry.setDimensionRegion(DataId(fullDataId,
                                                               dimensions=['visit', 'detector', 'instrument'],
                                                               region=region),
                                                        update=False)
                self.visitRegions.setdefault((fullDataId['instrument'], fullDataId['visit']),
                                             []).extend(region.getVertices())
            except IntegrityError:
                # This means that there were already regions for the
                # dimensions in the database, and nothing should be done.
                pass

        return headers, fullDataId

    def ingestFile(self, file, headers, dataId, run=None):
        """Ingest a single raw file into the repository.

        All necessary Dimension entries must already be present.

        Parameters
        ----------
        file : `str` or path-like object
            Absolute path to the file to be ingested.
        headers : `list` of `~lsst.daf.base.PropertyList`
            Result of calling `readHeaders`.
        dataId : `dict`
            Data ID dictionary, as returned by `extractDataId`.
        run : `~lsst.daf.butler.Run`, optional
            Run to add the Dataset to; defaults to ``self.butler.run``.

        Returns
        -------
        ref : `DatasetRef`
            Reference to the ingested dataset.

        Raises
        ------
        ConflictingDefinitionError
            Raised if the dataset already exists in the registry.
        """
        if run is not None and run != self.butler.run:
            butler = Butler(butler=self.butler, run=run)
        else:
            butler = self.butler
        try:
            return butler.ingest(file, self.datasetType, dataId, transfer=self.config.transfer,
                                 formatter=self.getFormatter(file, headers, dataId))
        except ConflictingDefinitionError as err:
            raise IngestConflictError("Ingest conflict on {} {}".format(file, dataId)) from err

    def processFile(self, file):
        """Ingest a single raw data file after extracting metadata.

        This creates any new exposure or visit Dimension entries needed to
        identify the ingested file, creates a new Dataset entry in the
        Registry and finally ingests the file itself into the Datastore.
        Any needed instrument, detector, and physical_filter Dimension
        entries must exist in the Registry before `run` is called.

        Parameters
        ----------
        file : `str` or path-like object
            Absolute path to the file to be ingested.
        """
        try:
            headers, dataId = self.ensureDimensions(file)
        except Exception as err:
            raise RuntimeError(f"Unexpected error adding dimensions for {file}.") from err
        # We want ingesting a single file to be atomic even if we are
        # not trying to ingest the list of files atomically.
        with self.butler.transaction():
            try:
                self.ingestFile(file, headers, dataId)
                return
            except IngestConflictError:
                if self.config.conflict == "fail":
                    raise
        if self.config.conflict == "ignore":
            if self.stashRun is not None:
                if self.stashRun.id is None:
                    self.butler.registry.ensureRun(self.stashRun)
                self.log.infof("Conflict on {} ({}); ingesting to stash '{}' instead.",
                               dataId, file, self.config.stash)
                with self.butler.transaction():
                    self.ingestFile(file, headers, dataId, run=self.stashRun)
            else:
                self.log.infof("Conflict on {} ({}); ignoring.", dataId, file)

    def extractDataId(self, file, headers, obsInfo):
        """Return the Data ID dictionary that should be used to label a file.

        Parameters
        ----------
        file : `str` or path-like object
            Absolute path to the file being ingested (prior to any transfers).
        headers : `list` of `~lsst.daf.base.PropertyList`
            All headers returned by `readHeaders()`.
        obsInfo : `astro_metadata_translator.ObservationInfo`
            Observational metadata extracted from the headers.

        Returns
        -------
        dataId : `DataId`
            A mapping whose key-value pairs uniquely identify raw datasets.
            Must have ``dataId.dimensions() <= self.dimensions``, with at
            least instrument, exposure, and detector present.
        """
        toRemove = set()
        if obsInfo.visit_id is None:
            toRemove.add("visit")
        if obsInfo.physical_filter is None:
            toRemove.add("physical_filter")
        if toRemove:
            dimensions = self.dimensions.toSet().difference(toRemove)
        else:
            dimensions = self.dimensions
        dataId = DataId(
            dimensions=dimensions,
            instrument=obsInfo.instrument,
            exposure=obsInfo.exposure_id,
            visit=obsInfo.visit_id,
            detector=obsInfo.detector_num,
            physical_filter=obsInfo.physical_filter,
        )
        updateExposureEntryFromObsInfo(dataId, obsInfo)
        if obsInfo.visit_id is not None:
            updateVisitEntryFromObsInfo(dataId, obsInfo)
        return dataId

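    # A hedged sketch (not part of the original file): subclasses that need
    # to augment the data ID (for example, instruments whose headers lack a
    # usable detector number) can override extractDataId and adjust the
    # result:
    #
    #     def extractDataId(self, file, headers, obsInfo):
    #         dataId = super().extractDataId(file, headers, obsInfo)
    #         if dataId["detector"] is None:
    #             dataId["detector"] = 0  # hypothetical single-detector camera
    #         return dataId
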
    def getFormatter(self, file, headers, dataId):
        """Return the Formatter that should be used to read this file after
        ingestion.

        The default implementation obtains the formatter from the Instrument
        class for the given data ID.
        """
        instrument = self.instrumentCache.get(dataId["instrument"])
        if instrument is None:
            instrument = Instrument.factories[dataId["instrument"]]()
            self.instrumentCache[dataId["instrument"]] = instrument
        return instrument.getRawFormatter(dataId)
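

# A hedged sketch (not part of the original file) of specializing ingest for
# an instrument whose raw files need a custom Formatter; MyCamRawFormatter
# is hypothetical:
#
#     class MyCamRawIngestTask(RawIngestTask):
#
#         def getFormatter(self, file, headers, dataId):
#             return MyCamRawFormatter()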