lsst.obs.base 20.0.0-54-gba713e9+a7d430d1e1
ingest.py
# This file is part of obs_base.
#
# Developed for the LSST Data Management System.
# This product includes software developed by the LSST Project
# (https://www.lsst.org).
# See the COPYRIGHT file at the top-level directory of this distribution
# for details of code ownership.
#
# This program is free software: you can redistribute it and/or modify
# it under the terms of the GNU General Public License as published by
# the Free Software Foundation, either version 3 of the License, or
# (at your option) any later version.
#
# This program is distributed in the hope that it will be useful,
# but WITHOUT ANY WARRANTY; without even the implied warranty of
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
# GNU General Public License for more details.
#
# You should have received a copy of the GNU General Public License
# along with this program. If not, see <http://www.gnu.org/licenses/>.


__all__ = ("RawIngestTask", "RawIngestConfig", "makeTransferChoiceField")

import os.path
from dataclasses import dataclass, InitVar
from typing import List, Iterator, Iterable, Tuple, Type, Optional, Any
from collections import defaultdict
from multiprocessing import Pool

from astro_metadata_translator import ObservationInfo, fix_header, merge_headers
from lsst.afw.fits import readMetadata
from lsst.daf.butler import (
    Butler,
    CollectionType,
    DataCoordinate,
    DatasetRef,
    DatasetType,
    DimensionRecord,
    DimensionUniverse,
    FileDataset,
    Formatter,
)
from lsst.pex.config import Config, ChoiceField
from lsst.pipe.base import Task

from ._instrument import Instrument, makeExposureRecordFromObsInfo
from ._fitsRawFormatterBase import FitsRawFormatterBase


@dataclass
class RawFileDatasetInfo:
    """Structure that holds information about a single dataset within a
    raw file.
    """

    dataId: DataCoordinate
    """Data ID for this file (`lsst.daf.butler.DataCoordinate`).
    """

    obsInfo: ObservationInfo
    """Standardized observation metadata extracted directly from the file
    headers (`astro_metadata_translator.ObservationInfo`).
    """


@dataclass
class RawFileData:
    """Structure that holds information about a single raw file, used during
    ingest.
    """

    datasets: List[RawFileDatasetInfo]
    """The information describing each dataset within this raw file
    (`list` of `RawFileDatasetInfo`).
    """

    filename: str
    """Name of the file this information was extracted from (`str`).

    This is the path prior to ingest, not the path after ingest.
    """

    FormatterClass: Type[FitsRawFormatterBase]
    """Formatter class that should be used to ingest this file (`type`; a
    subclass of `FitsRawFormatterBase`).
    """

    instrumentClass: Optional[Type[Instrument]]
    """The `Instrument` class associated with this file. Can be `None`
    if ``datasets`` is an empty list.
    """


@dataclass
class RawExposureData:
    """Structure that holds information about a complete raw exposure, used
    during ingest.
    """

    dataId: DataCoordinate
    """Data ID for this exposure (`lsst.daf.butler.DataCoordinate`).
    """

    files: List[RawFileData]
    """List of structures containing file-level information.
    """

    universe: InitVar[DimensionUniverse]
    """Set of all known dimensions.
    """

    record: Optional[DimensionRecord] = None
    """The exposure `DimensionRecord` that must be inserted into the
    `~lsst.daf.butler.Registry` prior to file-level ingest (`DimensionRecord`).
    """

    def __post_init__(self, universe: DimensionUniverse):
        # We don't care which file or dataset we read metadata from, because
        # we're assuming they'll all be the same; just use the first ones.
        self.record = makeExposureRecordFromObsInfo(self.files[0].datasets[0].obsInfo, universe)


def makeTransferChoiceField(doc="How to transfer files (None for no transfer).", default="auto"):
    """Create a Config field with options for how to transfer files between
    data repositories.

    The allowed options for the field are exactly those supported by
    `lsst.daf.butler.Datastore.ingest`.

    Parameters
    ----------
    doc : `str`
        Documentation for the configuration field.
    default : `str`, optional
        Default transfer mode for the field.

    Returns
    -------
    field : `lsst.pex.config.ChoiceField`
        Configuration field.
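
    Examples
    --------
    A minimal sketch of defining a config field with a non-default transfer
    mode; ``MyIngestConfig`` is a hypothetical class, not part of this
    module::

        class MyIngestConfig(Config):
            transfer = makeTransferChoiceField(default="symlink")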
139  """
    return ChoiceField(
        doc=doc,
        dtype=str,
        allowed={"move": "move",
                 "copy": "copy",
                 "auto": "choice will depend on datastore",
                 "link": "hard link falling back to symbolic link",
                 "hardlink": "hard link",
                 "symlink": "symbolic (soft) link",
                 "relsymlink": "relative symbolic link",
                 },
        optional=True,
        default=default,
    )


class RawIngestConfig(Config):
    transfer = makeTransferChoiceField()


class RawIngestTask(Task):
    """Driver Task for ingesting raw data into Gen3 Butler repositories.

    Parameters
    ----------
    config : `RawIngestConfig`
        Configuration for the task.
    butler : `~lsst.daf.butler.Butler`
        Writeable butler instance, with ``butler.run`` set to the appropriate
        `~lsst.daf.butler.CollectionType.RUN` collection for these raw
        datasets.
    **kwargs
        Additional keyword arguments are forwarded to the `lsst.pipe.base.Task`
        constructor.

    Notes
    -----
    Each instance of `RawIngestTask` writes to the same Butler. Each
    invocation of `RawIngestTask.run` ingests a list of files.
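
    Examples
    --------
    A hedged sketch of typical use; the repository path and file names are
    placeholders::

        butler = Butler("/path/to/repo", writeable=True)
        task = RawIngestTask(config=RawIngestConfig(), butler=butler)
        refs = task.run(["raw_0001.fits", "raw_0002.fits"])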
179  """

    ConfigClass = RawIngestConfig

    _DefaultName = "ingest"

    def getDatasetType(self):
        """Return the DatasetType of the datasets ingested by this Task.
        """
        return DatasetType("raw", ("instrument", "detector", "exposure"), "Exposure",
                           universe=self.butler.registry.dimensions)

    def __init__(self, config: Optional[RawIngestConfig] = None, *, butler: Butler, **kwargs: Any):
        config.validate()  # Not a CmdlineTask nor PipelineTask, so have to validate the config here.
        super().__init__(config, **kwargs)
        self.butler = butler
        self.universe = self.butler.registry.dimensions
        self.datasetType = self.getDatasetType()

        # Import all the instrument classes so that we ensure that we
        # have all the relevant metadata translators loaded.
        Instrument.importAll(self.butler.registry)

    @classmethod
    # WARNING: this method hardcodes the parameters to pipe.base.Task.__init__.
    # Nobody seems to know a way to delegate them to Task code.
    def _makeTask(cls, config: RawIngestConfig, butler: Butler, name: str, parentTask: Task):
        """Construct a RawIngestTask using only positional arguments.

        Parameters
        ----------
        All parameters are as for `RawIngestTask`.
        """
        return cls(config=config, butler=butler, name=name, parentTask=parentTask)

    # Overrides Task.__reduce__ so that instances (and the bound methods
    # handed to multiprocessing) can be pickled.
    def __reduce__(self):
        return (self._makeTask, (self.config, self.butler, self._name, self._parentTask))

    def extractMetadata(self, filename: str) -> RawFileData:
        """Extract and process metadata from a single raw file.

        Parameters
        ----------
        filename : `str`
            Path to the file.

        Returns
        -------
        data : `RawFileData`
            A structure containing the metadata extracted from the file,
            as well as the original filename. All fields will be populated,
            but the data IDs in `RawFileData.datasets` will be minimal
            (unexpanded) `DataCoordinate` instances.

        Notes
        -----
        Assumes that there is a single dataset associated with the given
        file. Instruments using a single file to store multiple datasets
        must implement their own version of this method.
        """
        # We do not want to stop ingest if we are given a bad file.
        # Instead return a RawFileData with no datasets and allow
        # the caller to report the failure.

        try:
            # Manually merge the primary and "first data" headers here because
            # we do not know in general if an input file has set INHERIT=T.
            phdu = readMetadata(filename, 0)
            header = merge_headers([phdu, readMetadata(filename)], mode="overwrite")
            fix_header(header)
            datasets = [self._calculate_dataset_info(header, filename)]
        except Exception as e:
            self.log.debug("Problem extracting metadata from %s: %s", filename, e)
            # Indicate to the caller that we failed to read.
            datasets = []
            FormatterClass = Formatter
            instrument = None
        else:
            self.log.debug("Extracted metadata from file %s", filename)
            # The data model currently assumes that whilst multiple datasets
            # can be associated with a single file, they must all share the
            # same formatter.
            try:
                instrument = Instrument.fromName(datasets[0].dataId["instrument"], self.butler.registry)
            except LookupError:
                self.log.warning("Instrument %s for file %s not known to registry",
                                 datasets[0].dataId["instrument"], filename)
                datasets = []
                FormatterClass = Formatter
                instrument = None
            else:
                FormatterClass = instrument.getRawFormatter(datasets[0].dataId)

        return RawFileData(datasets=datasets, filename=filename,
                           FormatterClass=FormatterClass,
                           instrumentClass=instrument)

    def _calculate_dataset_info(self, header, filename):
        """Calculate a RawFileDatasetInfo from the supplied information.

        Parameters
        ----------
        header : `Mapping`
            Header from the dataset.
        filename : `str`
            Filename to use for error messages.

        Returns
        -------
        dataset : `RawFileDatasetInfo`
            The dataId and observation information associated with this
            dataset.
        """
        obsInfo = ObservationInfo(header)
        dataId = DataCoordinate.standardize(instrument=obsInfo.instrument,
                                            exposure=obsInfo.exposure_id,
                                            detector=obsInfo.detector_num,
                                            universe=self.universe)
        return RawFileDatasetInfo(obsInfo=obsInfo, dataId=dataId)

    def groupByExposure(self, files: Iterable[RawFileData]) -> List[RawExposureData]:
        """Group an iterable of `RawFileData` by exposure.

        Parameters
        ----------
        files : iterable of `RawFileData`
            File-level information to group.

        Returns
        -------
        exposures : `list` of `RawExposureData`
            A list of structures that group the file-level information by
            exposure. All fields will be populated. The
            `RawExposureData.dataId` attributes will be minimal (unexpanded)
            `DataCoordinate` instances.
        """
        exposureDimensions = self.universe["exposure"].graph
        byExposure = defaultdict(list)
        for f in files:
            # Assume that the first dataset is representative for the file.
            byExposure[f.datasets[0].dataId.subset(exposureDimensions)].append(f)

        return [RawExposureData(dataId=dataId, files=exposureFiles, universe=self.universe)
                for dataId, exposureFiles in byExposure.items()]

    def expandDataIds(self, data: RawExposureData) -> RawExposureData:
        """Expand the data IDs associated with a raw exposure to include
        additional metadata records.

        Parameters
        ----------
        data : `RawExposureData`
            A structure containing information about the exposure to be
            ingested. Must have `RawExposureData.record` populated. Should
            be considered consumed upon return.

        Returns
        -------
        exposure : `RawExposureData`
            An updated version of the input structure, with
            `RawExposureData.dataId` and nested `RawFileDatasetInfo.dataId`
            attributes updated to data IDs for which
            `DataCoordinate.hasRecords` returns `True`.
        """
        # We start by expanding the exposure-level data ID; we won't use that
        # directly in file ingest, but this lets us do some database lookups
        # once per exposure instead of once per file later.
        data.dataId = self.butler.registry.expandDataId(
            data.dataId,
            # We pass in the records we'll be inserting shortly so they aren't
            # looked up from the database. We do expect instrument and filter
            # records to be retrieved from the database here (though the
            # Registry may cache them so there isn't a lookup every time).
            records={
                self.butler.registry.dimensions["exposure"]: data.record,
            }
        )
        # Now we expand the per-file (exposure+detector) data IDs. This time
        # we pass in the records we just retrieved from the exposure data ID
        # expansion.
        for file in data.files:
            for dataset in file.datasets:
                dataset.dataId = self.butler.registry.expandDataId(
                    dataset.dataId,
                    records=dict(data.dataId.records)
                )
        return data

    def prep(self, files, *, pool: Optional[Pool] = None, processes: int = 1
             ) -> Tuple[Iterator[RawExposureData], List[str]]:
        """Perform all ingest preprocessing steps that do not involve actually
        modifying the database.

        Parameters
        ----------
        files : iterable over `str` or path-like objects
            Paths to the files to be ingested. Will be made absolute
            if they are not already.
        pool : `multiprocessing.Pool`, optional
            If not `None`, a process pool with which to parallelize some
            operations.
        processes : `int`, optional
            The number of processes to use. Ignored if ``pool`` is not `None`.

        Returns
        -------
        exposures : iterator of `RawExposureData`
            Data structures containing dimension records, filenames, and data
            IDs to be ingested (one structure for each exposure).
        bad_files : `list` of `str`
            List of all the files from which metadata could not be extracted.
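
        Examples
        --------
        A sketch of how callers unpack the returned pair; ``task`` and
        ``files`` are placeholders::

            exposures, bad_files = task.prep(files)
            for exposure in exposures:
                print(exposure.dataId)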
391  """
        if pool is None and processes > 1:
            pool = Pool(processes)
        mapFunc = map if pool is None else pool.imap_unordered

        # Extract metadata and build per-detector regions.
        # This could run in a subprocess so collect all output
        # before looking at failures.
        fileData: Iterator[RawFileData] = mapFunc(self.extractMetadata, files)

        # Filter out all the failed reads and store them for later
        # reporting.
        good_files = []
        bad_files = []
        for fileDatum in fileData:
            if not fileDatum.datasets:
                bad_files.append(fileDatum.filename)
            else:
                good_files.append(fileDatum)
        fileData = good_files

        self.log.info("Successfully extracted metadata from %d file%s with %d failure%s",
                      len(fileData), "" if len(fileData) == 1 else "s",
                      len(bad_files), "" if len(bad_files) == 1 else "s")

        # Use that metadata to group files (and extracted metadata) by
        # exposure. Never parallelized because it's intrinsically a gather
        # step.
        exposureData: List[RawExposureData] = self.groupByExposure(fileData)

        # The next operation operates on RawExposureData instances (one at
        # a time) in-place and then returns the modified instance. We call it
        # as a pass-through instead of relying on the arguments we pass in to
        # have been modified because in the parallel case those arguments are
        # going to be pickled and unpickled, and I'm not certain
        # multiprocessing is careful enough with that for output arguments to
        # work.

        # Expand the data IDs to include all dimension metadata; we need this
        # because we may need to generate path templates that rely on that
        # metadata.
        # This is the first step that involves actual database calls (but just
        # SELECTs), so if there's going to be a problem with connections vs.
        # multiple processes, or lock contention (in SQLite) slowing things
        # down, it'll happen here.
        return mapFunc(self.expandDataIds, exposureData), bad_files

    def ingestExposureDatasets(self, exposure: RawExposureData, *, run: Optional[str] = None
                               ) -> List[DatasetRef]:
        """Ingest all raw files in one exposure.

        Parameters
        ----------
        exposure : `RawExposureData`
            A structure containing information about the exposure to be
            ingested. Must have `RawExposureData.record` populated and all
            data ID attributes expanded.
        run : `str`, optional
            Name of a RUN-type collection to write to, overriding
            ``self.butler.run``.

        Returns
        -------
        refs : `list` of `lsst.daf.butler.DatasetRef`
            Dataset references for ingested raws.
        """
        datasets = [FileDataset(path=os.path.abspath(file.filename),
                                refs=[DatasetRef(self.datasetType, d.dataId) for d in file.datasets],
                                formatter=file.FormatterClass)
                    for file in exposure.files]
        self.butler.ingest(*datasets, transfer=self.config.transfer, run=run)
        return [ref for dataset in datasets for ref in dataset.refs]

    def run(self, files, *, pool: Optional[Pool] = None, processes: int = 1, run: Optional[str] = None):
        """Ingest files into a Butler data repository.

        This creates any new exposure or visit Dimension entries needed to
        identify the ingested files, creates new Dataset entries in the
        Registry and finally ingests the files themselves into the Datastore.
        Any needed instrument, detector, and physical_filter Dimension entries
        must exist in the Registry before `run` is called.

        Parameters
        ----------
        files : iterable over `str` or path-like objects
            Paths to the files to be ingested. Will be made absolute
            if they are not already.
        pool : `multiprocessing.Pool`, optional
            If not `None`, a process pool with which to parallelize some
            operations.
        processes : `int`, optional
            The number of processes to use. Ignored if ``pool`` is not `None`.
        run : `str`, optional
            Name of a RUN-type collection to write to, overriding
            the default derived from the instrument name.

        Returns
        -------
        refs : `list` of `lsst.daf.butler.DatasetRef`
            Dataset references for ingested raws.

        Notes
        -----
        This method inserts all datasets for an exposure within a transaction,
        guaranteeing that partial exposures are never ingested. The exposure
        dimension record is inserted with `Registry.syncDimensionData` first
        (in its own transaction), which inserts only if a record with the same
        primary key does not already exist. This allows different files within
        the same exposure to be ingested in different runs.
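
        Examples
        --------
        A hedged sketch of parallelized ingest; the file names are
        placeholders::

            with Pool(4) as pool:
                refs = task.run(["raw_0001.fits", "raw_0002.fits"], pool=pool)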
500  """
        exposureData, bad_files = self.prep(files, pool=pool, processes=processes)
        # Up to this point, we haven't modified the data repository at all.
        # Now we finally do that, with one transaction per exposure. This is
        # not parallelized at present because the performance of this step is
        # limited by the database server. That may or may not change in the
        # future once we increase our usage of bulk inserts and reduce our
        # usage of savepoints; we've tried to get everything but the database
        # operations done in advance to reduce the time spent inside
        # transactions.
        self.butler.registry.registerDatasetType(self.datasetType)
        refs = []
        runs = set()
        n_exposures = 0
        n_exposures_failed = 0
        n_ingests_failed = 0
        for exposure in exposureData:

            self.log.debug("Attempting to ingest %d file%s from exposure %s:%s",
                           len(exposure.files), "" if len(exposure.files) == 1 else "s",
                           exposure.record.instrument, exposure.record.name)

            try:
                self.butler.registry.syncDimensionData("exposure", exposure.record)
            except Exception as e:
                n_exposures_failed += 1
                self.log.warning("Exposure %s:%s could not be registered: %s",
                                 exposure.record.instrument, exposure.record.name, e)
                continue

            # Use the instrument's default run if none was specified
            # explicitly.
            if run is None:
                instrumentClass = exposure.files[0].instrumentClass
                this_run = instrumentClass.makeDefaultRawIngestRunName()
            else:
                this_run = run
            if this_run not in runs:
                self.butler.registry.registerCollection(this_run, type=CollectionType.RUN)
                runs.add(this_run)
            try:
                with self.butler.transaction():
                    refs.extend(self.ingestExposureDatasets(exposure, run=this_run))
            except Exception as e:
                n_ingests_failed += 1
                self.log.warning("Failed to ingest the following for reason: %s", e)
                for f in exposure.files:
                    self.log.warning("- %s", f.filename)
                continue

            # Success for this exposure.
            n_exposures += 1
            self.log.info("Exposure %s:%s ingested successfully",
                          exposure.record.instrument, exposure.record.name)

        had_failure = False

        if bad_files:
            had_failure = True
            self.log.warning("Could not extract observation metadata from the following:")
            for f in bad_files:
                self.log.warning("- %s", f)

        self.log.info("Successfully processed data from %d exposure%s with %d failure%s from exposure"
                      " registration and %d failure%s from file ingest.",
                      n_exposures, "" if n_exposures == 1 else "s",
                      n_exposures_failed, "" if n_exposures_failed == 1 else "s",
                      n_ingests_failed, "" if n_ingests_failed == 1 else "s")
        if n_exposures_failed > 0 or n_ingests_failed > 0:
            had_failure = True
        self.log.info("Ingested %d distinct Butler dataset%s",
                      len(refs), "" if len(refs) == 1 else "s")

        if had_failure:
            raise RuntimeError("Some failures encountered during ingestion")

        return refs