lsst.obs.base  20.0.0-67-g32d6278+0d86c2e7cd
ingest.py
1 # This file is part of obs_base.
2 #
3 # Developed for the LSST Data Management System.
4 # This product includes software developed by the LSST Project
5 # (https://www.lsst.org).
6 # See the COPYRIGHT file at the top-level directory of this distribution
7 # for details of code ownership.
8 #
9 # This program is free software: you can redistribute it and/or modify
10 # it under the terms of the GNU General Public License as published by
11 # the Free Software Foundation, either version 3 of the License, or
12 # (at your option) any later version.
13 #
14 # This program is distributed in the hope that it will be useful,
15 # but WITHOUT ANY WARRANTY; without even the implied warranty of
16 # MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
17 # GNU General Public License for more details.
18 #
19 # You should have received a copy of the GNU General Public License
20 # along with this program. If not, see <http://www.gnu.org/licenses/>.
21 
22 
23 __all__ = ("RawIngestTask", "RawIngestConfig", "makeTransferChoiceField")
24 
25 import os.path
26 from dataclasses import dataclass, InitVar
27 from typing import List, Iterator, Iterable, Type, Optional, Any, Tuple
28 from collections import defaultdict
29 from multiprocessing import Pool
30 
31 from astro_metadata_translator import ObservationInfo, merge_headers
32 from lsst.afw.fits import readMetadata
33 from lsst.daf.butler import (
34  Butler,
35  CollectionType,
36  DataCoordinate,
37  DatasetRef,
38  DatasetType,
39  DimensionRecord,
40  DimensionUniverse,
41  FileDataset,
42  Formatter,
43 )
44 from lsst.pex.config import Config, ChoiceField
45 from lsst.pipe.base import Task
46 
47 from ._instrument import Instrument, makeExposureRecordFromObsInfo
48 from ._fitsRawFormatterBase import FitsRawFormatterBase
49 
50 
51 @dataclass
52 class RawFileDatasetInfo:
53  """Structure that holds information about a single dataset within a
54  raw file.
55  """
56 
57  dataId: DataCoordinate
58  """Data ID for this file (`lsst.daf.butler.DataCoordinate`).
59  """
60 
61  obsInfo: ObservationInfo
62  """Standardized observation metadata extracted directly from the file
63  headers (`astro_metadata_translator.ObservationInfo`).
64  """
65 
66 
67 @dataclass
68 class RawFileData:
69  """Structure that holds information about a single raw file, used during
70  ingest.
71  """
72 
73  datasets: List[RawFileDatasetInfo]
74  """The information describing each dataset within this raw file.
75  (`list` of `RawFileDatasetInfo`)
76  """
77 
78  filename: str
79  """Name of the file this information was extracted from (`str`).
80 
81  This is the path prior to ingest, not the path after ingest.
82  """
83 
84  FormatterClass: Type[FitsRawFormatterBase]
 85  """Formatter class that should be used to ingest this file (`type`; a
 86  subclass of `FitsRawFormatterBase`).
87  """
88 
89  instrumentClass: Optional[Type[Instrument]]
90  """The `Instrument` class associated with this file. Can be `None`
91  if ``datasets`` is an empty list."""
92 
93 
94 @dataclass
95 class RawExposureData:
96  """Structure that holds information about a complete raw exposure, used
97  during ingest.
98  """
99 
100  dataId: DataCoordinate
101  """Data ID for this exposure (`lsst.daf.butler.DataCoordinate`).
102  """
103 
104  files: List[RawFileData]
105  """List of structures containing file-level information.
106  """
107 
108  universe: InitVar[DimensionUniverse]
109  """Set of all known dimensions.
110  """
111 
112  record: Optional[DimensionRecord] = None
113  """The exposure `DimensionRecord` that must be inserted into the
114  `~lsst.daf.butler.Registry` prior to file-level ingest (`DimensionRecord`).
115  """
116 
117  def __post_init__(self, universe: DimensionUniverse):
118  # We don't care which file or dataset we read metadata from, because
119  # we're assuming they'll all be the same; just use the first ones.
120  self.record = makeExposureRecordFromObsInfo(self.files[0].datasets[0].obsInfo, universe)
121 
122 
123 def makeTransferChoiceField(doc="How to transfer files (None for no transfer).", default="auto"):
124  """Create a Config field with options for how to transfer files between
125  data repositories.
126 
127  The allowed options for the field are exactly those supported by
128  `lsst.daf.butler.Datastore.ingest`.
129 
130  Parameters
131  ----------
132  doc : `str`
133  Documentation for the configuration field.
134 
135  Returns
136  -------
137  field : `lsst.pex.config.ChoiceField`
138  Configuration field.
139  """
140  return ChoiceField(
141  doc=doc,
142  dtype=str,
143  allowed={"move": "move",
144  "copy": "copy",
145  "auto": "choice will depend on datastore",
146  "link": "hard link falling back to symbolic link",
147  "hardlink": "hard link",
148  "symlink": "symbolic (soft) link",
149  "relsymlink": "relative symbolic link",
150  },
151  optional=True,
152  default=default
153  )
154 
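# Illustrative sketch (not part of ingest.py): how the field returned by
# makeTransferChoiceField is typically consumed. RawIngestConfig below does
# exactly this; ExampleConfig is a hypothetical name used only here.
#
#     class ExampleConfig(Config):
#         transfer = makeTransferChoiceField()
#
#     cfg = ExampleConfig()
#     cfg.transfer = "symlink"   # must be one of the allowed keys; None disables transfer
#     cfg.validate()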
155 
156 class RawIngestConfig(Config):
 157  transfer = makeTransferChoiceField()
158 
159 
160 class RawIngestTask(Task):
161  """Driver Task for ingesting raw data into Gen3 Butler repositories.
162 
163  Parameters
164  ----------
165  config : `RawIngestConfig`
166  Configuration for the task.
167  butler : `~lsst.daf.butler.Butler`
168  Writeable butler instance, with ``butler.run`` set to the appropriate
169  `~lsst.daf.butler.CollectionType.RUN` collection for these raw
170  datasets.
171  **kwargs
172  Additional keyword arguments are forwarded to the `lsst.pipe.base.Task`
173  constructor.
174 
175  Notes
176  -----
177  Each instance of `RawIngestTask` writes to the same Butler. Each
178  invocation of `RawIngestTask.run` ingests a list of files.
179  """
180 
181  ConfigClass = RawIngestConfig
182 
183  _DefaultName = "ingest"
184 
185  def getDatasetType(self):
186  """Return the DatasetType of the datasets ingested by this Task.
187  """
188  return DatasetType("raw", ("instrument", "detector", "exposure"), "Exposure",
189  universe=self.butler.registry.dimensions)
190 
191  def __init__(self, config: Optional[RawIngestConfig] = None, *, butler: Butler, **kwargs: Any):
192  config.validate() # Not a CmdlineTask nor PipelineTask, so have to validate the config here.
193  super().__init__(config, **kwargs)
194  self.butler = butler
195  self.universe = self.butler.registry.dimensions
 196  self.datasetType = self.getDatasetType()
197 
198  # Import all the instrument classes so that we ensure that we
199  # have all the relevant metadata translators loaded.
200  Instrument.importAll(self.butler.registry)
201 
202  def _reduce_kwargs(self):
203  # Add extra parameters to pickle
204  return dict(**super()._reduce_kwargs(), butler=self.butler)
205 
206  def extractMetadata(self, filename: str) -> RawFileData:
207  """Extract and process metadata from a single raw file.
208 
209  Parameters
210  ----------
211  filename : `str`
212  Path to the file.
213 
214  Returns
215  -------
216  data : `RawFileData`
217  A structure containing the metadata extracted from the file,
218  as well as the original filename. All fields will be populated,
 219  but the data IDs of the entries in `RawFileData.datasets` will be
 220  minimal (unexpanded) `DataCoordinate` instances.
221 
222  Notes
223  -----
224  Assumes that there is a single dataset associated with the given
225  file. Instruments using a single file to store multiple datasets
226  must implement their own version of this method.
227  """
228 
229  # We do not want to stop ingest if we are given a bad file.
230  # Instead return a RawFileData with no datasets and allow
231  # the caller to report the failure.
232 
233  try:
234  # Manually merge the primary and "first data" headers here because
235  # we do not know in general if an input file has set INHERIT=T.
236  phdu = readMetadata(filename, 0)
237  header = merge_headers([phdu, readMetadata(filename)], mode="overwrite")
238  datasets = [self._calculate_dataset_info(header, filename)]
239  except Exception as e:
240  self.log.debug("Problem extracting metadata from %s: %s", filename, e)
241  # Indicate to the caller that we failed to read
242  datasets = []
243  FormatterClass = Formatter
244  instrument = None
245  else:
246  self.log.debug("Extracted metadata from file %s", filename)
247  # The data model currently assumes that whilst multiple datasets
248  # can be associated with a single file, they must all share the
249  # same formatter.
250  try:
251  instrument = Instrument.fromName(datasets[0].dataId["instrument"], self.butler.registry)
252  except LookupError:
253  self.log.warning("Instrument %s for file %s not known to registry",
254  datasets[0].dataId["instrument"], filename)
255  datasets = []
256  FormatterClass = Formatter
257  instrument = None
258  else:
259  FormatterClass = instrument.getRawFormatter(datasets[0].dataId)
260 
261  return RawFileData(datasets=datasets, filename=filename,
262  FormatterClass=FormatterClass,
263  instrumentClass=instrument)
264 
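# Illustrative sketch (not part of ingest.py): the header merge performed in
# extractMetadata, shown standalone. The filename is hypothetical; because we
# cannot assume the file sets INHERIT=T, the primary header is merged with the
# default (first data) header explicitly using the "overwrite" mode.
#
#     filename = "raw_0001.fits"                     # hypothetical path
#     phdu = readMetadata(filename, 0)               # primary HDU header
#     header = merge_headers([phdu, readMetadata(filename)], mode="overwrite")
#     info = ObservationInfo(header, filename=filename)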
265  def _calculate_dataset_info(self, header, filename):
266  """Calculate a RawFileDatasetInfo from the supplied information.
267 
268  Parameters
269  ----------
270  header : `Mapping`
271  Header from the dataset.
272  filename : `str`
273  Filename to use for error messages.
274 
275  Returns
276  -------
277  dataset : `RawFileDatasetInfo`
278  The dataId, and observation information associated with this
279  dataset.
280  """
281  # To ensure we aren't slowed down for no reason, explicitly
282  # list here the properties we need for the schema
 283  # Use a dict whose values are booleans, where True indicates
 284  # that the property is required.
285  ingest_subset = {
286  "altaz_begin": False,
287  "boresight_rotation_coord": False,
288  "boresight_rotation_angle": False,
289  "dark_time": False,
290  "datetime_begin": True,
291  "datetime_end": True,
292  "detector_num": True,
293  "exposure_group": False,
294  "exposure_id": True,
295  "exposure_time": True,
296  "instrument": True,
297  "tracking_radec": False,
298  "object": False,
299  "observation_counter": False,
300  "observation_id": True,
301  "observation_reason": False,
302  "observation_type": True,
303  "observing_day": False,
304  "physical_filter": True,
305  "science_program": False,
306  "visit_id": False,
307  }
308 
309  obsInfo = ObservationInfo(header, pedantic=False, filename=filename,
310  required={k for k in ingest_subset if ingest_subset[k]},
311  subset=set(ingest_subset))
312 
313  dataId = DataCoordinate.standardize(instrument=obsInfo.instrument,
314  exposure=obsInfo.exposure_id,
315  detector=obsInfo.detector_num,
316  universe=self.universe)
317  return RawFileDatasetInfo(obsInfo=obsInfo, dataId=dataId)
318 
319  def groupByExposure(self, files: Iterable[RawFileData]) -> List[RawExposureData]:
320  """Group an iterable of `RawFileData` by exposure.
321 
322  Parameters
323  ----------
324  files : iterable of `RawFileData`
325  File-level information to group.
326 
327  Returns
328  -------
329  exposures : `list` of `RawExposureData`
330  A list of structures that group the file-level information by
331  exposure. All fields will be populated. The
332  `RawExposureData.dataId` attributes will be minimal (unexpanded)
333  `DataCoordinate` instances.
334  """
335  exposureDimensions = self.universe["exposure"].graph
336  byExposure = defaultdict(list)
337  for f in files:
338  # Assume that the first dataset is representative for the file
339  byExposure[f.datasets[0].dataId.subset(exposureDimensions)].append(f)
340 
341  return [RawExposureData(dataId=dataId, files=exposureFiles, universe=self.universe)
342  for dataId, exposureFiles in byExposure.items()]
343 
344  def expandDataIds(self, data: RawExposureData) -> RawExposureData:
345  """Expand the data IDs associated with a raw exposure to include
346  additional metadata records.
347 
348  Parameters
349  ----------
 350  data : `RawExposureData`
 351  A structure containing information about the exposure to be
 352  ingested. Must have `RawExposureData.record` populated. Should
353  be considered consumed upon return.
354 
355  Returns
356  -------
357  exposure : `RawExposureData`
358  An updated version of the input structure, with
359  `RawExposureData.dataId` and nested `RawFileData.dataId` attributes
360  updated to data IDs for which `DataCoordinate.hasRecords` returns
361  `True`.
362  """
 363  # We start by expanding the exposure-level data ID; we won't use that
364  # directly in file ingest, but this lets us do some database lookups
365  # once per exposure instead of once per file later.
366  data.dataId = self.butler.registry.expandDataId(
367  data.dataId,
368  # We pass in the records we'll be inserting shortly so they aren't
369  # looked up from the database. We do expect instrument and filter
370  # records to be retrieved from the database here (though the
371  # Registry may cache them so there isn't a lookup every time).
372  records={
373  self.butler.registry.dimensions["exposure"]: data.record,
374  }
375  )
376  # Now we expand the per-file (exposure+detector) data IDs. This time
377  # we pass in the records we just retrieved from the exposure data ID
378  # expansion.
379  for file in data.files:
380  for dataset in file.datasets:
381  dataset.dataId = self.butler.registry.expandDataId(
382  dataset.dataId,
383  records=dict(data.dataId.records)
384  )
385  return data
386 
 387  def prep(self, files, *, pool: Optional[Pool] = None, processes: int = 1) -> Tuple[Iterator[RawExposureData], List[str]]:
388  """Perform all ingest preprocessing steps that do not involve actually
389  modifying the database.
390 
391  Parameters
392  ----------
393  files : iterable over `str` or path-like objects
394  Paths to the files to be ingested. Will be made absolute
395  if they are not already.
396  pool : `multiprocessing.Pool`, optional
397  If not `None`, a process pool with which to parallelize some
398  operations.
399  processes : `int`, optional
400  The number of processes to use. Ignored if ``pool`` is not `None`.
401 
 402  Returns
 403  -------
 404  exposures : iterator of `RawExposureData`
405  Data structures containing dimension records, filenames, and data
406  IDs to be ingested (one structure for each exposure).
407  bad_files : `list` of `str`
408  List of all the files that could not have metadata extracted.
409  """
410  if pool is None and processes > 1:
411  pool = Pool(processes)
412  mapFunc = map if pool is None else pool.imap_unordered
413 
 414  # Extract metadata from each file.
415  # This could run in a subprocess so collect all output
416  # before looking at failures.
417  fileData: Iterator[RawFileData] = mapFunc(self.extractMetadata, files)
418 
419  # Filter out all the failed reads and store them for later
420  # reporting
421  good_files = []
422  bad_files = []
423  for fileDatum in fileData:
424  if not fileDatum.datasets:
425  bad_files.append(fileDatum.filename)
426  else:
427  good_files.append(fileDatum)
428  fileData = good_files
429 
430  self.log.info("Successfully extracted metadata from %d file%s with %d failure%s",
431  len(fileData), "" if len(fileData) == 1 else "s",
432  len(bad_files), "" if len(bad_files) == 1 else "s")
433 
434  # Use that metadata to group files (and extracted metadata) by
435  # exposure. Never parallelized because it's intrinsically a gather
436  # step.
437  exposureData: List[RawExposureData] = self.groupByExposure(fileData)
438 
439  # The next operation operates on RawExposureData instances (one at
440  # a time) in-place and then returns the modified instance. We call it
441  # as a pass-through instead of relying on the arguments we pass in to
442  # have been modified because in the parallel case those arguments are
443  # going to be pickled and unpickled, and I'm not certain
444  # multiprocessing is careful enough with that for output arguments to
445  # work.
446 
447  # Expand the data IDs to include all dimension metadata; we need this
448  # because we may need to generate path templates that rely on that
449  # metadata.
450  # This is the first step that involves actual database calls (but just
451  # SELECTs), so if there's going to be a problem with connections vs.
452  # multiple processes, or lock contention (in SQLite) slowing things
453  # down, it'll happen here.
454  return mapFunc(self.expandDataIds, exposureData), bad_files
455 
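# Illustrative sketch (not part of ingest.py): driving prep() directly with an
# explicit process pool. ``task`` and the file list are hypothetical; note that
# the returned exposures iterator is lazy, so the data ID expansion (and its
# database lookups) happens as it is consumed.
#
#     with Pool(4) as pool:
#         exposures, bad_files = task.prep(["raw_0001.fits", "raw_0002.fits"], pool=pool)
#         for exposure in exposures:
#             print(exposure.dataId, len(exposure.files))
#     if bad_files:
#         print("Could not read:", bad_files)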
456  def ingestExposureDatasets(self, exposure: RawExposureData, *, run: Optional[str] = None
457  ) -> List[DatasetRef]:
458  """Ingest all raw files in one exposure.
459 
460  Parameters
461  ----------
462  exposure : `RawExposureData`
463  A structure containing information about the exposure to be
 464  ingested. Must have `RawExposureData.record` populated and all
465  data ID attributes expanded.
466  run : `str`, optional
467  Name of a RUN-type collection to write to, overriding
468  ``self.butler.run``.
469 
470  Returns
471  -------
472  refs : `list` of `lsst.daf.butler.DatasetRef`
473  Dataset references for ingested raws.
474  """
475  datasets = [FileDataset(path=os.path.abspath(file.filename),
476  refs=[DatasetRef(self.datasetType, d.dataId) for d in file.datasets],
477  formatter=file.FormatterClass)
478  for file in exposure.files]
479  self.butler.ingest(*datasets, transfer=self.config.transfer, run=run)
480  return [ref for dataset in datasets for ref in dataset.refs]
481 
482  def run(self, files, *, pool: Optional[Pool] = None, processes: int = 1, run: Optional[str] = None):
483  """Ingest files into a Butler data repository.
484 
485  This creates any new exposure or visit Dimension entries needed to
486  identify the ingested files, creates new Dataset entries in the
487  Registry and finally ingests the files themselves into the Datastore.
488  Any needed instrument, detector, and physical_filter Dimension entries
489  must exist in the Registry before `run` is called.
490 
491  Parameters
492  ----------
493  files : iterable over `str` or path-like objects
494  Paths to the files to be ingested. Will be made absolute
495  if they are not already.
496  pool : `multiprocessing.Pool`, optional
497  If not `None`, a process pool with which to parallelize some
498  operations.
499  processes : `int`, optional
500  The number of processes to use. Ignored if ``pool`` is not `None`.
501  run : `str`, optional
502  Name of a RUN-type collection to write to, overriding
503  the default derived from the instrument name.
504 
505  Returns
506  -------
507  refs : `list` of `lsst.daf.butler.DatasetRef`
508  Dataset references for ingested raws.
509 
510  Notes
511  -----
512  This method inserts all datasets for an exposure within a transaction,
513  guaranteeing that partial exposures are never ingested. The exposure
514  dimension record is inserted with `Registry.syncDimensionData` first
515  (in its own transaction), which inserts only if a record with the same
516  primary key does not already exist. This allows different files within
 517  the same exposure to be ingested in different runs.
518  """
519  exposureData, bad_files = self.prep(files, pool=pool, processes=processes)
520  # Up to this point, we haven't modified the data repository at all.
521  # Now we finally do that, with one transaction per exposure. This is
522  # not parallelized at present because the performance of this step is
523  # limited by the database server. That may or may not change in the
524  # future once we increase our usage of bulk inserts and reduce our
525  # usage of savepoints; we've tried to get everything but the database
526  # operations done in advance to reduce the time spent inside
527  # transactions.
528  self.butler.registry.registerDatasetType(self.datasetType)
529  refs = []
530  runs = set()
531  n_exposures = 0
532  n_exposures_failed = 0
533  n_ingests_failed = 0
534  for exposure in exposureData:
535 
536  self.log.debug("Attempting to ingest %d file%s from exposure %s:%s",
537  len(exposure.files), "" if len(exposure.files) == 1 else "s",
538  exposure.record.instrument, exposure.record.obs_id)
539 
540  try:
541  self.butler.registry.syncDimensionData("exposure", exposure.record)
542  except Exception as e:
543  n_exposures_failed += 1
544  self.log.warning("Exposure %s:%s could not be registered: %s",
545  exposure.record.instrument, exposure.record.obs_id, e)
546  continue
547 
 548  # Use the instrument's default run if none was specified explicitly
549  if run is None:
550  instrumentClass = exposure.files[0].instrumentClass
551  this_run = instrumentClass.makeDefaultRawIngestRunName()
552  else:
553  this_run = run
554  if this_run not in runs:
555  self.butler.registry.registerCollection(this_run, type=CollectionType.RUN)
556  runs.add(this_run)
557  try:
558  with self.butler.transaction():
559  refs.extend(self.ingestExposureDatasets(exposure, run=this_run))
560  except Exception as e:
561  n_ingests_failed += 1
562  self.log.warning("Failed to ingest the following for reason: %s", e)
563  for f in exposure.files:
564  self.log.warning("- %s", f.filename)
565  continue
566 
567  # Success for this exposure
568  n_exposures += 1
569  self.log.info("Exposure %s:%s ingested successfully",
570  exposure.record.instrument, exposure.record.obs_id)
571 
572  had_failure = False
573 
574  if bad_files:
575  had_failure = True
576  self.log.warning("Could not extract observation metadata from the following:")
577  for f in bad_files:
578  self.log.warning("- %s", f)
579 
580  self.log.info("Successfully processed data from %d exposure%s with %d failure%s from exposure"
581  " registration and %d failure%s from file ingest.",
582  n_exposures, "" if n_exposures == 1 else "s",
583  n_exposures_failed, "" if n_exposures_failed == 1 else "s",
584  n_ingests_failed, "" if n_ingests_failed == 1 else "s")
585  if n_exposures_failed > 0 or n_ingests_failed > 0:
586  had_failure = True
587  self.log.info("Ingested %d distinct Butler dataset%s",
588  len(refs), "" if len(refs) == 1 else "s")
589 
590  if had_failure:
591  raise RuntimeError("Some failures encountered during ingestion")
592 
593  return refs
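# Illustrative sketch (not part of ingest.py): typical end-to-end use of
# RawIngestTask, assuming a writeable Butler. The repository path, file list,
# and RUN collection name are hypothetical examples.
#
#     butler = Butler("/path/to/repo", writeable=True)
#     config = RawIngestConfig()
#     config.transfer = "symlink"
#     task = RawIngestTask(config=config, butler=butler)
#     refs = task.run(["raw_0001.fits", "raw_0002.fits"], run="MyCam/raw/all")
#     print(f"Ingested {len(refs)} dataset(s)")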