lsst.obs.base  20.0.0-46-g00fa051+95f8c0b24e
ingest.py
# This file is part of obs_base.
#
# Developed for the LSST Data Management System.
# This product includes software developed by the LSST Project
# (https://www.lsst.org).
# See the COPYRIGHT file at the top-level directory of this distribution
# for details of code ownership.
#
# This program is free software: you can redistribute it and/or modify
# it under the terms of the GNU General Public License as published by
# the Free Software Foundation, either version 3 of the License, or
# (at your option) any later version.
#
# This program is distributed in the hope that it will be useful,
# but WITHOUT ANY WARRANTY; without even the implied warranty of
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
# GNU General Public License for more details.
#
# You should have received a copy of the GNU General Public License
# along with this program. If not, see <http://www.gnu.org/licenses/>.


__all__ = ("RawIngestTask", "RawIngestConfig", "makeTransferChoiceField")

import os.path
from dataclasses import dataclass, InitVar
from typing import List, Iterator, Iterable, Type, Optional, Any
from collections import defaultdict
from multiprocessing import Pool

from astro_metadata_translator import ObservationInfo, fix_header, merge_headers
from lsst.afw.fits import readMetadata
from lsst.daf.butler import (
    Butler,
    CollectionType,
    DataCoordinate,
    DatasetRef,
    DatasetType,
    DimensionRecord,
    DimensionUniverse,
    FileDataset,
)
from lsst.pex.config import Config, ChoiceField
from lsst.pipe.base import Task

from ._instrument import Instrument, makeExposureRecordFromObsInfo
from ._fitsRawFormatterBase import FitsRawFormatterBase


@dataclass
class RawFileDatasetInfo:
    """Structure that holds information about a single dataset within a
    raw file.
    """

    dataId: DataCoordinate
    """Data ID for this file (`lsst.daf.butler.DataCoordinate`).
    """

    obsInfo: ObservationInfo
    """Standardized observation metadata extracted directly from the file
    headers (`astro_metadata_translator.ObservationInfo`).
    """


@dataclass
class RawFileData:
    """Structure that holds information about a single raw file, used during
    ingest.
    """

    datasets: List[RawFileDatasetInfo]
    """The information describing each dataset within this raw file.
    (`list` of `RawFileDatasetInfo`)
    """

    filename: str
    """Name of the file this information was extracted from (`str`).

    This is the path prior to ingest, not the path after ingest.
    """

    FormatterClass: Type[FitsRawFormatterBase]
    """Formatter class that should be used to ingest this file (`type`; as
    subclass of `FitsRawFormatterBase`).
    """

    instrumentClass: Type[Instrument]
    """The `Instrument` class associated with this file."""


@dataclass
class RawExposureData:
    """Structure that holds information about a complete raw exposure, used
    during ingest.
    """

    dataId: DataCoordinate
    """Data ID for this exposure (`lsst.daf.butler.DataCoordinate`).
    """

    files: List[RawFileData]
    """List of structures containing file-level information.
    """

    universe: InitVar[DimensionUniverse]
    """Set of all known dimensions.
    """

    record: Optional[DimensionRecord] = None
    """The exposure `DimensionRecord` that must be inserted into the
    `~lsst.daf.butler.Registry` prior to file-level ingest (`DimensionRecord`).
    """

    def __post_init__(self, universe: DimensionUniverse):
        # We don't care which file or dataset we read metadata from, because
        # we're assuming they'll all be the same; just use the first ones.
        self.record = makeExposureRecordFromObsInfo(self.files[0].datasets[0].obsInfo, universe)


def makeTransferChoiceField(doc="How to transfer files (None for no transfer).", default="auto"):
    """Create a Config field with options for how to transfer files between
    data repositories.

    The allowed options for the field are exactly those supported by
    `lsst.daf.butler.Datastore.ingest`.

    Parameters
    ----------
    doc : `str`
        Documentation for the configuration field.
    default : `str`, optional
        Default transfer mode for the created field.

    Returns
    -------
    field : `lsst.pex.config.ChoiceField`
        Configuration field.
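
    Examples
    --------
    A minimal sketch of how the created field can be used in a task config;
    the ``MyIngestConfig`` name is purely illustrative::

        class MyIngestConfig(Config):
            transfer = makeTransferChoiceField(default="symlink")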
    """
    return ChoiceField(
        doc=doc,
        dtype=str,
        allowed={"move": "move",
                 "copy": "copy",
                 "auto": "choice will depend on datastore",
                 "link": "hard link falling back to symbolic link",
                 "hardlink": "hard link",
                 "symlink": "symbolic (soft) link",
                 "relsymlink": "relative symbolic link",
                 },
        optional=True,
        default=default
    )


class RawIngestConfig(Config):
    transfer = makeTransferChoiceField()


class RawIngestTask(Task):
    """Driver Task for ingesting raw data into Gen3 Butler repositories.

    Parameters
    ----------
    config : `RawIngestConfig`
        Configuration for the task.
    butler : `~lsst.daf.butler.Butler`
        Writeable butler instance, with ``butler.run`` set to the appropriate
        `~lsst.daf.butler.CollectionType.RUN` collection for these raw
        datasets.
    **kwargs
        Additional keyword arguments are forwarded to the `lsst.pipe.base.Task`
        constructor.

    Notes
    -----
    Each instance of `RawIngestTask` writes to the same Butler. Each
    invocation of `RawIngestTask.run` ingests a list of files.
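
    Examples
    --------
    A minimal sketch of typical construction and use; the repository path,
    transfer mode, and file names are placeholders::

        butler = Butler("/path/to/repo", writeable=True)
        config = RawIngestConfig()
        config.transfer = "symlink"
        task = RawIngestTask(config=config, butler=butler)
        refs = task.run(["raw_0001.fits", "raw_0002.fits"])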
    """

    ConfigClass = RawIngestConfig

    _DefaultName = "ingest"

    def getDatasetType(self):
        """Return the DatasetType of the datasets ingested by this Task.
        """
        return DatasetType("raw", ("instrument", "detector", "exposure"), "Exposure",
                           universe=self.butler.registry.dimensions)

    def __init__(self, config: Optional[RawIngestConfig] = None, *, butler: Butler, **kwargs: Any):
        config.validate()  # Not a CmdlineTask nor PipelineTask, so have to validate the config here.
        super().__init__(config, **kwargs)
        self.butler = butler
        self.universe = self.butler.registry.dimensions
        self.datasetType = self.getDatasetType()

        # Import all the instrument classes so that we ensure that we
        # have all the relevant metadata translators loaded.
        Instrument.importAll(self.butler.registry)

    @classmethod
    # WARNING: this method hardcodes the parameters to pipe.base.Task.__init__.
    # Nobody seems to know a way to delegate them to Task code.
    def _makeTask(cls, config: RawIngestConfig, butler: Butler, name: str, parentTask: Task):
        """Construct a RawIngestTask using only positional arguments.

        Parameters
        ----------
        All parameters are as for `RawIngestTask`.
        """
        return cls(config=config, butler=butler, name=name, parentTask=parentTask)

    # Overrides Task.__reduce__
    def __reduce__(self):
        return (self._makeTask, (self.config, self.butler, self._name, self._parentTask))

    def extractMetadata(self, filename: str) -> RawFileData:
        """Extract and process metadata from a single raw file.

        Parameters
        ----------
        filename : `str`
            Path to the file.

        Returns
        -------
        data : `RawFileData`
            A structure containing the metadata extracted from the file,
            as well as the original filename. All fields will be populated,
            but the ``dataId`` attributes of the contained datasets will be
            minimal (unexpanded) `DataCoordinate` instances.

        Notes
        -----
        Assumes that there is a single dataset associated with the given
        file. Instruments using a single file to store multiple datasets
        must implement their own version of this method.
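
        Examples
        --------
        A minimal sketch, assuming ``task`` is an already-constructed
        `RawIngestTask` and the file name is a placeholder::

            fileData = task.extractMetadata("raw_0001.fits")
            dataId = fileData.datasets[0].dataId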
        """
        # Manually merge the primary and "first data" headers here because we
        # do not know in general if an input file has set INHERIT=T.
        phdu = readMetadata(filename, 0)
        header = merge_headers([phdu, readMetadata(filename)], mode="overwrite")
        fix_header(header)
        datasets = [self._calculate_dataset_info(header, filename)]

        # The data model currently assumes that whilst multiple datasets
        # can be associated with a single file, they must all share the
        # same formatter.
        instrument = Instrument.fromName(datasets[0].dataId["instrument"], self.butler.registry)
        FormatterClass = instrument.getRawFormatter(datasets[0].dataId)

        return RawFileData(datasets=datasets, filename=filename,
                           FormatterClass=FormatterClass,
                           instrumentClass=instrument)

    def _calculate_dataset_info(self, header, filename):
        """Calculate a RawFileDatasetInfo from the supplied information.

        Parameters
        ----------
        header : `Mapping`
            Header from the dataset.
        filename : `str`
            Filename to use for error messages.

        Returns
        -------
        dataset : `RawFileDatasetInfo`
            The dataId, and observation information associated with this
            dataset.
        """
        obsInfo = ObservationInfo(header)
        dataId = DataCoordinate.standardize(instrument=obsInfo.instrument,
                                            exposure=obsInfo.exposure_id,
                                            detector=obsInfo.detector_num,
                                            universe=self.universe)
        return RawFileDatasetInfo(obsInfo=obsInfo, dataId=dataId)

    def groupByExposure(self, files: Iterable[RawFileData]) -> List[RawExposureData]:
        """Group an iterable of `RawFileData` by exposure.

        Parameters
        ----------
        files : iterable of `RawFileData`
            File-level information to group.

        Returns
        -------
        exposures : `list` of `RawExposureData`
            A list of structures that group the file-level information by
            exposure. All fields will be populated. The
            `RawExposureData.dataId` attributes will be minimal (unexpanded)
            `DataCoordinate` instances.
        """
        exposureDimensions = self.universe["exposure"].graph
        byExposure = defaultdict(list)
        for f in files:
            # Assume that the first dataset is representative for the file
            byExposure[f.datasets[0].dataId.subset(exposureDimensions)].append(f)

        return [RawExposureData(dataId=dataId, files=exposureFiles, universe=self.universe)
                for dataId, exposureFiles in byExposure.items()]

    def expandDataIds(self, data: RawExposureData) -> RawExposureData:
        """Expand the data IDs associated with a raw exposure to include
        additional metadata records.

        Parameters
        ----------
        data : `RawExposureData`
            A structure containing information about the exposure to be
            ingested. Must have `RawExposureData.record` populated. Should
            be considered consumed upon return.

        Returns
        -------
        exposure : `RawExposureData`
            An updated version of the input structure, with
            `RawExposureData.dataId` and nested `RawFileDatasetInfo.dataId` attributes
            updated to data IDs for which `DataCoordinate.hasRecords` returns
            `True`.
        """
        # We start by expanding the exposure-level data ID; we won't use that
        # directly in file ingest, but this lets us do some database lookups
        # once per exposure instead of once per file later.
        data.dataId = self.butler.registry.expandDataId(
            data.dataId,
            # We pass in the records we'll be inserting shortly so they aren't
            # looked up from the database. We do expect instrument and filter
            # records to be retrieved from the database here (though the
            # Registry may cache them so there isn't a lookup every time).
            records={
                self.butler.registry.dimensions["exposure"]: data.record,
            }
        )
        # Now we expand the per-file (exposure+detector) data IDs. This time
        # we pass in the records we just retrieved from the exposure data ID
        # expansion.
        for file in data.files:
            for dataset in file.datasets:
                dataset.dataId = self.butler.registry.expandDataId(
                    dataset.dataId,
                    records=dict(data.dataId.records)
                )
        return data

    def prep(self, files, *, pool: Optional[Pool] = None, processes: int = 1) -> Iterator[RawExposureData]:
        """Perform all ingest preprocessing steps that do not involve actually
        modifying the database.

        Parameters
        ----------
        files : iterable over `str` or path-like objects
            Paths to the files to be ingested. Will be made absolute
            if they are not already.
        pool : `multiprocessing.Pool`, optional
            If not `None`, a process pool with which to parallelize some
            operations.
        processes : `int`, optional
            The number of processes to use. Ignored if ``pool`` is not `None`.

        Yields
        ------
        exposure : `RawExposureData`
            Data structures containing dimension records, filenames, and data
            IDs to be ingested (one structure for each exposure).
        """
        if pool is None and processes > 1:
            pool = Pool(processes)
        mapFunc = map if pool is None else pool.imap_unordered

        # Extract metadata and build per-detector regions.
        fileData: Iterator[RawFileData] = mapFunc(self.extractMetadata, files)

        # Use that metadata to group files (and extracted metadata) by
        # exposure. Never parallelized because it's intrinsically a gather
        # step.
        exposureData: List[RawExposureData] = self.groupByExposure(fileData)

        # The next operation operates on RawExposureData instances (one at
        # a time) in-place and then returns the modified instance. We call it
        # as a pass-through instead of relying on the arguments we pass in to
        # have been modified because in the parallel case those arguments are
        # going to be pickled and unpickled, and I'm not certain
        # multiprocessing is careful enough with that for output arguments to
        # work.

        # Expand the data IDs to include all dimension metadata; we need this
        # because we may need to generate path templates that rely on that
        # metadata.
        # This is the first step that involves actual database calls (but just
        # SELECTs), so if there's going to be a problem with connections vs.
        # multiple processes, or lock contention (in SQLite) slowing things
        # down, it'll happen here.
        return mapFunc(self.expandDataIds, exposureData)

    def ingestExposureDatasets(self, exposure: RawExposureData, *, run: Optional[str] = None
                               ) -> List[DatasetRef]:
        """Ingest all raw files in one exposure.

        Parameters
        ----------
        exposure : `RawExposureData`
            A structure containing information about the exposure to be
            ingested. Must have `RawExposureData.record` populated and all
            data ID attributes expanded.
        run : `str`, optional
            Name of a RUN-type collection to write to, overriding
            ``self.butler.run``.

        Returns
        -------
        refs : `list` of `lsst.daf.butler.DatasetRef`
            Dataset references for ingested raws.
        """
        datasets = [FileDataset(path=os.path.abspath(file.filename),
                                refs=[DatasetRef(self.datasetType, d.dataId) for d in file.datasets],
                                formatter=file.FormatterClass)
                    for file in exposure.files]
        self.butler.ingest(*datasets, transfer=self.config.transfer, run=run)
        return [ref for dataset in datasets for ref in dataset.refs]

    def run(self, files, *, pool: Optional[Pool] = None, processes: int = 1, run: Optional[str] = None):
        """Ingest files into a Butler data repository.

        This creates any new exposure or visit Dimension entries needed to
        identify the ingested files, creates new Dataset entries in the
        Registry and finally ingests the files themselves into the Datastore.
        Any needed instrument, detector, and physical_filter Dimension entries
        must exist in the Registry before `run` is called.

        Parameters
        ----------
        files : iterable over `str` or path-like objects
            Paths to the files to be ingested. Will be made absolute
            if they are not already.
        pool : `multiprocessing.Pool`, optional
            If not `None`, a process pool with which to parallelize some
            operations.
        processes : `int`, optional
            The number of processes to use. Ignored if ``pool`` is not `None`.
        run : `str`, optional
            Name of a RUN-type collection to write to, overriding
            the default derived from the instrument name.

        Returns
        -------
        refs : `list` of `lsst.daf.butler.DatasetRef`
            Dataset references for ingested raws.

        Notes
        -----
        This method inserts all datasets for an exposure within a transaction,
        guaranteeing that partial exposures are never ingested. The exposure
        dimension record is inserted with `Registry.syncDimensionData` first
        (in its own transaction), which inserts only if a record with the same
        primary key does not already exist. This allows different files within
        the same exposure to be ingested in different runs.
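
        Examples
        --------
        A minimal sketch, assuming ``task`` is an already-constructed
        `RawIngestTask`; the file names are placeholders::

            refs = task.run(["raw_0001.fits", "raw_0002.fits"], processes=4)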
        """
        exposureData = self.prep(files, pool=pool, processes=processes)
        # Up to this point, we haven't modified the data repository at all.
        # Now we finally do that, with one transaction per exposure. This is
        # not parallelized at present because the performance of this step is
        # limited by the database server. That may or may not change in the
        # future once we increase our usage of bulk inserts and reduce our
        # usage of savepoints; we've tried to get everything but the database
        # operations done in advance to reduce the time spent inside
        # transactions.
        self.butler.registry.registerDatasetType(self.datasetType)
        refs = []
        runs = set()
        for exposure in exposureData:
            self.butler.registry.syncDimensionData("exposure", exposure.record)
            # Use the default run derived from the instrument if none was
            # specified explicitly.
            if run is None:
                instrumentClass = exposure.files[0].instrumentClass
                this_run = instrumentClass.makeDefaultRawIngestRunName()
            else:
                this_run = run
            if this_run not in runs:
                self.butler.registry.registerCollection(this_run, type=CollectionType.RUN)
                runs.add(this_run)
            with self.butler.transaction():
                refs.extend(self.ingestExposureDatasets(exposure, run=this_run))
        return refs