lsst.obs.base  20.0.0-15-g34741e2+3
ingest.py
# This file is part of obs_base.
#
# Developed for the LSST Data Management System.
# This product includes software developed by the LSST Project
# (https://www.lsst.org).
# See the COPYRIGHT file at the top-level directory of this distribution
# for details of code ownership.
#
# This program is free software: you can redistribute it and/or modify
# it under the terms of the GNU General Public License as published by
# the Free Software Foundation, either version 3 of the License, or
# (at your option) any later version.
#
# This program is distributed in the hope that it will be useful,
# but WITHOUT ANY WARRANTY; without even the implied warranty of
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
# GNU General Public License for more details.
#
# You should have received a copy of the GNU General Public License
# along with this program. If not, see <http://www.gnu.org/licenses/>.


__all__ = ("RawIngestTask", "RawIngestConfig", "makeTransferChoiceField")

import os.path
from dataclasses import dataclass, InitVar
from typing import List, Iterator, Iterable, Type, Optional, Any
from collections import defaultdict
from multiprocessing import Pool

from astro_metadata_translator import ObservationInfo, fix_header, merge_headers
from lsst.afw.fits import readMetadata
from lsst.daf.butler import (
    Butler,
    CollectionType,
    DataCoordinate,
    DatasetRef,
    DatasetType,
    DimensionRecord,
    DimensionUniverse,
    FileDataset,
)
from lsst.pex.config import Config, ChoiceField
from lsst.pipe.base import Task

from ._instrument import Instrument, makeExposureRecordFromObsInfo
from ._fitsRawFormatterBase import FitsRawFormatterBase


@dataclass
class RawFileDatasetInfo:
    """Structure that holds information about a single dataset within a
    raw file.
    """

    dataId: DataCoordinate
    """Data ID for this file (`lsst.daf.butler.DataCoordinate`).

    This may be a minimal `~lsst.daf.butler.DataCoordinate` base instance, or
    a complete `~lsst.daf.butler.ExpandedDataCoordinate`.
    """

    obsInfo: ObservationInfo
    """Standardized observation metadata extracted directly from the file
    headers (`astro_metadata_translator.ObservationInfo`).
    """


@dataclass
class RawFileData:
    """Structure that holds information about a single raw file, used during
    ingest.
    """

    datasets: List[RawFileDatasetInfo]
    """The information describing each dataset within this raw file.
    (`list` of `RawFileDatasetInfo`)
    """

    filename: str
    """Name of the file this information was extracted from (`str`).

    This is the path prior to ingest, not the path after ingest.
    """

    FormatterClass: Type[FitsRawFormatterBase]
    """Formatter class that should be used to ingest this file (`type`; a
    subclass of `FitsRawFormatterBase`).
    """

    instrumentClass: Type[Instrument]
    """The `Instrument` class associated with this file."""


@dataclass
class RawExposureData:
    """Structure that holds information about a complete raw exposure, used
    during ingest.
    """

    dataId: DataCoordinate
    """Data ID for this exposure (`lsst.daf.butler.DataCoordinate`).

    This may be a minimal `~lsst.daf.butler.DataCoordinate` base instance, or
    a complete `~lsst.daf.butler.ExpandedDataCoordinate`.
    """

    files: List[RawFileData]
    """List of structures containing file-level information.
    """

    universe: InitVar[DimensionUniverse]
    """Set of all known dimensions.
    """

    record: Optional[DimensionRecord] = None
    """The exposure `DimensionRecord` that must be inserted into the
    `~lsst.daf.butler.Registry` prior to file-level ingest (`DimensionRecord`).
    """

    def __post_init__(self, universe: DimensionUniverse):
        # We don't care which file or dataset we read metadata from, because
        # we're assuming they'll all be the same; just use the first ones.
        self.record = makeExposureRecordFromObsInfo(self.files[0].datasets[0].obsInfo, universe)

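# Illustrative sketch (standard-library ``dataclasses`` behaviour only; the
# names below are hypothetical and not part of this module): ``universe`` above
# is an ``InitVar``, so it is accepted by the generated ``__init__`` and passed
# to ``__post_init__``, but it is never stored as an attribute.
#
#     @dataclass
#     class Scaled:
#         value: int
#         scale: InitVar[int]
#         scaled: int = 0
#
#         def __post_init__(self, scale: int):
#             self.scaled = self.value * scale
#
#     Scaled(value=2, scale=10).scaled   # 20; the instance has no ``scale``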

def makeTransferChoiceField(doc="How to transfer files (None for no transfer).", default=None):
    """Create a Config field with options for how to transfer files between
    data repositories.

    The allowed options for the field are exactly those supported by
    `lsst.daf.butler.Datastore.ingest`.

    Parameters
    ----------
    doc : `str`
        Documentation for the configuration field.
    default : `str`, optional
        Default transfer mode for the created field.

    Returns
    -------
    field : `lsst.pex.config.ChoiceField`
        Configuration field.
    """
    return ChoiceField(
        doc=doc,
        dtype=str,
        allowed={"move": "move",
                 "copy": "copy",
                 "auto": "choice will depend on datastore",
                 "link": "hard link falling back to symbolic link",
                 "hardlink": "hard link",
                 "symlink": "symbolic (soft) link",
                 "relsymlink": "relative symbolic link",
                 },
        optional=True,
        default=default
    )

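# Illustrative sketch (hypothetical config class, not part of this module):
# a downstream config embeds the field and may override its doc or default,
# for example
#
#     class MyIngestConfig(Config):
#         transfer = makeTransferChoiceField(default="symlink")
#
# `RawIngestConfig` below uses the field with its default arguments.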

class RawIngestConfig(Config):
    transfer = makeTransferChoiceField()


class RawIngestTask(Task):
    """Driver Task for ingesting raw data into Gen3 Butler repositories.

    Parameters
    ----------
    config : `RawIngestConfig`
        Configuration for the task.
    butler : `~lsst.daf.butler.Butler`
        Writeable butler instance, with ``butler.run`` set to the appropriate
        `~lsst.daf.butler.CollectionType.RUN` collection for these raw
        datasets.
    **kwargs
        Additional keyword arguments are forwarded to the `lsst.pipe.base.Task`
        constructor.

    Notes
    -----
    Each instance of `RawIngestTask` writes to the same Butler. Each
    invocation of `RawIngestTask.run` ingests a list of files.
    """

    ConfigClass = RawIngestConfig

    _DefaultName = "ingest"

    def getDatasetType(self):
        """Return the DatasetType of the datasets ingested by this Task.
        """
        return DatasetType("raw", ("instrument", "detector", "exposure"), "Exposure",
                           universe=self.butler.registry.dimensions)

    def __init__(self, config: Optional[RawIngestConfig] = None, *, butler: Butler, **kwargs: Any):
        config.validate()  # Not a CmdlineTask nor PipelineTask, so have to validate the config here.
        super().__init__(config, **kwargs)
        self.butler = butler
        self.universe = self.butler.registry.dimensions
        self.datasetType = self.getDatasetType()

        # Import all the instrument classes so that we ensure that we
        # have all the relevant metadata translators loaded.
        Instrument.importAll(self.butler.registry)

    def extractMetadata(self, filename: str) -> RawFileData:
        """Extract and process metadata from a single raw file.

        Parameters
        ----------
        filename : `str`
            Path to the file.

        Returns
        -------
        data : `RawFileData`
            A structure containing the metadata extracted from the file,
            as well as the original filename. All fields will be populated,
            but the `RawFileData.dataId` attribute will be a minimal
            (unexpanded) `DataCoordinate` instance.

        Notes
        -----
        Assumes that there is a single dataset associated with the given
        file. Instruments using a single file to store multiple datasets
        must implement their own version of this method.
        """
        # Manually merge the primary and "first data" headers here because we
        # do not know in general if an input file has set INHERIT=T.
        phdu = readMetadata(filename, 0)
        header = merge_headers([phdu, readMetadata(filename)], mode="overwrite")
        fix_header(header)
        datasets = [self._calculate_dataset_info(header, filename)]

        # The data model currently assumes that whilst multiple datasets
        # can be associated with a single file, they must all share the
        # same formatter.
        instrument = Instrument.fromName(datasets[0].dataId["instrument"], self.butler.registry)
        FormatterClass = instrument.getRawFormatter(datasets[0].dataId)

        return RawFileData(datasets=datasets, filename=filename,
                           FormatterClass=FormatterClass,
                           instrumentClass=instrument)

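    # Illustrative sketch of the header merge performed in ``extractMetadata``
    # above (hypothetical filename, not part of the original module). With
    # ``mode="overwrite"`` later headers take precedence, so keywords from the
    # first data HDU override those in the primary header, roughly what
    # INHERIT=T semantics would have provided:
    #
    #     phdu = readMetadata("raw.fits", 0)    # primary header
    #     first = readMetadata("raw.fits")      # first data HDU
    #     merged = merge_headers([phdu, first], mode="overwrite")
    #     fix_header(merged)                    # apply known header corrections
    #     info = ObservationInfo(merged)        # standardized metadata
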
    def _calculate_dataset_info(self, header, filename):
        """Calculate a RawFileDatasetInfo from the supplied information.

        Parameters
        ----------
        header : `Mapping`
            Header from the dataset.
        filename : `str`
            Filename to use for error messages.

        Returns
        -------
        dataset : `RawFileDatasetInfo`
            The dataId and observation information associated with this
            dataset.
        """
        obsInfo = ObservationInfo(header)
        dataId = DataCoordinate.standardize(instrument=obsInfo.instrument,
                                            exposure=obsInfo.exposure_id,
                                            detector=obsInfo.detector_num,
                                            universe=self.universe)
        return RawFileDatasetInfo(obsInfo=obsInfo, dataId=dataId)

    def groupByExposure(self, files: Iterable[RawFileData]) -> List[RawExposureData]:
        """Group an iterable of `RawFileData` by exposure.

        Parameters
        ----------
        files : iterable of `RawFileData`
            File-level information to group.

        Returns
        -------
        exposures : `list` of `RawExposureData`
            A list of structures that group the file-level information by
            exposure. All fields will be populated. The
            `RawExposureData.dataId` attributes will be minimal (unexpanded)
            `DataCoordinate` instances.
        """
        exposureDimensions = self.universe["exposure"].graph
        byExposure = defaultdict(list)
        for f in files:
            # Assume that the first dataset is representative for the file
            byExposure[f.datasets[0].dataId.subset(exposureDimensions)].append(f)

        return [RawExposureData(dataId=dataId, files=exposureFiles, universe=self.universe)
                for dataId, exposureFiles in byExposure.items()]

    def expandDataIds(self, data: RawExposureData) -> RawExposureData:
        """Expand the data IDs associated with a raw exposure to include
        additional metadata records.

        Parameters
        ----------
        data : `RawExposureData`
            A structure containing information about the exposure to be
            ingested. Must have `RawExposureData.record` populated. Should
            be considered consumed upon return.

        Returns
        -------
        exposure : `RawExposureData`
            An updated version of the input structure, with
            `RawExposureData.dataId` and nested `RawFileData.dataId` attributes
            containing `~lsst.daf.butler.ExpandedDataCoordinate` instances.
        """
        # We start by expanding the exposure-level data ID; we won't use that
        # directly in file ingest, but this lets us do some database lookups
        # once per exposure instead of once per file later.
        data.dataId = self.butler.registry.expandDataId(
            data.dataId,
            # We pass in the records we'll be inserting shortly so they aren't
            # looked up from the database. We do expect instrument and filter
            # records to be retrieved from the database here (though the
            # Registry may cache them so there isn't a lookup every time).
            records={
                self.butler.registry.dimensions["exposure"]: data.record,
            }
        )
        # Now we expand the per-file (exposure+detector) data IDs. This time
        # we pass in the records we just retrieved from the exposure data ID
        # expansion.
        for file in data.files:
            for dataset in file.datasets:
                dataset.dataId = self.butler.registry.expandDataId(
                    dataset.dataId,
                    records=dict(data.dataId.records)
                )
        return data

    def prep(self, files, *, pool: Optional[Pool] = None, processes: int = 1) -> Iterator[RawExposureData]:
        """Perform all ingest preprocessing steps that do not involve actually
        modifying the database.

        Parameters
        ----------
        files : iterable over `str` or path-like objects
            Paths to the files to be ingested. Will be made absolute
            if they are not already.
        pool : `multiprocessing.Pool`, optional
            If not `None`, a process pool with which to parallelize some
            operations.
        processes : `int`, optional
            The number of processes to use. Ignored if ``pool`` is not `None`.

        Yields
        ------
        exposure : `RawExposureData`
            Data structures containing dimension records, filenames, and data
            IDs to be ingested (one structure for each exposure).
        """
        if pool is None and processes > 1:
            pool = Pool(processes)
        mapFunc = map if pool is None else pool.imap_unordered

        # Extract metadata and build per-detector regions.
        fileData: Iterator[RawFileData] = mapFunc(self.extractMetadata, files)

        # Use that metadata to group files (and extracted metadata) by
        # exposure. Never parallelized because it's intrinsically a gather
        # step.
        exposureData: List[RawExposureData] = self.groupByExposure(fileData)

        # The next operation operates on RawExposureData instances (one at
        # a time) in-place and then returns the modified instance. We call it
        # as a pass-through instead of relying on the arguments we pass in to
        # have been modified because in the parallel case those arguments are
        # going to be pickled and unpickled, and I'm not certain
        # multiprocessing is careful enough with that for output arguments to
        # work.

        # Expand the data IDs to include all dimension metadata; we need this
        # because we may need to generate path templates that rely on that
        # metadata.
        # This is the first step that involves actual database calls (but just
        # SELECTs), so if there's going to be a problem with connections vs.
        # multiple processes, or lock contention (in SQLite) slowing things
        # down, it'll happen here.
        return mapFunc(self.expandDataIds, exposureData)

    def ingestExposureDatasets(self, exposure: RawExposureData, *, run: Optional[str] = None
                               ) -> List[DatasetRef]:
        """Ingest all raw files in one exposure.

        Parameters
        ----------
        exposure : `RawExposureData`
            A structure containing information about the exposure to be
            ingested. Must have `RawExposureData.record` populated and all
            data ID attributes expanded.
        run : `str`, optional
            Name of a RUN-type collection to write to, overriding
            ``self.butler.run``.

        Returns
        -------
        refs : `list` of `lsst.daf.butler.DatasetRef`
            Dataset references for ingested raws.
        """
        datasets = [FileDataset(path=os.path.abspath(file.filename),
                                refs=[DatasetRef(self.datasetType, d.dataId) for d in file.datasets],
                                formatter=file.FormatterClass)
                    for file in exposure.files]
        self.butler.ingest(*datasets, transfer=self.config.transfer, run=run)
        return [ref for dataset in datasets for ref in dataset.refs]

    def run(self, files, *, pool: Optional[Pool] = None, processes: int = 1, run: Optional[str] = None):
        """Ingest files into a Butler data repository.

        This creates any new exposure or visit Dimension entries needed to
        identify the ingested files, creates new Dataset entries in the
        Registry and finally ingests the files themselves into the Datastore.
        Any needed instrument, detector, and physical_filter Dimension entries
        must exist in the Registry before `run` is called.

        Parameters
        ----------
        files : iterable over `str` or path-like objects
            Paths to the files to be ingested. Will be made absolute
            if they are not already.
        pool : `multiprocessing.Pool`, optional
            If not `None`, a process pool with which to parallelize some
            operations.
        processes : `int`, optional
            The number of processes to use. Ignored if ``pool`` is not `None`.
        run : `str`, optional
            Name of a RUN-type collection to write to, overriding
            the default derived from the instrument name.

        Returns
        -------
        refs : `list` of `lsst.daf.butler.DatasetRef`
            Dataset references for ingested raws.

        Notes
        -----
        This method inserts all datasets for an exposure within a transaction,
        guaranteeing that partial exposures are never ingested. The exposure
        dimension record is inserted with `Registry.syncDimensionData` first
        (in its own transaction), which inserts only if a record with the same
        primary key does not already exist. This allows different files within
        the same exposure to be ingested in different runs.
        """
        exposureData = self.prep(files, pool=pool, processes=processes)
        # Up to this point, we haven't modified the data repository at all.
        # Now we finally do that, with one transaction per exposure. This is
        # not parallelized at present because the performance of this step is
        # limited by the database server. That may or may not change in the
        # future once we increase our usage of bulk inserts and reduce our
        # usage of savepoints; we've tried to get everything but the database
        # operations done in advance to reduce the time spent inside
        # transactions.
        self.butler.registry.registerDatasetType(self.datasetType)
        refs = []
        runs = set()
        for exposure in exposureData:
            self.butler.registry.syncDimensionData("exposure", exposure.record)
            # Override default run if nothing specified explicitly
            if run is None:
                instrumentClass = exposure.files[0].instrumentClass
                this_run = instrumentClass.makeDefaultRawIngestRunName()
            else:
                this_run = run
            if this_run not in runs:
                self.butler.registry.registerCollection(this_run, type=CollectionType.RUN)
                runs.add(this_run)
            with self.butler.transaction():
                refs.extend(self.ingestExposureDatasets(exposure, run=this_run))
        return refs
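

# Illustrative end-to-end sketch (hypothetical repository path and file names,
# not part of the original module). It assumes a writeable Gen3 repository in
# which the relevant instrument, detector, and physical_filter dimension
# records have already been registered:
#
#     from lsst.daf.butler import Butler
#     from lsst.obs.base import RawIngestTask, RawIngestConfig
#
#     butler = Butler("/path/to/repo", writeable=True)
#     config = RawIngestConfig()
#     config.transfer = "symlink"
#     task = RawIngestTask(config=config, butler=butler)
#     refs = task.run(["raw_0001.fits", "raw_0002.fits"])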