lsst.obs.base  19.0.0-36-g22095ce
ingest.py
# This file is part of obs_base.
#
# Developed for the LSST Data Management System.
# This product includes software developed by the LSST Project
# (https://www.lsst.org).
# See the COPYRIGHT file at the top-level directory of this distribution
# for details of code ownership.
#
# This program is free software: you can redistribute it and/or modify
# it under the terms of the GNU General Public License as published by
# the Free Software Foundation, either version 3 of the License, or
# (at your option) any later version.
#
# This program is distributed in the hope that it will be useful,
# but WITHOUT ANY WARRANTY; without even the implied warranty of
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
# GNU General Public License for more details.
#
# You should have received a copy of the GNU General Public License
# along with this program. If not, see <http://www.gnu.org/licenses/>.


__all__ = ("RawIngestTask", "RawIngestConfig", "makeTransferChoiceField")

import os.path
import itertools
from dataclasses import dataclass
from typing import List, Dict, Iterator, Iterable, Type, Optional, Any, Mapping
from collections import defaultdict
from multiprocessing import Pool

from astro_metadata_translator import ObservationInfo, fix_header, merge_headers
from lsst.utils import doImport
from lsst.afw.fits import readMetadata
from lsst.daf.butler import (
    Butler,
    DataCoordinate,
    DatasetRef,
    DatasetType,
    DimensionRecord,
    FileDataset,
)
from lsst.geom import Box2D
from lsst.pex.config import Config, Field, ChoiceField
from lsst.pipe.base import Task
from lsst.sphgeom import ConvexPolygon

from .instrument import makeExposureRecordFromObsInfo, makeVisitRecordFromObsInfo
from .fitsRawFormatterBase import FitsRawFormatterBase


@dataclass
class RawFileDatasetInfo:
    """Structure that holds information about a single dataset within a
    raw file.
    """

    dataId: DataCoordinate
    """Data ID for this file (`lsst.daf.butler.DataCoordinate`).

    This may be a minimal `~lsst.daf.butler.DataCoordinate` base instance, or
    a complete `~lsst.daf.butler.ExpandedDataCoordinate`.
    """

    obsInfo: ObservationInfo
    """Standardized observation metadata extracted directly from the file
    headers (`astro_metadata_translator.ObservationInfo`).
    """

    region: ConvexPolygon
    """Region on the sky covered by this file, possibly with padding
    (`lsst.sphgeom.ConvexPolygon`).
    """


@dataclass
class RawFileData:
    """Structure that holds information about a single raw file, used during
    ingest.
    """

    datasets: List[RawFileDatasetInfo]
    """The information describing each dataset within this raw file
    (`list` of `RawFileDatasetInfo`).
    """

    filename: str
    """Name of the file this information was extracted from (`str`).

    This is the path prior to ingest, not the path after ingest.
    """

    FormatterClass: Type[FitsRawFormatterBase]
    """Formatter class that should be used to ingest this file and compute
    a spatial region for it (`type`; a subclass of `FitsRawFormatterBase`).
    """


@dataclass
class RawExposureData:
    """Structure that holds information about a complete raw exposure, used
    during ingest.
    """

    dataId: DataCoordinate
    """Data ID for this exposure (`lsst.daf.butler.DataCoordinate`).

    This may be a minimal `~lsst.daf.butler.DataCoordinate` base instance, or
    a complete `~lsst.daf.butler.ExpandedDataCoordinate`.
    """

    files: List[RawFileData]
    """List of structures containing file-level information.
    """

    records: Optional[Dict[str, List[DimensionRecord]]] = None
    """Dictionary containing `DimensionRecord` instances that must be inserted
    into the `~lsst.daf.butler.Registry` prior to file-level ingest (`dict`).

    Keys are the names of dimension elements ("exposure" and optionally "visit"
    and "visit_detector_region"), while values are lists of `DimensionRecord`.

    May be `None` during some ingest steps.
    """


def makeTransferChoiceField(doc="How to transfer files (None for no transfer).", default=None):
    """Create a Config field with options for how to transfer files between
    data repositories.

    The allowed options for the field are exactly those supported by
    `lsst.daf.butler.Datastore.ingest`.

    Parameters
    ----------
    doc : `str`
        Documentation for the configuration field.
    default : `str`, optional
        Default transfer mode for the field.

    Returns
    -------
    field : `lsst.pex.config.ChoiceField`
        Configuration field.
    """
    return ChoiceField(
        doc=doc,
        dtype=str,
        allowed={"move": "move",
                 "copy": "copy",
                 "auto": "choice will depend on datastore",
                 "link": "hard link falling back to symbolic link",
                 "hardlink": "hard link",
                 "symlink": "symbolic (soft) link"},
        optional=True,
        default=default
    )


class RawIngestConfig(Config):
    transfer = makeTransferChoiceField()
    padRegionAmount = Field(
        dtype=int,
        default=0,
        doc="Pad the image by the specified number of pixels before calculating the region"
    )
    instrument = Field(
        doc=("Fully-qualified Python name of the `Instrument` subclass to "
             "associate with all raws."),
        dtype=str,
        optional=False,
        default=None,
    )

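# Example configuration (the instrument path below is illustrative only; any
# concrete `Instrument` subclass must be named by its fully-qualified Python
# path). ``transfer`` accepts the modes defined in `makeTransferChoiceField`,
# and ``padRegionAmount`` grows each detector bounding box before its sky
# region is computed:
#
#     config = RawIngestConfig()
#     config.instrument = "lsst.obs.example.ExampleInstrument"  # illustrative path
#     config.transfer = "copy"
#     config.padRegionAmount = 50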

class RawIngestTask(Task):
    """Driver Task for ingesting raw data into Gen3 Butler repositories.

    This Task is intended to be runnable from the command-line, but it doesn't
    meet the other requirements of CmdLineTask or PipelineTask, and wouldn't
    gain much from being one. It also wouldn't really be appropriate as a
    subtask of a CmdLineTask or PipelineTask; it's a Task essentially just to
    leverage the logging and configurability functionality that the Task
    framework provides.

    Each instance of `RawIngestTask` writes to the same Butler. Each
    invocation of `RawIngestTask.run` ingests a list of files.

    Parameters
    ----------
    config : `RawIngestConfig`
        Configuration for the task.
    butler : `~lsst.daf.butler.Butler`
        Butler instance. Ingested Datasets will be created as part of
        ``butler.run`` and associated with its Collection.
    kwds
        Additional keyword arguments are forwarded to the `lsst.pipe.base.Task`
        constructor.
    """

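    # Example usage (the repository path, collection name, and file list are
    # illustrative only, not part of this module):
    #
    #     from lsst.daf.butler import Butler
    #     butler = Butler("/path/to/repo", run="raws")
    #     task = RawIngestTask(config=config, butler=butler)
    #     refs = task.run(["raw-0001.fits", "raw-0002.fits"], processes=4)
    #
    # ``run`` returns the `~lsst.daf.butler.DatasetRef`s of the ingested files;
    # the instrument, detector, and physical_filter dimension records must
    # already exist in the registry (see `run` below).
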
    ConfigClass = RawIngestConfig

    _DefaultName = "ingest"

    def getDatasetType(self):
        """Return the DatasetType of the Datasets ingested by this Task.
        """
        return DatasetType("raw", ("instrument", "detector", "exposure"), "Exposure",
                           universe=self.butler.registry.dimensions)

    def __init__(self, config: Optional[RawIngestConfig] = None, *, butler: Butler, **kwds: Any):
        config.validate()  # Not a CmdlineTask nor PipelineTask, so have to validate the config here.
        super().__init__(config, **kwds)
        self.butler = butler
        self.universe = self.butler.registry.dimensions
        self.instrument = doImport(self.config.instrument)()
        # For now, we get a nominal Camera from the Instrument.
        # In the future, we may want to load one from a Butler calibration
        # collection that's appropriate for the observation timestamp of
        # the exposure.
        self.camera = self.instrument.getCamera()
        self.datasetType = self.getDatasetType()

    def extractMetadata(self, filename: str) -> RawFileData:
        """Extract and process metadata from a single raw file.

        Parameters
        ----------
        filename : `str`
            Path to the file.

        Returns
        -------
        data : `RawFileData`
            A structure containing the metadata extracted from the file,
            as well as the original filename. All fields will be populated,
            but the nested `RawFileDatasetInfo.dataId` attributes will be
            minimal (unexpanded) `DataCoordinate` instances.

        Notes
        -----
        Assumes that there is a single dataset associated with the given
        file. Instruments using a single file to store multiple datasets
        must implement their own version of this method.
        """
        # Manually merge the primary and "first data" headers here because we
        # do not know in general if an input file has set INHERIT=T.
        phdu = readMetadata(filename, 0)
        header = merge_headers([phdu, readMetadata(filename)], mode="overwrite")
        fix_header(header)
        datasets = [self._calculate_dataset_info(header, filename)]

        # The data model currently assumes that whilst multiple datasets
        # can be associated with a single file, they must all share the
        # same formatter.
        FormatterClass = self.instrument.getRawFormatter(datasets[0].dataId)

        return RawFileData(datasets=datasets, filename=filename,
                           FormatterClass=FormatterClass)

    def _calculate_dataset_info(self, header, filename):
        """Calculate a RawFileDatasetInfo from the supplied information.

        Parameters
        ----------
        header : `Mapping`
            Header from the dataset.
        filename : `str`
            Filename to use for error messages.

        Returns
        -------
        dataset : `RawFileDatasetInfo`
            The region, dataId, and observation information associated with
            this dataset.
        """
        obsInfo = ObservationInfo(header)
        dataId = DataCoordinate.standardize(instrument=obsInfo.instrument,
                                            exposure=obsInfo.exposure_id,
                                            detector=obsInfo.detector_num,
                                            universe=self.universe)
        if obsInfo.instrument != self.instrument.getName():
            raise ValueError(f"Incorrect instrument (expected {self.instrument.getName()}, "
                             f"got {obsInfo.instrument}) for file {filename}.")

        FormatterClass = self.instrument.getRawFormatter(dataId)
        region = self._calculate_region_from_dataset_metadata(obsInfo, header, FormatterClass)
        return RawFileDatasetInfo(obsInfo=obsInfo, region=region, dataId=dataId)

    def _calculate_region_from_dataset_metadata(self, obsInfo, header, FormatterClass):
        """Calculate the sky region covered by the supplied observation
        information.

        Parameters
        ----------
        obsInfo : `~astro_metadata_translator.ObservationInfo`
            Summary information of this dataset.
        header : `Mapping`
            Header from the dataset.
        FormatterClass : `type`, a subclass of `FitsRawFormatterBase`
            Formatter class that should be used to compute the spatial region.

        Returns
        -------
        region : `lsst.sphgeom.ConvexPolygon`
            Region of sky covered by this observation.
        """
        if obsInfo.visit_id is not None and obsInfo.tracking_radec is not None:
            formatter = FormatterClass.fromMetadata(metadata=header, obsInfo=obsInfo)
            visitInfo = formatter.makeVisitInfo()
            detector = self.camera[obsInfo.detector_num]
            wcs = formatter.makeWcs(visitInfo, detector)
            pixBox = Box2D(detector.getBBox())
            if self.config.padRegionAmount > 0:
                pixBox.grow(self.config.padRegionAmount)
            pixCorners = pixBox.getCorners()
            sphCorners = [wcs.pixelToSky(point).getVector() for point in pixCorners]
            region = ConvexPolygon(sphCorners)
        else:
            region = None
        return region

    def groupByExposure(self, files: Iterable[RawFileData]) -> List[RawExposureData]:
        """Group an iterable of `RawFileData` by exposure.

        Parameters
        ----------
        files : iterable of `RawFileData`
            File-level information to group.

        Returns
        -------
        exposures : `list` of `RawExposureData`
            A list of structures that group the file-level information by
            exposure. The `RawExposureData.records` attributes of elements
            will be `None`, but all other fields will be populated. The
            `RawExposureData.dataId` attributes will be minimal (unexpanded)
            `DataCoordinate` instances.
        """
        exposureDimensions = self.universe["exposure"].graph
        byExposure = defaultdict(list)
        for f in files:
            # Assume that the first dataset is representative for the file
            byExposure[f.datasets[0].dataId.subset(exposureDimensions)].append(f)

        return [RawExposureData(dataId=dataId, files=exposureFiles)
                for dataId, exposureFiles in byExposure.items()]

    def collectDimensionRecords(self, exposure: RawExposureData) -> RawExposureData:
        """Collect the `DimensionRecord` instances that must be inserted into
        the `~lsst.daf.butler.Registry` before an exposure's raw files may be
        ingested.

        Parameters
        ----------
        exposure : `RawExposureData`
            A structure containing information about the exposure to be
            ingested. Should be considered consumed upon return.

        Returns
        -------
        exposure : `RawExposureData`
            An updated version of the input structure, with
            `RawExposureData.records` populated.
        """
        firstFile = exposure.files[0]
        firstDataset = firstFile.datasets[0]
        VisitDetectorRegionRecordClass = self.universe["visit_detector_region"].RecordClass
        exposure.records = {
            "exposure": [makeExposureRecordFromObsInfo(firstDataset.obsInfo, self.universe)],
        }
        if firstDataset.obsInfo.visit_id is not None:
            exposure.records["visit_detector_region"] = []
            visitVertices = []
            for file in exposure.files:
                for dataset in file.datasets:
                    if dataset.obsInfo.visit_id != firstDataset.obsInfo.visit_id:
                        raise ValueError(f"Inconsistent visit/exposure relationship for "
                                         f"exposure {firstDataset.obsInfo.exposure_id} between "
                                         f"{file.filename} and {firstFile.filename}: "
                                         f"{dataset.obsInfo.visit_id} != {firstDataset.obsInfo.visit_id}.")
                    if dataset.region is None:
                        self.log.warn("No region found for visit=%s, detector=%s.", dataset.obsInfo.visit_id,
                                      dataset.obsInfo.detector_num)
                        continue
                    visitVertices.extend(dataset.region.getVertices())
                    exposure.records["visit_detector_region"].append(
                        VisitDetectorRegionRecordClass.fromDict({
                            "instrument": dataset.obsInfo.instrument,
                            "visit": dataset.obsInfo.visit_id,
                            "detector": dataset.obsInfo.detector_num,
                            "region": dataset.region,
                        })
                    )
            if visitVertices:
                visitRegion = ConvexPolygon(visitVertices)
            else:
                self.log.warn("No region found for visit=%s.", firstDataset.obsInfo.visit_id)
                visitRegion = None
            exposure.records["visit"] = [
                makeVisitRecordFromObsInfo(firstDataset.obsInfo, self.universe, region=visitRegion)
            ]
        return exposure

    def expandDataIds(self, data: RawExposureData) -> RawExposureData:
        """Expand the data IDs associated with a raw exposure to include
        additional metadata records.

        Parameters
        ----------
        data : `RawExposureData`
            A structure containing information about the exposure to be
            ingested. Must have `RawExposureData.records` populated. Should
            be considered consumed upon return.

        Returns
        -------
        exposure : `RawExposureData`
            An updated version of the input structure, with
            `RawExposureData.dataId` and nested `RawFileDatasetInfo.dataId`
            attributes containing `~lsst.daf.butler.ExpandedDataCoordinate`
            instances.
        """
        hasVisit = "visit" in data.records
        # We start by expanding the exposure-level data ID; we won't use that
        # directly in file ingest, but this lets us do some database lookups
        # once per exposure instead of once per file later.
        data.dataId = self.butler.registry.expandDataId(
            data.dataId,
            # We pass in the records we'll be inserting shortly so they aren't
            # looked up from the database. We do expect instrument and filter
            # records to be retrieved from the database here (though the
            # Registry may cache them so there isn't a lookup every time).
            records={
                "exposure": data.records["exposure"][0],
                "visit": data.records["visit"][0] if hasVisit else None,
            }
        )
        # Now we expand the per-file (exposure+detector) data IDs. This time
        # we pass in the records we just retrieved from the exposure data ID
        # expansion as well as the visit_detector_region record, if there is
        # one.
        vdrRecords = data.records["visit_detector_region"] if hasVisit else itertools.repeat(None)
        for file, vdrRecord in zip(data.files, vdrRecords):
            for dataset in file.datasets:
                dataset.dataId = self.butler.registry.expandDataId(
                    dataset.dataId,
                    records=dict(data.dataId.records, visit_detector_region=vdrRecord)
                )
        return data

    def prep(self, files, pool: Optional[Pool] = None, processes: int = 1) -> Iterator[RawExposureData]:
        """Perform all ingest preprocessing steps that do not involve actually
        modifying the database.

        Parameters
        ----------
        files : iterable over `str` or path-like objects
            Paths to the files to be ingested. Will be made absolute
            if they are not already.
        pool : `multiprocessing.Pool`, optional
            If not `None`, a process pool with which to parallelize some
            operations.
        processes : `int`, optional
            The number of processes to use. Ignored if ``pool`` is not `None`.

        Yields
        ------
        exposure : `RawExposureData`
            Data structures containing dimension records, filenames, and data
            IDs to be ingested (one structure for each exposure).
        """
        if pool is None and processes > 1:
            pool = Pool(processes)
        mapFunc = map if pool is None else pool.imap_unordered

        # Extract metadata and build per-detector regions.
        fileData: Iterator[RawFileData] = mapFunc(self.extractMetadata, files)

        # Use that metadata to group files (and extracted metadata) by
        # exposure. Never parallelized because it's intrinsically a gather
        # step.
        exposureData: List[RawExposureData] = self.groupByExposure(fileData)

        # The next few operations operate on RawExposureData instances (one at
        # a time) in-place and then return the modified instance. We call them
        # as pass-throughs instead of relying on the arguments we pass in to
        # have been modified because in the parallel case those arguments are
        # going to be pickled and unpickled, and I'm not certain
        # multiprocessing is careful enough with that for output arguments to
        # work. We use the same variable names to reflect the fact that we
        # consider the arguments to have been consumed/invalidated.

        # Extract DimensionRecords from the metadata that will need to be
        # inserted into the Registry before the raw datasets themselves are
        # ingested.
        exposureData: Iterator[RawExposureData] = mapFunc(self.collectDimensionRecords, exposureData)

        # Expand the data IDs to include all dimension metadata; we need this
        # because we may need to generate path templates that rely on that
        # metadata.
        # This is the first step that involves actual database calls (but just
        # SELECTs), so if there's going to be a problem with connections vs.
        # multiple processes, or lock contention (in SQLite) slowing things
        # down, it'll happen here.
        return mapFunc(self.expandDataIds, exposureData)

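    # `run` normally drives `prep`, but it can also be called directly to
    # inspect what would be ingested without modifying the database, and a
    # process pool can be supplied for the metadata-extraction step
    # (illustrative sketch):
    #
    #     from multiprocessing import Pool
    #     with Pool(4) as pool:
    #         for exposure in task.prep(filenames, pool=pool):
    #             print(exposure.dataId, len(exposure.files))
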
    def insertDimensionData(self, records: Mapping[str, List[DimensionRecord]]):
        """Insert dimension records for one or more exposures.

        Parameters
        ----------
        records : `dict` mapping `str` to `list`
            Dimension records to be inserted, organized as a mapping from
            dimension name to a list of records for that dimension. This
            may be a single `RawExposureData.records` dict, or an aggregate
            for multiple exposures created by concatenating the value lists
            of those dictionaries.
        """
        # TODO: This currently assumes that either duplicate inserts of
        # visit records are ignored, or there is exactly one visit per
        # exposure. I expect us to switch up the visit-exposure
        # relationship and hence rewrite some of this code before that
        # becomes a practical problem.
        # Iterate over dimensions explicitly to order for foreign key
        # relationships.
        for dimension in ("visit", "exposure", "visit_detector_region"):
            recordsForDimension = records.get(dimension)
            if recordsForDimension:
                # TODO: once Registry has options to ignore or replace
                # existing dimension records with the same primary keys
                # instead of aborting on conflicts, add configuration
                # options and logic to use them.
                self.butler.registry.insertDimensionData(dimension, *recordsForDimension)

    def ingestExposureDatasets(self, exposure: RawExposureData, butler: Optional[Butler] = None
                               ) -> List[DatasetRef]:
        """Ingest all raw files in one exposure.

        Parameters
        ----------
        exposure : `RawExposureData`
            A structure containing information about the exposure to be
            ingested. Must have `RawExposureData.records` populated and all
            data ID attributes expanded.
        butler : `lsst.daf.butler.Butler`, optional
            Butler to use for ingest. If not provided, ``self.butler`` will
            be used.

        Returns
        -------
        refs : `list` of `lsst.daf.butler.DatasetRef`
            Dataset references for ingested raws.
        """
        if butler is None:
            butler = self.butler
        datasets = [FileDataset(path=os.path.abspath(file.filename),
                                refs=[DatasetRef(self.datasetType, d.dataId) for d in file.datasets],
                                formatter=file.FormatterClass)
                    for file in exposure.files]
        butler.ingest(*datasets, transfer=self.config.transfer)
        return [ref for dataset in datasets for ref in dataset.refs]

    def run(self, files, pool: Optional[Pool] = None, processes: int = 1):
        """Ingest files into a Butler data repository.

        This creates any new exposure or visit Dimension entries needed to
        identify the ingested files, creates new Dataset entries in the
        Registry and finally ingests the files themselves into the Datastore.
        Any needed instrument, detector, and physical_filter Dimension entries
        must exist in the Registry before `run` is called.

        Parameters
        ----------
        files : iterable over `str` or path-like objects
            Paths to the files to be ingested. Will be made absolute
            if they are not already.
        pool : `multiprocessing.Pool`, optional
            If not `None`, a process pool with which to parallelize some
            operations.
        processes : `int`, optional
            The number of processes to use. Ignored if ``pool`` is not `None`.

        Returns
        -------
        refs : `list` of `lsst.daf.butler.DatasetRef`
            Dataset references for ingested raws.

        Notes
        -----
        This method inserts all records (dimensions and datasets) for an
        exposure within a transaction, guaranteeing that partial exposures
        are never ingested.
        """
        exposureData = self.prep(files, pool=pool, processes=processes)
        # Up to this point, we haven't modified the data repository at all.
        # Now we finally do that, with one transaction per exposure. This is
        # not parallelized at present because the performance of this step is
        # limited by the database server. That may or may not change in the
        # future once we increase our usage of bulk inserts and reduce our
        # usage of savepoints; we've tried to get everything but the database
        # operations done in advance to reduce the time spent inside
        # transactions.
        self.butler.registry.registerDatasetType(self.datasetType)
        refs = []
        for exposure in exposureData:
            with self.butler.transaction():
                self.insertDimensionData(exposure.records)
                refs.extend(self.ingestExposureDatasets(exposure))
        return refs
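

# The dimension records that `run` requires to pre-exist are normally inserted
# once per repository by the corresponding `Instrument` class (illustrative
# sketch; the `register` call belongs to the obs_base `Instrument` interface
# rather than this file):
#
#     from lsst.utils import doImport
#     instrument = doImport(config.instrument)()
#     instrument.register(butler.registry)
#     RawIngestTask(config=config, butler=butler).run(filenames)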