lsst.obs.base  19.0.0-29-g0c92743
ingest.py
1 # This file is part of obs_base.
2 #
3 # Developed for the LSST Data Management System.
4 # This product includes software developed by the LSST Project
5 # (https://www.lsst.org).
6 # See the COPYRIGHT file at the top-level directory of this distribution
7 # for details of code ownership.
8 #
9 # This program is free software: you can redistribute it and/or modify
10 # it under the terms of the GNU General Public License as published by
11 # the Free Software Foundation, either version 3 of the License, or
12 # (at your option) any later version.
13 #
14 # This program is distributed in the hope that it will be useful,
15 # but WITHOUT ANY WARRANTY; without even the implied warranty of
16 # MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
17 # GNU General Public License for more details.
18 #
19 # You should have received a copy of the GNU General Public License
20 # along with this program. If not, see <http://www.gnu.org/licenses/>.
21 
22 
23 __all__ = ("RawIngestTask", "RawIngestConfig", "makeTransferChoiceField")
24 
25 import os.path
26 import itertools
27 from dataclasses import dataclass
28 from typing import List, Dict, Iterator, Iterable, Type, Optional, Any, Mapping
29 from collections import defaultdict
30 from multiprocessing import Pool
31 
32 from astro_metadata_translator import ObservationInfo, fix_header, merge_headers
33 from lsst.utils import doImport
34 from lsst.afw.fits import readMetadata
35 from lsst.daf.butler import (
36  Butler,
37  DataCoordinate,
38  DatasetRef,
39  DatasetType,
40  DimensionRecord,
41  FileDataset,
42 )
43 from lsst.obs.base.instrument import makeExposureRecordFromObsInfo, makeVisitRecordFromObsInfo
44 from lsst.geom import Box2D
45 from lsst.pex.config import Config, Field, ChoiceField
46 from lsst.pipe.base import Task
47 from lsst.sphgeom import ConvexPolygon
48 
49 from .fitsRawFormatterBase import FitsRawFormatterBase
50 
51 
52 @dataclass
53 class RawFileDatasetInfo:
54  """Structure that holds information about a single dataset within a
55  raw file.
56  """
57 
58  dataId: DataCoordinate
59  """Data ID for this dataset (`lsst.daf.butler.DataCoordinate`).
60 
61  This may be a minimal `~lsst.daf.butler.DataCoordinate` base instance, or
62  a complete `~lsst.daf.butler.ExpandedDataCoordinate`.
63  """
64 
65  obsInfo: ObservationInfo
66  """Standardized observation metadata extracted directly from the file
67  headers (`astro_metadata_translator.ObservationInfo`).
68  """
69 
70  region: ConvexPolygon
71  """Region on the sky covered by this file, possibly with padding
72  (`lsst.sphgeom.ConvexPolygon`).
73  """
74 
75 
76 @dataclass
77 class RawFileData:
78  """Structure that holds information about a single raw file, used during
79  ingest.
80  """
81 
82  datasets: List[RawFileDatasetInfo]
83  """The information describing each dataset within this raw file.
84  (`list` of `RawFileDatasetInfo`)
85  """
86 
87  filename: str
88  """Name of the file this information was extracted from (`str`).
89 
90  This is the path prior to ingest, not the path after ingest.
91  """
92 
93  FormatterClass: Type[FitsRawFormatterBase]
94  """Formatter class that should be used to ingest this file and compute
95  a spatial region for it (`type`; as subclass of `FitsRawFormatterBase`).
96  """
97 
98 
99 @dataclass
100 class RawExposureData:
101  """Structure that holds information about a complete raw exposure, used
102  during ingest.
103  """
104 
105  dataId: DataCoordinate
106  """Data ID for this exposure (`lsst.daf.butler.DataCoordinate`).
107 
108  This may be a minimal `~lsst.daf.butler.DataCoordinate` base instance, or
109  a complete `~lsst.daf.butler.ExpandedDataCoordinate`.
110  """
111 
112  files: List[RawFileData]
113  """List of structures containing file-level information.
114  """
115 
116  records: Optional[Dict[str, List[DimensionRecord]]] = None
117  """Dictionary containing `DimensionRecord` instances that must be inserted
118  into the `~lsst.daf.butler.Registry` prior to file-level ingest (`dict`).
119 
120  Keys are the names of dimension elements ("exposure" and optionally "visit"
121  and "visit_detector_region"), while values are lists of `DimensionRecord`.
122 
123  May be `None` during some ingest steps.
124  """
125 
126 
127 def makeTransferChoiceField(doc="How to transfer files (None for no transfer).", default=None):
128  """Create a Config field with options for how to transfer files between
129  data repositories.
130 
131  The allowed options for the field are exactly those supported by
132  `lsst.daf.butler.Datastore.ingest`.
133 
134  Parameters
135  ----------
136  doc : `str`
137  Documentation for the configuration field.
138 
139  Returns
140  -------
141  field : `lsst.pex.config.ChoiceField`
142  Configuration field.
143  """
144  return ChoiceField(
145  doc=doc,
146  dtype=str,
147  allowed={"move": "move",
148  "copy": "copy",
149  "hardlink": "hard link",
150  "symlink": "symbolic (soft) link"},
151  optional=True,
152  default=default
153  )
154 
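# A minimal usage sketch: configs that need a transfer option can declare it
# with this helper, as `RawIngestConfig` does below (``MyIngestConfig`` here
# is hypothetical); the accepted values are exactly the allowed choices
# listed above, plus `None` for no transfer.
#
#     from lsst.pex.config import Config
#
#     class MyIngestConfig(Config):
#         transfer = makeTransferChoiceField(default="symlink")
#
#     config = MyIngestConfig()
#     config.transfer = "copy"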
155 
156 class RawIngestConfig(Config):
157  transfer = makeTransferChoiceField()
158  padRegionAmount = Field(
159  dtype=int,
160  default=0,
161  doc="Pad the image bounding box by the specified number of pixels before calculating its sky region"
162  )
163  instrument = Field(
164  doc=("Fully-qualified Python name of the `Instrument` subclass to "
165  "associate with all raws."),
166  dtype=str,
167  optional=False,
168  default=None,
169  )
170 
171 
172 class RawIngestTask(Task):
173  """Driver Task for ingesting raw data into Gen3 Butler repositories.
174 
175  This Task is intended to be runnable from the command-line, but it doesn't
176  meet the other requirements of CmdLineTask or PipelineTask, and wouldn't
177  gain much from being one. It also wouldn't really be appropriate as a
178  subtask of a CmdLineTask or PipelineTask; it's a Task essentially just to
179  leverage the logging and configurability functionality that Task provides.
180 
181  Each instance of `RawIngestTask` writes to the same Butler. Each
182  invocation of `RawIngestTask.run` ingests a list of files.
183 
184  Parameters
185  ----------
186  config : `RawIngestConfig`
187  Configuration for the task.
188  butler : `~lsst.daf.butler.Butler`
189  Butler instance. Ingested Datasets will be created as part of
190  ``butler.run`` and associated with its Collection.
191  kwds
192  Additional keyword arguments are forwarded to the `lsst.pipe.base.Task`
193  constructor.
196  """
197 
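 # A minimal usage sketch, assuming an existing Gen3 repository whose Registry
 # already holds the needed instrument, detector, and physical_filter records
 # (see `run`); the repository path, run name, instrument class name, and file
 # list are hypothetical.
 #
 #     from lsst.daf.butler import Butler
 #     from lsst.obs.base.ingest import RawIngestConfig, RawIngestTask
 #
 #     butler = Butler("/path/to/repo", run="raw")
 #     config = RawIngestConfig()
 #     config.instrument = "lsst.obs.example.ExampleInstrument"
 #     config.transfer = "symlink"
 #     task = RawIngestTask(config=config, butler=butler)
 #     refs = task.run(["exp1.fits", "exp2.fits"])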
198  ConfigClass = RawIngestConfig
199 
200  _DefaultName = "ingest"
201 
202  def getDatasetType(self):
203  """Return the DatasetType of the Datasets ingested by this Task.
204  """
205  return DatasetType("raw", ("instrument", "detector", "exposure"), "Exposure",
206  universe=self.butler.registry.dimensions)
207 
208  def __init__(self, config: Optional[RawIngestConfig] = None, *, butler: Butler, **kwds: Any):
209  config.validate() # Not a CmdLineTask nor PipelineTask, so have to validate the config here.
210  super().__init__(config, **kwds)
211  self.butler = butler
212  self.universe = self.butler.registry.dimensions
213  self.instrument = doImport(self.config.instrument)()
214  # For now, we get a nominal Camera from the Instrument.
215  # In the future, we may want to load one from a Butler calibration
216  # collection that's appropriate for the observation timestamp of
217  # the exposure.
218  self.camera = self.instrument.getCamera()
219  self.datasetType = self.getDatasetType()
220 
221  def extractMetadata(self, filename: str) -> RawFileData:
222  """Extract and process metadata from a single raw file.
223 
224  Parameters
225  ----------
226  filename : `str`
227  Path to the file.
228 
229  Returns
230  -------
231  data : `RawFileData`
232  A structure containing the metadata extracted from the file,
233  as well as the original filename. All fields will be populated,
234  but the `RawFileData.dataId` attribute will be a minimal
235  (unexpanded) `DataCoordinate` instance.
236 
237  Notes
238  -----
239  Assumes that there is a single dataset associated with the given
240  file. Instruments using a single file to store multiple datasets
241  must implement their own version of this method.
242  """
243  # Manually merge the primary and "first data" headers here because we
244  # do not know in general if an input file has set INHERIT=T.
245  phdu = readMetadata(filename, 0)
246  header = merge_headers([phdu, readMetadata(filename)], mode="overwrite")
247  fix_header(header)
248  datasets = [self._calculate_dataset_info(header, filename)]
249 
250  # The data model currently assumes that whilst multiple datasets
251  # can be associated with a single file, they must all share the
252  # same formatter.
253  FormatterClass = self.instrument.getRawFormatter(datasets[0].dataId)
254 
255  return RawFileData(datasets=datasets, filename=filename,
256  FormatterClass=FormatterClass)
257 
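 # A sketch of running this step on its own, given a task constructed as in
 # the example near the top of the class (the filename is hypothetical):
 #
 #     fileData = task.extractMetadata("/data/raw/exp1.fits")
 #     for dataset in fileData.datasets:
 #         print(dataset.dataId, dataset.obsInfo.observation_id)
 #
 # At this stage each dataId is a minimal, unexpanded `DataCoordinate`.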
258  def _calculate_dataset_info(self, header, filename):
259  """Calculate a RawFileDatasetInfo from the supplied information.
260 
261  Parameters
262  ----------
263  header : `Mapping`
264  Header from the dataset.
265  filename : `str`
266  Filename to use for error messages.
267 
268  Returns
269  -------
270  dataset : `RawFileDatasetInfo`
271  The region, dataId, and observation information associated with
272  this dataset.
273  """
274  obsInfo = ObservationInfo(header)
275  dataId = DataCoordinate.standardize(instrument=obsInfo.instrument,
276  exposure=obsInfo.exposure_id,
277  detector=obsInfo.detector_num,
278  universe=self.universe)
279  if obsInfo.instrument != self.instrument.getName():
280  raise ValueError(f"Incorrect instrument (expected {self.instrument.getName()}, "
281  f"got {obsInfo.instrument}) for file {filename}.")
282 
283  FormatterClass = self.instrument.getRawFormatter(dataId)
284  region = self._calculate_region_from_dataset_metadata(obsInfo, header, FormatterClass)
285  return RawFileDatasetInfo(obsInfo=obsInfo, region=region, dataId=dataId)
286 
287  def _calculate_region_from_dataset_metadata(self, obsInfo, header, FormatterClass):
288  """Calculate the sky region covered by the supplied observation
289  information.
290 
291  Parameters
292  ----------
293  obsInfo : `~astro_metadata_translator.ObservationInfo`
294  Summary information of this dataset.
295  header : `Mapping`
296  Header from the dataset.
297  FormatterClass : `type`, a subclass of `FitsRawFormatterBase`
298  Formatter class that should be used to compute the spatial region.
299 
300  Returns
301  -------
302  region : `lsst.sphgeom.ConvexPolygon`
303  Region of sky covered by this observation.
304  """
305  if obsInfo.visit_id is not None and obsInfo.tracking_radec is not None:
306  formatter = FormatterClass.fromMetadata(metadata=header, obsInfo=obsInfo)
307  visitInfo = formatter.makeVisitInfo()
308  detector = self.camera[obsInfo.detector_num]
309  wcs = formatter.makeWcs(visitInfo, detector)
310  pixBox = Box2D(detector.getBBox())
311  if self.config.padRegionAmount > 0:
312  pixBox.grow(self.config.padRegionAmount)
313  pixCorners = pixBox.getCorners()
314  sphCorners = [wcs.pixelToSky(point).getVector() for point in pixCorners]
315  region = ConvexPolygon(sphCorners)
316  else:
317  region = None
318  return region
319 
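 # Note on padding: `Box2D.grow` expands the box by the given amount on every
 # side, so a 2048x4096 pixel detector bounding box with padRegionAmount=100
 # becomes 2248x4296 pixels before its corners are mapped to the sky. A sketch
 # of the equivalent standalone calculation, assuming ``detector`` and ``wcs``
 # are already in hand:
 #
 #     pixBox = Box2D(detector.getBBox())
 #     pixBox.grow(100)
 #     region = ConvexPolygon([wcs.pixelToSky(p).getVector()
 #                             for p in pixBox.getCorners()])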
320  def groupByExposure(self, files: Iterable[RawFileData]) -> List[RawExposureData]:
321  """Group an iterable of `RawFileData` by exposure.
322 
323  Parameters
324  ----------
325  files : iterable of `RawFileData`
326  File-level information to group.
327 
328  Returns
329  -------
330  exposures : `list` of `RawExposureData`
331  A list of structures that group the file-level information by
332  exposure. The `RawExposureData.records` attributes of elements
333  will be `None`, but all other fields will be populated. The
334  `RawExposureData.dataId` attributes will be minimal (unexpanded)
335  `DataCoordinate` instances.
336  """
337  exposureDimensions = self.universe["exposure"].graph
338  byExposure = defaultdict(list)
339  for f in files:
340  # Assume that the first dataset is representative of the file
341  byExposure[f.datasets[0].dataId.subset(exposureDimensions)].append(f)
342 
343  return [RawExposureData(dataId=dataId, files=exposureFiles)
344  for dataId, exposureFiles in byExposure.items()]
345 
346  def collectDimensionRecords(self, exposure: RawExposureData) -> RawExposureData:
347  """Collect the `DimensionRecord` instances that must be inserted into
348  the `~lsst.daf.butler.Registry` before an exposure's raw files may be ingested.
349 
350  Parameters
351  ----------
352  exposure : `RawExposureData`
353  A structure containing information about the exposure to be
354  ingested. Should be considered consumed upon return.
355 
356  Returns
357  -------
358  exposure : `RawExposureData`
359  An updated version of the input structure, with
360  `RawExposureData.records` populated.
361  """
362  firstFile = exposure.files[0]
363  firstDataset = firstFile.datasets[0]
364  VisitDetectorRegionRecordClass = self.universe["visit_detector_region"].RecordClass
365  exposure.records = {
366  "exposure": [makeExposureRecordFromObsInfo(firstDataset.obsInfo, self.universe)],
367  }
368  if firstDataset.obsInfo.visit_id is not None:
369  exposure.records["visit_detector_region"] = []
370  visitVertices = []
371  for file in exposure.files:
372  for dataset in file.datasets:
373  if dataset.obsInfo.visit_id != firstDataset.obsInfo.visit_id:
374  raise ValueError(f"Inconsistent visit/exposure relationship for "
375  f"exposure {firstDataset.obsInfo.exposure_id} between "
376  f"{file.filename} and {firstFile.filename}: "
377  f"{dataset.obsInfo.visit_id} != {firstDataset.obsInfo.visit_id}.")
378  if dataset.region is None:
379  self.log.warn("No region found for visit=%s, detector=%s.", dataset.obsInfo.visit_id,
380  dataset.obsInfo.detector_num)
381  continue
382  visitVertices.extend(dataset.region.getVertices())
383  exposure.records["visit_detector_region"].append(
384  VisitDetectorRegionRecordClass.fromDict({
385  "instrument": dataset.obsInfo.instrument,
386  "visit": dataset.obsInfo.visit_id,
387  "detector": dataset.obsInfo.detector_num,
388  "region": dataset.region,
389  })
390  )
391  if visitVertices:
392  visitRegion = ConvexPolygon(visitVertices)
393  else:
394  self.log.warn("No region found for visit=%s.", firstDataset.obsInfo.visit_id)
395  visitRegion = None
396  exposure.records["visit"] = [
397  makeVisitRecordFromObsInfo(firstDataset.obsInfo, self.universe, region=visitRegion)
398  ]
399  return exposure
400 
401  def expandDataIds(self, data: RawExposureData) -> RawExposureData:
402  """Expand the data IDs associated with a raw exposure to include
403  additional metadata records.
404 
405  Parameters
406  ----------
407  exposure : `RawExposureData`
408  A structure containing information about the exposure to be
409  ingested. Must have `RawExposureData.records` populated. Should
410  be considered consumed upon return.
411 
412  Returns
413  -------
414  exposure : `RawExposureData`
415  An updated version of the input structure, with
416  `RawExposureData.dataId` and nested `RawFileData.dataId` attributes
417  containing `~lsst.daf.butler.ExpandedDataCoordinate` instances.
418  """
419  hasVisit = "visit" in data.records
420  # We start by expanding the exposure-level data ID; we won't use that
421  # directly in file ingest, but this lets us do some database lookups
422  # once per exposure instead of once per file later.
423  data.dataId = self.butler.registry.expandDataId(
424  data.dataId,
425  # We pass in the records we'll be inserting shortly so they aren't
426  # looked up from the database. We do expect instrument and filter
427  # records to be retrieved from the database here (though the
428  # Registry may cache them so there isn't a lookup every time).
429  records={
430  "exposure": data.records["exposure"][0],
431  "visit": data.records["visit"][0] if hasVisit else None,
432  }
433  )
434  # Now we expand the per-file (exposure+detector) data IDs. This time
435  # we pass in the records we just retrieved from the exposure data ID
436  # expansion as well as the visit_detector_region record, if there is
437  # one.
438  vdrRecords = data.records["visit_detector_region"] if hasVisit else itertools.repeat(None)
439  for file, vdrRecord in zip(data.files, vdrRecords):
440  for dataset in file.datasets:
441  dataset.dataId = self.butler.registry.expandDataId(
442  dataset.dataId,
443  records=dict(data.dataId.records, visit_detector_region=vdrRecord)
444  )
445  return data
446 
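 # After expansion, each per-dataset data ID carries its related dimension
 # records; a sketch of what that makes available downstream (names as above):
 #
 #     data = task.expandDataIds(data)
 #     dataId = data.files[0].datasets[0].dataId
 #     exposureRecord = dataId.records["exposure"]  # a `DimensionRecord`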
447  def prep(self, files, pool: Optional[Pool] = None, processes: int = 1) -> Iterator[RawExposureData]:
448  """Perform all ingest preprocessing steps that do not involve actually
449  modifying the database.
450 
451  Parameters
452  ----------
453  files : iterable over `str` or path-like objects
454  Paths to the files to be ingested. Will be made absolute
455  if they are not already.
456  pool : `multiprocessing.Pool`, optional
457  If not `None`, a process pool with which to parallelize some
458  operations.
459  processes : `int`, optional
460  The number of processes to use. Ignored if ``pool`` is not `None`.
461 
462  Yields
463  ------
464  exposure : `RawExposureData`
465  Data structures containing dimension records, filenames, and data
466  IDs to be ingested (one structure for each exposure).
467  """
468  if pool is None and processes > 1:
469  pool = Pool(processes)
470  mapFunc = map if pool is None else pool.imap_unordered
471 
472  # Extract metadata and build per-detector regions.
473  fileData: Iterator[RawFileData] = mapFunc(self.extractMetadata, files)
474 
475  # Use that metadata to group files (and extracted metadata) by
476  # exposure. Never parallelized because it's intrinsically a gather
477  # step.
478  exposureData: List[RawExposureData] = self.groupByExposure(fileData)
479 
480  # The next few operations operate on RawExposureData instances (one at
481  # a time) in-place and then return the modified instance. We call them
482  # as pass-throughs instead of relying on the arguments we pass in to
483  # have been modified because in the parallel case those arguments are
484  # going to be pickled and unpickled, and I'm not certain
485  # multiprocessing is careful enough with that for output arguments to
486  # work. We use the same variable names to reflect the fact that we
487  # consider the arguments to have been consumed/invalidated.
488 
489  # Extract DimensionRecords from the metadata that will need to be
490  # inserted into the Registry before the raw datasets themselves are
491  # ingested.
492  exposureData: Iterator[RawExposureData] = mapFunc(self.collectDimensionRecords, exposureData)
493 
494  # Expand the data IDs to include all dimension metadata; we need this
495  # because we may need to generate path templates that rely on that
496  # metadata.
497  # This is the first step that involves actual database calls (but just
498  # SELECTs), so if there's going to be a problem with connections vs.
499  # multiple processes, or lock contention (in SQLite) slowing things
500  # down, it'll happen here.
501  return mapFunc(self.expandDataIds, exposureData)
502 
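 # A sketch of driving the preprocessing with an explicit pool (``files`` is
 # hypothetical); passing ``processes=4`` instead would have the same effect:
 #
 #     from multiprocessing import Pool
 #
 #     with Pool(4) as pool:
 #         for exposure in task.prep(files, pool=pool):
 #             print(exposure.dataId, len(exposure.files))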
503  def insertDimensionData(self, records: Mapping[str, List[DimensionRecord]]):
504  """Insert dimension records for one or more exposures.
505 
506  Parameters
507  ----------
508  records : `dict` mapping `str` to `list`
509  Dimension records to be inserted, organized as a mapping from
510  dimension name to a list of records for that dimension. This
511  may be a single `RawExposureData.records` dict, or an aggregate
512  for multiple exposures created by concatenating the value lists
513  of those dictionaries.
519  """
520  # TODO: This currently assumes that either duplicate inserts of
521  # visit records are ignored, or there is exactly one visit per
522  # exposure. I expect us to switch up the visit-exposure
523  # relationship and hence rewrite some of this code before that
524  # becomes a practical problem.
525  # Iterate over dimensions explicitly to order for foreign key
526  # relationships.
527  for dimension in ("visit", "exposure", "visit_detector_region"):
528  recordsForDimension = records.get(dimension)
529  if recordsForDimension:
530  # TODO: once Registry has options to ignore or replace
531  # existing dimension records with the same primary keys
532  # instead of aborting on conflicts, add configuration
533  # options and logic to use them.
534  self.butler.registry.insertDimensionData(dimension, *recordsForDimension)
535 
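 # A sketch of building the aggregate form described above from several
 # prepared exposures (``exposures`` is assumed to be a list of
 # `RawExposureData` with ``records`` populated):
 #
 #     from collections import defaultdict
 #
 #     aggregate = defaultdict(list)
 #     for exposure in exposures:
 #         for dimension, recordList in exposure.records.items():
 #             aggregate[dimension].extend(recordList)
 #     task.insertDimensionData(aggregate)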
536  def ingestExposureDatasets(self, exposure: RawExposureData, butler: Optional[Butler] = None
537  ) -> List[DatasetRef]:
538  """Ingest all raw files in one exposure.
539 
540  Parameters
541  ----------
542  exposure : `RawExposureData`
543  A structure containing information about the exposure to be
544  ingested. Must have `RawExposureData.records` populated and all
545  data ID attributes expanded.
546  butler : `lsst.daf.butler.Butler`, optional
547  Butler to use for ingest. If not provided, ``self.butler`` will
548  be used.
549 
550  Returns
551  -------
552  refs : `list` of `lsst.daf.butler.DatasetRef`
553  Dataset references for ingested raws.
554  """
555  if butler is None:
556  butler = self.butler
557  datasets = [FileDataset(path=os.path.abspath(file.filename),
558  refs=[DatasetRef(self.datasetType, d.dataId) for d in file.datasets],
559  formatter=file.FormatterClass)
560  for file in exposure.files]
561  butler.ingest(*datasets, transfer=self.config.transfer)
562  return [ref for dataset in datasets for ref in dataset.refs]
563 
564  def run(self, files, pool: Optional[Pool] = None, processes: int = 1):
565  """Ingest files into a Butler data repository.
566 
567  This creates any new exposure or visit Dimension entries needed to
568  identify the ingested files, creates new Dataset entries in the
569  Registry and finally ingests the files themselves into the Datastore.
570  Any needed instrument, detector, and physical_filter Dimension entries
571  must exist in the Registry before `run` is called.
572 
573  Parameters
574  ----------
575  files : iterable over `str` or path-like objects
576  Paths to the files to be ingested. Will be made absolute
577  if they are not already.
578  pool : `multiprocessing.Pool`, optional
579  If not `None`, a process pool with which to parallelize some
580  operations.
581  processes : `int`, optional
582  The number of processes to use. Ignored if ``pool`` is not `None`.
583 
584  Returns
585  -------
586  refs : `list` of `lsst.daf.butler.DatasetRef`
587  Dataset references for ingested raws.
588 
589  Notes
590  -----
591  This method inserts all records (dimensions and datasets) for an
592  exposure within a transaction, guaranteeing that partial exposures
593  are never ingested.
594  """
595  exposureData = self.prep(files, pool=pool, processes=processes)
596  # Up to this point, we haven't modified the data repository at all.
597  # Now we finally do that, with one transaction per exposure. This is
598  # not parallelized at present because the performance of this step is
599  # limited by the database server. That may or may not change in the
600  # future once we increase our usage of bulk inserts and reduce our
601  # usage of savepoints; we've tried to get everything but the database
602  # operations done in advance to reduce the time spent inside
603  # transactions.
604  self.butler.registry.registerDatasetType(self.datasetType)
605  refs = []
606  for exposure in exposureData:
607  with self.butler.transaction():
608  self.insertDimensionData(exposure.records)
609  refs.extend(self.ingestExposureDatasets(exposure))
610  return refs