lsst.obs.base  19.0.0-18-g955d782+3
ingest.py
# This file is part of obs_base.
#
# Developed for the LSST Data Management System.
# This product includes software developed by the LSST Project
# (https://www.lsst.org).
# See the COPYRIGHT file at the top-level directory of this distribution
# for details of code ownership.
#
# This program is free software: you can redistribute it and/or modify
# it under the terms of the GNU General Public License as published by
# the Free Software Foundation, either version 3 of the License, or
# (at your option) any later version.
#
# This program is distributed in the hope that it will be useful,
# but WITHOUT ANY WARRANTY; without even the implied warranty of
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
# GNU General Public License for more details.
#
# You should have received a copy of the GNU General Public License
# along with this program. If not, see <http://www.gnu.org/licenses/>.


__all__ = ("RawIngestTask", "RawIngestConfig", "makeTransferChoiceField")

import os.path
import itertools
from dataclasses import dataclass
from typing import List, Dict, Iterator, Iterable, Type, Optional, Any, Mapping
from collections import defaultdict
from multiprocessing import Pool

from astro_metadata_translator import ObservationInfo, fix_header, merge_headers
from lsst.utils import doImport
from lsst.afw.fits import readMetadata
from lsst.daf.butler import (
    Butler,
    DataCoordinate,
    DatasetRef,
    DatasetType,
    DimensionRecord,
    FileDataset,
)
from lsst.obs.base.instrument import makeExposureRecordFromObsInfo, makeVisitRecordFromObsInfo
from lsst.geom import Box2D
from lsst.pex.config import Config, Field, ChoiceField
from lsst.pipe.base import Task
from lsst.sphgeom import ConvexPolygon

from .fitsRawFormatterBase import FitsRawFormatterBase


@dataclass
class RawFileDatasetInfo:
    """Structure that holds information about a single dataset within a
    raw file.
    """

    dataId: DataCoordinate
    """Data ID for this file (`lsst.daf.butler.DataCoordinate`).

    This may be a minimal `~lsst.daf.butler.DataCoordinate` base instance, or
    a complete `~lsst.daf.butler.ExpandedDataCoordinate`.
    """

    obsInfo: ObservationInfo
    """Standardized observation metadata extracted directly from the file
    headers (`astro_metadata_translator.ObservationInfo`).
    """

    region: ConvexPolygon
    """Region on the sky covered by this file, possibly with padding
    (`lsst.sphgeom.ConvexPolygon`).
    """


@dataclass
class RawFileData:
    """Structure that holds information about a single raw file, used during
    ingest.
    """

    datasets: List[RawFileDatasetInfo]
    """The information describing each dataset within this raw file
    (`list` of `RawFileDatasetInfo`).
    """

    filename: str
    """Name of the file this information was extracted from (`str`).

    This is the path prior to ingest, not the path after ingest.
    """

    FormatterClass: Type[FitsRawFormatterBase]
    """Formatter class that should be used to ingest this file and compute
    a spatial region for it (`type`; a subclass of `FitsRawFormatterBase`).
    """


@dataclass
class RawExposureData:
    """Structure that holds information about a complete raw exposure, used
    during ingest.
    """

    dataId: DataCoordinate
    """Data ID for this exposure (`lsst.daf.butler.DataCoordinate`).

    This may be a minimal `~lsst.daf.butler.DataCoordinate` base instance, or
    a complete `~lsst.daf.butler.ExpandedDataCoordinate`.
    """

    files: List[RawFileData]
    """List of structures containing file-level information.
    """

    records: Optional[Dict[str, List[DimensionRecord]]] = None
    """Dictionary containing `DimensionRecord` instances that must be inserted
    into the `~lsst.daf.butler.Registry` prior to file-level ingest (`dict`).

    Keys are the names of dimension elements ("exposure" and optionally
    "visit" and "visit_detector_region"), while values are lists of
    `DimensionRecord`.

    May be `None` during some ingest steps.
    """


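# A sketch of the shape ``RawExposureData.records`` takes once populated by
# ``RawIngestTask.collectDimensionRecords`` (the values shown are placeholders,
# not real `DimensionRecord` instances):
#
#     records = {
#         "exposure": [exposureRecord],
#         "visit": [visitRecord],                     # only when a visit_id is present
#         "visit_detector_region": [vdrRecord, ...],  # one per detector/file
#     }

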
def makeTransferChoiceField(doc="How to transfer files (None for no transfer).", default=None):
    """Create a Config field with options for how to transfer files between
    data repositories.

    The allowed options for the field are exactly those supported by
    `lsst.daf.butler.Datastore.ingest`.

    Parameters
    ----------
    doc : `str`
        Documentation for the configuration field.
    default : `str`, optional
        Default transfer mode for the field.

    Returns
    -------
    field : `lsst.pex.config.ChoiceField`
        Configuration field.
    """
    return ChoiceField(
        doc=doc,
        dtype=str,
        allowed={"move": "move",
                 "copy": "copy",
                 "hardlink": "hard link",
                 "symlink": "symbolic (soft) link"},
        optional=True,
        default=default
    )

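# A sketch of declaring a transfer field with a non-default transfer mode in
# another Config class (``MyIngestConfig`` and the chosen default are
# illustrative, not part of this module):
#
#     class MyIngestConfig(Config):
#         transfer = makeTransferChoiceField(default="copy")
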

class RawIngestConfig(Config):
    transfer = makeTransferChoiceField()
    padRegionAmount = Field(
        dtype=int,
        default=0,
        doc="Pad an image with specified number of pixels before calculating region"
    )
    instrument = Field(
        doc=("Fully-qualified Python name of the `Instrument` subclass to "
             "associate with all raws."),
        dtype=str,
        optional=False,
        default=None,
    )


class RawIngestTask(Task):
    """Driver Task for ingesting raw data into Gen3 Butler repositories.

    This Task is intended to be runnable from the command-line, but it doesn't
    meet the other requirements of CmdLineTask or PipelineTask, and wouldn't
    gain much from being one. It also wouldn't really be appropriate as a
    subtask of a CmdLineTask or PipelineTask; it's a Task essentially just to
    leverage the logging and configurability functionality that Task provides.

    Each instance of `RawIngestTask` writes to the same Butler. Each
    invocation of `RawIngestTask.run` ingests a list of files.

    Parameters
    ----------
    config : `RawIngestConfig`
        Configuration for the task.
    butler : `~lsst.daf.butler.Butler`
        Butler instance. Ingested Datasets will be created as part of
        ``butler.run`` and associated with its Collection.
    kwds
        Additional keyword arguments are forwarded to the `lsst.pipe.base.Task`
        constructor.
    """

    ConfigClass = RawIngestConfig

    _DefaultName = "ingest"

    def getDatasetType(self):
        """Return the DatasetType of the Datasets ingested by this Task.
        """
        return DatasetType("raw", ("instrument", "detector", "exposure"), "Exposure",
                           universe=self.butler.registry.dimensions)

    def __init__(self, config: Optional[RawIngestConfig] = None, *, butler: Butler, **kwds: Any):
        super().__init__(config, **kwds)
        self.butler = butler
        self.universe = self.butler.registry.dimensions
        self.instrument = doImport(self.config.instrument)()
        # For now, we get a nominal Camera from the Instrument.
        # In the future, we may want to load one from a Butler calibration
        # collection that's appropriate for the observation timestamp of
        # the exposure.
        self.camera = self.instrument.getCamera()
        self.datasetType = self.getDatasetType()

    def extractMetadata(self, filename: str) -> RawFileData:
        """Extract and process metadata from a single raw file.

        Parameters
        ----------
        filename : `str`
            Path to the file.

        Returns
        -------
        data : `RawFileData`
            A structure containing the metadata extracted from the file,
            as well as the original filename. All fields will be populated,
            but the `RawFileData.dataId` attribute will be a minimal
            (unexpanded) `DataCoordinate` instance.

        Notes
        -----
        Assumes that there is a single dataset associated with the given
        file. Instruments using a single file to store multiple datasets
        must implement their own version of this method.
        """
        phdu = readMetadata(filename, 0)
        header = merge_headers([phdu, readMetadata(filename)], mode="overwrite")
        fix_header(header)
        datasets = [self._calculate_dataset_info(header, filename)]

        # The data model currently assumes that whilst multiple datasets
        # can be associated with a single file, they must all share the
        # same formatter.
        FormatterClass = self.instrument.getRawFormatter(datasets[0].dataId)

        return RawFileData(datasets=datasets, filename=filename,
                           FormatterClass=FormatterClass)

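    # A sketch of how an instrument whose files hold multiple datasets might
    # override ``extractMetadata``, per the Notes above (the per-HDU loop and
    # ``numberOfDetectorHDUs`` are illustrative; the real override depends on
    # the instrument's file layout):
    #
    #     def extractMetadata(self, filename):
    #         datasets = []
    #         for hdu in range(1, numberOfDetectorHDUs + 1):
    #             header = readMetadata(filename, hdu)
    #             fix_header(header)
    #             datasets.append(self._calculate_dataset_info(header, filename))
    #         FormatterClass = self.instrument.getRawFormatter(datasets[0].dataId)
    #         return RawFileData(datasets=datasets, filename=filename,
    #                            FormatterClass=FormatterClass)
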
    def _calculate_dataset_info(self, header, filename):
        """Calculate a RawFileDatasetInfo from the supplied information.

        Parameters
        ----------
        header : `Mapping`
            Header from the dataset.
        filename : `str`
            Filename to use for error messages.

        Returns
        -------
        dataset : `RawFileDatasetInfo`
            The region, dataId, and observation information associated with
            this dataset.
        """
        obsInfo = ObservationInfo(header)
        dataId = DataCoordinate.standardize(instrument=obsInfo.instrument,
                                            exposure=obsInfo.exposure_id,
                                            detector=obsInfo.detector_num,
                                            universe=self.universe)
        if obsInfo.instrument != self.instrument.getName():
            raise ValueError(f"Incorrect instrument (expected {self.instrument.getName()}, "
                             f"got {obsInfo.instrument}) for file {filename}.")

        FormatterClass = self.instrument.getRawFormatter(dataId)
        region = self._calculate_region_from_dataset_metadata(obsInfo, header, FormatterClass)
        return RawFileDatasetInfo(obsInfo=obsInfo, region=region, dataId=dataId)

    def _calculate_region_from_dataset_metadata(self, obsInfo, header, FormatterClass):
        """Calculate the sky region covered by the supplied observation
        information.

        Parameters
        ----------
        obsInfo : `~astro_metadata_translator.ObservationInfo`
            Summary information of this dataset.
        header : `Mapping`
            Header from the dataset.
        FormatterClass : `type`, a subclass of `FitsRawFormatterBase`
            Formatter class that should be used to compute the spatial region.

        Returns
        -------
        region : `lsst.sphgeom.ConvexPolygon` or `None`
            Region of sky covered by this observation, or `None` if the
            observation has no visit or no tracking coordinates.
        """
        if obsInfo.visit_id is not None and obsInfo.tracking_radec is not None:
            formatter = FormatterClass.fromMetadata(metadata=header, obsInfo=obsInfo)
            visitInfo = formatter.makeVisitInfo()
            detector = self.camera[obsInfo.detector_num]
            wcs = formatter.makeWcs(visitInfo, detector)
            pixBox = Box2D(detector.getBBox())
            if self.config.padRegionAmount > 0:
                pixBox.grow(self.config.padRegionAmount)
            pixCorners = pixBox.getCorners()
            sphCorners = [wcs.pixelToSky(point).getVector() for point in pixCorners]
            region = ConvexPolygon(sphCorners)
        else:
            region = None
        return region

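    # The effect of ``padRegionAmount`` is a symmetric expansion of the
    # detector bounding box before its corners are mapped to the sky. For an
    # illustrative 2048x4096 detector with padRegionAmount=100:
    #
    #     pixBox = Box2D(detector.getBBox())  # (0, 0) to (2048, 4096)
    #     pixBox.grow(100)                    # -> (-100, -100) to (2148, 4196)
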
    def groupByExposure(self, files: Iterable[RawFileData]) -> List[RawExposureData]:
        """Group an iterable of `RawFileData` by exposure.

        Parameters
        ----------
        files : iterable of `RawFileData`
            File-level information to group.

        Returns
        -------
        exposures : `list` of `RawExposureData`
            A list of structures that group the file-level information by
            exposure. The `RawExposureData.records` attributes of elements
            will be `None`, but all other fields will be populated. The
            `RawExposureData.dataId` attributes will be minimal (unexpanded)
            `DataCoordinate` instances.
        """
        exposureDimensions = self.universe["exposure"].graph
        byExposure = defaultdict(list)
        for f in files:
            # Assume that the first dataset is representative for the file.
            byExposure[f.datasets[0].dataId.subset(exposureDimensions)].append(f)

        return [RawExposureData(dataId=dataId, files=exposureFiles)
                for dataId, exposureFiles in byExposure.items()]

    def collectDimensionRecords(self, exposure: RawExposureData) -> RawExposureData:
        """Collect the `DimensionRecord` instances that must be inserted into
        the `~lsst.daf.butler.Registry` before an exposure's raw files may be
        ingested.

        Parameters
        ----------
        exposure : `RawExposureData`
            A structure containing information about the exposure to be
            ingested. Should be considered consumed upon return.

        Returns
        -------
        exposure : `RawExposureData`
            An updated version of the input structure, with
            `RawExposureData.records` populated.
        """
        firstFile = exposure.files[0]
        firstDataset = firstFile.datasets[0]
        VisitDetectorRegionRecordClass = self.universe["visit_detector_region"].RecordClass
        exposure.records = {
            "exposure": [makeExposureRecordFromObsInfo(firstDataset.obsInfo, self.universe)],
        }
        if firstDataset.obsInfo.visit_id is not None:
            exposure.records["visit_detector_region"] = []
            visitVertices = []
            for file in exposure.files:
                for dataset in file.datasets:
                    if dataset.obsInfo.visit_id != firstDataset.obsInfo.visit_id:
                        raise ValueError(f"Inconsistent visit/exposure relationship for "
                                         f"exposure {firstDataset.obsInfo.exposure_id} between "
                                         f"{file.filename} and {firstFile.filename}: "
                                         f"{dataset.obsInfo.visit_id} != "
                                         f"{firstDataset.obsInfo.visit_id}.")
                    if dataset.region is None:
                        self.log.warn("No region found for visit=%s, detector=%s.",
                                      dataset.obsInfo.visit_id, dataset.obsInfo.detector_num)
                        continue
                    visitVertices.extend(dataset.region.getVertices())
                    exposure.records["visit_detector_region"].append(
                        VisitDetectorRegionRecordClass.fromDict({
                            "instrument": dataset.obsInfo.instrument,
                            "visit": dataset.obsInfo.visit_id,
                            "detector": dataset.obsInfo.detector_num,
                            "region": dataset.region,
                        })
                    )
            if visitVertices:
                visitRegion = ConvexPolygon(visitVertices)
            else:
                self.log.warn("No region found for visit=%s.", firstDataset.obsInfo.visit_id)
                visitRegion = None
            exposure.records["visit"] = [
                makeVisitRecordFromObsInfo(firstDataset.obsInfo, self.universe, region=visitRegion)
            ]
        return exposure

    def expandDataIds(self, data: RawExposureData) -> RawExposureData:
        """Expand the data IDs associated with a raw exposure to include
        additional metadata records.

        Parameters
        ----------
        data : `RawExposureData`
            A structure containing information about the exposure to be
            ingested. Must have `RawExposureData.records` populated. Should
            be considered consumed upon return.

        Returns
        -------
        exposure : `RawExposureData`
            An updated version of the input structure, with
            `RawExposureData.dataId` and nested `RawFileData.dataId` attributes
            containing `~lsst.daf.butler.ExpandedDataCoordinate` instances.
        """
        hasVisit = "visit" in data.records
        # We start by expanding the exposure-level data ID; we won't use that
        # directly in file ingest, but this lets us do some database lookups
        # once per exposure instead of once per file later.
        data.dataId = self.butler.registry.expandDataId(
            data.dataId,
            # We pass in the records we'll be inserting shortly so they aren't
            # looked up from the database. We do expect instrument and filter
            # records to be retrieved from the database here (though the
            # Registry may cache them so there isn't a lookup every time).
            records={
                "exposure": data.records["exposure"][0],
                "visit": data.records["visit"][0] if hasVisit else None,
            }
        )
        # Now we expand the per-file (exposure+detector) data IDs. This time
        # we pass in the records we just retrieved from the exposure data ID
        # expansion as well as the visit_detector_region record, if there is
        # one.
        vdrRecords = data.records["visit_detector_region"] if hasVisit else itertools.repeat(None)
        for file, vdrRecord in zip(data.files, vdrRecords):
            for dataset in file.datasets:
                dataset.dataId = self.butler.registry.expandDataId(
                    dataset.dataId,
                    records=dict(data.dataId.records, visit_detector_region=vdrRecord)
                )
        return data

    def prep(self, files, pool: Optional[Pool] = None, processes: int = 1) -> Iterator[RawExposureData]:
        """Perform all ingest preprocessing steps that do not involve actually
        modifying the database.

        Parameters
        ----------
        files : iterable over `str` or path-like objects
            Paths to the files to be ingested. Will be made absolute
            if they are not already.
        pool : `multiprocessing.Pool`, optional
            If not `None`, a process pool with which to parallelize some
            operations.
        processes : `int`, optional
            The number of processes to use. Ignored if ``pool`` is not `None`.

        Yields
        ------
        exposure : `RawExposureData`
            Data structures containing dimension records, filenames, and data
            IDs to be ingested (one structure for each exposure).
        """
        if pool is None and processes > 1:
            pool = Pool(processes)
        mapFunc = map if pool is None else pool.imap_unordered

        # Extract metadata and build per-detector regions.
        fileData: Iterator[RawFileData] = mapFunc(self.extractMetadata, files)

        # Use that metadata to group files (and extracted metadata) by
        # exposure. Never parallelized because it's intrinsically a gather
        # step.
        exposureData: List[RawExposureData] = self.groupByExposure(fileData)

        # The next few operations operate on RawExposureData instances (one
        # at a time) in-place and then return the modified instance. We call
        # them as pass-throughs instead of relying on the arguments we pass
        # in to have been modified because in the parallel case those
        # arguments are going to be pickled and unpickled, and I'm not
        # certain multiprocessing is careful enough with that for output
        # arguments to work. We use the same variable names to reflect the
        # fact that we consider the arguments to have been
        # consumed/invalidated.

        # Extract DimensionRecords from the metadata that will need to be
        # inserted into the Registry before the raw datasets themselves are
        # ingested.
        exposureData: Iterator[RawExposureData] = mapFunc(self.collectDimensionRecords, exposureData)

        # Expand the data IDs to include all dimension metadata; we need this
        # because we may need to generate path templates that rely on that
        # metadata.
        # This is the first step that involves actual database calls (but
        # just SELECTs), so if there's going to be a problem with connections
        # vs. multiple processes, or lock contention (in SQLite) slowing
        # things down, it'll happen here.
        return mapFunc(self.expandDataIds, exposureData)

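    # A sketch of driving ``prep`` directly with an explicit process pool
    # (the file names and pool size below are illustrative):
    #
    #     with Pool(4) as pool:
    #         for exposure in task.prep(["raw1.fits", "raw2.fits"], pool=pool):
    #             print(exposure.dataId, len(exposure.files))
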
    def insertDimensionData(self, records: Mapping[str, List[DimensionRecord]]):
        """Insert dimension records for one or more exposures.

        Parameters
        ----------
        records : `dict` mapping `str` to `list`
            Dimension records to be inserted, organized as a mapping from
            dimension name to a list of records for that dimension. This
            may be a single `RawExposureData.records` dict, or an aggregate
            for multiple exposures created by concatenating the value lists
            of those dictionaries.
        """
        # TODO: This currently assumes that either duplicate inserts of
        # visit records are ignored, or there is exactly one visit per
        # exposure. I expect us to switch up the visit-exposure
        # relationship and hence rewrite some of this code before that
        # becomes a practical problem.
        # Iterate over dimensions explicitly to order for foreign key
        # relationships.
        for dimension in ("visit", "exposure", "visit_detector_region"):
            recordsForDimension = records.get(dimension)
            if recordsForDimension:
                # TODO: once Registry has options to ignore or replace
                # existing dimension records with the same primary keys
                # instead of aborting on conflicts, add configuration
                # options and logic to use them.
                self.butler.registry.insertDimensionData(dimension, *recordsForDimension)

    def ingestExposureDatasets(self, exposure: RawExposureData, butler: Optional[Butler] = None
                               ) -> List[DatasetRef]:
        """Ingest all raw files in one exposure.

        Parameters
        ----------
        exposure : `RawExposureData`
            A structure containing information about the exposure to be
            ingested. Must have `RawExposureData.records` populated and all
            data ID attributes expanded.
        butler : `lsst.daf.butler.Butler`, optional
            Butler to use for ingest. If not provided, ``self.butler`` will
            be used.

        Returns
        -------
        refs : `list` of `lsst.daf.butler.DatasetRef`
            Dataset references for ingested raws.
        """
        if butler is None:
            butler = self.butler
        datasets = [FileDataset(path=os.path.abspath(file.filename),
                                refs=[DatasetRef(self.datasetType, d.dataId) for d in file.datasets],
                                formatter=file.FormatterClass)
                    for file in exposure.files]
        butler.ingest(*datasets, transfer=self.config.transfer)
        return [ref for dataset in datasets for ref in dataset.refs]

    def run(self, files, pool: Optional[Pool] = None, processes: int = 1):
        """Ingest files into a Butler data repository.

        This creates any new exposure or visit Dimension entries needed to
        identify the ingested files, creates new Dataset entries in the
        Registry and finally ingests the files themselves into the Datastore.
        Any needed instrument, detector, and physical_filter Dimension entries
        must exist in the Registry before `run` is called.

        Parameters
        ----------
        files : iterable over `str` or path-like objects
            Paths to the files to be ingested. Will be made absolute
            if they are not already.
        pool : `multiprocessing.Pool`, optional
            If not `None`, a process pool with which to parallelize some
            operations.
        processes : `int`, optional
            The number of processes to use. Ignored if ``pool`` is not `None`.

        Returns
        -------
        refs : `list` of `lsst.daf.butler.DatasetRef`
            Dataset references for ingested raws.

        Notes
        -----
        This method inserts all records (dimensions and datasets) for an
        exposure within a transaction, guaranteeing that partial exposures
        are never ingested.
        """
        exposureData = self.prep(files, pool=pool, processes=processes)
        # Up to this point, we haven't modified the data repository at all.
        # Now we finally do that, with one transaction per exposure. This is
        # not parallelized at present because the performance of this step is
        # limited by the database server. That may or may not change in the
        # future once we increase our usage of bulk inserts and reduce our
        # usage of savepoints; we've tried to get everything but the database
        # operations done in advance to reduce the time spent inside
        # transactions.
        self.butler.registry.registerDatasetType(self.datasetType)
        refs = []
        for exposure in exposureData:
            with self.butler.transaction():
                self.insertDimensionData(exposure.records)
                refs.extend(self.ingestExposureDatasets(exposure))
        return refs
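

# A minimal end-to-end usage sketch, following the class docstring above (the
# repository path, run name, instrument class, and file names are illustrative,
# not defaults provided by this module):
#
#     from lsst.daf.butler import Butler
#     from lsst.obs.base.ingest import RawIngestTask, RawIngestConfig
#
#     config = RawIngestConfig()
#     config.instrument = "lsst.obs.example.ExampleInstrument"  # hypothetical
#     config.transfer = "symlink"
#     butler = Butler("/path/to/repo", run="raw/example")
#     task = RawIngestTask(config=config, butler=butler)
#     refs = task.run(["raw1.fits", "raw2.fits"])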