lsst.obs.base  19.0.0-20-g6de566f+1
ingest.py
1 # This file is part of obs_base.
2 #
3 # Developed for the LSST Data Management System.
4 # This product includes software developed by the LSST Project
5 # (https://www.lsst.org).
6 # See the COPYRIGHT file at the top-level directory of this distribution
7 # for details of code ownership.
8 #
9 # This program is free software: you can redistribute it and/or modify
10 # it under the terms of the GNU General Public License as published by
11 # the Free Software Foundation, either version 3 of the License, or
12 # (at your option) any later version.
13 #
14 # This program is distributed in the hope that it will be useful,
15 # but WITHOUT ANY WARRANTY; without even the implied warranty of
16 # MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
17 # GNU General Public License for more details.
18 #
19 # You should have received a copy of the GNU General Public License
20 # along with this program. If not, see <http://www.gnu.org/licenses/>.
21 
22 
23 __all__ = ("RawIngestTask", "RawIngestConfig", "makeTransferChoiceField")
24 
25 import os.path
26 import itertools
27 from dataclasses import dataclass
28 from typing import List, Dict, Iterator, Iterable, Type, Optional, Any, Mapping
29 from collections import defaultdict
30 from multiprocessing import Pool
31 
32 from astro_metadata_translator import ObservationInfo, fix_header, merge_headers
33 from lsst.utils import doImport
34 from lsst.afw.fits import readMetadata
35 from lsst.daf.butler import (
36  Butler,
37  DataCoordinate,
38  DatasetRef,
39  DatasetType,
40  DimensionRecord,
41  FileDataset,
42 )
43 from lsst.obs.base.instrument import makeExposureRecordFromObsInfo, makeVisitRecordFromObsInfo
44 from lsst.geom import Box2D
45 from lsst.pex.config import Config, Field, ChoiceField
46 from lsst.pipe.base import Task
47 from lsst.sphgeom import ConvexPolygon
48 
49 from .fitsRawFormatterBase import FitsRawFormatterBase
50 
51 
52 @dataclass
53 class RawFileDatasetInfo:
54  """Structure that holds information about a single dataset within a
55  raw file.
56  """
57 
58  dataId: DataCoordinate
59  """Data ID for this file (`lsst.daf.butler.DataCoordinate`).
60 
61  This may be a minimal `~lsst.daf.butler.DataCoordinate` base instance, or
62  a complete `~lsst.daf.butler.ExpandedDataCoordinate`.
63  """
64 
65  obsInfo: ObservationInfo
66  """Standardized observation metadata extracted directly from the file
67  headers (`astro_metadata_translator.ObservationInfo`).
68  """
69 
70  region: ConvexPolygon
71  """Region on the sky covered by this file, possibly with padding
72  (`lsst.sphgeom.ConvexPolygon`).
73  """
74 
75 
76 @dataclass
77 class RawFileData:
78  """Structure that holds information about a single raw file, used during
79  ingest.
80  """
81 
82  datasets: List[RawFileDatasetInfo]
83  """The information describing each dataset within this raw file.
84  (`list` of `RawFileDatasetInfo`)
85  """
86 
87  filename: str
88  """Name of the file this information was extracted from (`str`).
89 
90  This is the path prior to ingest, not the path after ingest.
91  """
92 
93  FormatterClass: Type[FitsRawFormatterBase]
94  """Formatter class that should be used to ingest this file and compute
95  a spatial region for it (`type`; a subclass of `FitsRawFormatterBase`).
96  """
97 
98 
99 @dataclass
100 class RawExposureData:
101  """Structure that holds information about a complete raw exposure, used
102  during ingest.
103  """
104 
105  dataId: DataCoordinate
106  """Data ID for this exposure (`lsst.daf.butler.DataCoordinate`).
107 
108  This may be a minimal `~lsst.daf.butler.DataCoordinate` base instance, or
109  a complete `~lsst.daf.butler.ExpandedDataCoordinate`.
110  """
111 
112  files: List[RawFileData]
113  """List of structures containing file-level information.
114  """
115 
116  records: Optional[Dict[str, List[DimensionRecord]]] = None
117  """Dictionary containing `DimensionRecord` instances that must be inserted
118  into the `~lsst.daf.butler.Registry` prior to file-level ingest (`dict`).
119 
120  Keys are the names of dimension elements ("exposure" and optionally "visit"
121  and "visit_detector_region"), while values are lists of `DimensionRecord`.
122 
123  May be `None` during some ingest steps.
124  """
125 
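# A minimal sketch (not part of the original file) of how the three
# structures above nest during ingest: a RawExposureData aggregates one or
# more RawFileData entries, and each RawFileData holds one or more
# RawFileDatasetInfo entries. The names ``obs_info``, ``data_id``,
# ``polygon``, and ``universe`` are hypothetical placeholders for values
# derived from the file headers and the Butler registry.
#
#     dataset_info = RawFileDatasetInfo(dataId=data_id, obsInfo=obs_info,
#                                       region=polygon)
#     file_data = RawFileData(datasets=[dataset_info],
#                             filename="/path/to/raw.fits",
#                             FormatterClass=FitsRawFormatterBase)
#     exposure_data = RawExposureData(
#         dataId=data_id.subset(universe["exposure"].graph),
#         files=[file_data])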
126 
127 def makeTransferChoiceField(doc="How to transfer files (None for no transfer).", default=None):
128  """Create a Config field with options for how to transfer files between
129  data repositories.
130 
131  The allowed options for the field are exactly those supported by
132  `lsst.daf.butler.Datastore.ingest`.
133 
134  Parameters
135  ----------
136  doc : `str`
137  Documentation for the configuration field.
138 
139  Returns
140  -------
141  field : `lsst.pex.config.ChoiceField`
142  Configuration field.
143  """
144  return ChoiceField(
145  doc=doc,
146  dtype=str,
147  allowed={"move": "move",
148  "copy": "copy",
149  "hardlink": "hard link",
150  "symlink": "symbolic (soft) link"},
151  optional=True,
152  default=default
153  )
154 
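# A small usage sketch (an illustration, not part of the original file):
# the field returned by makeTransferChoiceField can be dropped into any
# pex_config Config, optionally with a different default transfer mode.
# ``MyIngestConfig`` is a hypothetical name.
#
#     class MyIngestConfig(Config):
#         transfer = makeTransferChoiceField(default="symlink")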
155 
156 class RawIngestConfig(Config):
157  transfer = makeTransferChoiceField()
158  padRegionAmount = Field(
159  dtype=int,
160  default=0,
161  doc="Pad an image with the specified number of pixels before calculating its region"
162  )
163  instrument = Field(
164  doc=("Fully-qualified Python name of the `Instrument` subclass to "
165  "associate with all raws."),
166  dtype=str,
167  optional=False,
168  default=None,
169  )
170 
171 
172 class RawIngestTask(Task):
173  """Driver Task for ingesting raw data into Gen3 Butler repositories.
174 
175  This Task is intended to be runnable from the command-line, but it doesn't
176  meet the other requirements of CmdLineTask or PipelineTask, and wouldn't
177  gain much from being one. It also wouldn't really be appropriate as a
178  subtask of a CmdLineTask or PipelineTask; it's a Task essentially just to
179  leverage the logging and configurability functionality that Task provides.
180 
181  Each instance of `RawIngestTask` writes to the same Butler. Each
182  invocation of `RawIngestTask.run` ingests a list of files.
183 
184  Parameters
185  ----------
186  config : `RawIngestConfig`
187  Configuration for the task.
188  butler : `~lsst.daf.butler.Butler`
189  Butler instance. Ingested Datasets will be created as part of
190  ``butler.run`` and associated with its Collection.
191  kwds
192  Additional keyword arguments are forwarded to the `lsst.pipe.base.Task`
193  constructor.
194 
196  """
197 
198  ConfigClass = RawIngestConfig
199 
200  _DefaultName = "ingest"
201 
202  def getDatasetType(self):
203  """Return the DatasetType of the Datasets ingested by this Task.
204  """
205  return DatasetType("raw", ("instrument", "detector", "exposure"), "Exposure",
206  universe=self.butler.registry.dimensions)
207 
208  def __init__(self, config: Optional[RawIngestConfig] = None, *, butler: Butler, **kwds: Any):
209  super().__init__(config, **kwds)
210  self.butler = butler
211  self.universe = self.butler.registry.dimensions
212  self.instrument = doImport(self.config.instrument)()
213  # For now, we get a nominal Camera from the Instrument.
214  # In the future, we may want to load one from a Butler calibration
215  # collection that's appropriate for the observation timestamp of
216  # the exposure.
217  self.camera = self.instrument.getCamera()
218  self.datasetType = self.getDatasetType()
219 
220  def extractMetadata(self, filename: str) -> RawFileData:
221  """Extract and process metadata from a single raw file.
222 
223  Parameters
224  ----------
225  filename : `str`
226  Path to the file.
227 
228  Returns
229  -------
230  data : `RawFileData`
231  A structure containing the metadata extracted from the file,
232  as well as the original filename. All fields will be populated,
233  but the `RawFileDatasetInfo.dataId` attributes will be minimal
234  (unexpanded) `DataCoordinate` instances.
235 
236  Notes
237  -----
238  Assumes that there is a single dataset associated with the given
239  file. Instruments using a single file to store multiple datasets
240  must implement their own version of this method.
241  """
242  # Manually merge the primary and "first data" headers here because we
243  # do not know in general if an input file has set INHERIT=T.
244  phdu = readMetadata(filename, 0)
245  header = merge_headers([phdu, readMetadata(filename)], mode="overwrite")
246  fix_header(header)
247  datasets = [self._calculate_dataset_info(header, filename)]
248 
249  # The data model currently assumes that whilst multiple datasets
250  # can be associated with a single file, they must all share the
251  # same formatter.
252  FormatterClass = self.instrument.getRawFormatter(datasets[0].dataId)
253 
254  return RawFileData(datasets=datasets, filename=filename,
255  FormatterClass=FormatterClass)
256 
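 # Illustrative only (not part of the original file): extractMetadata can be
 # called on its own to inspect a single raw file before a full ingest.
 # ``task`` is assumed to be a constructed RawIngestTask instance.
 #
 #     file_data = task.extractMetadata("/path/to/raw.fits")
 #     obs_info = file_data.datasets[0].obsInfo
 #     print(obs_info.exposure_id, obs_info.detector_num)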
257  def _calculate_dataset_info(self, header, filename):
258  """Calculate a RawFileDatasetInfo from the supplied information.
259 
260  Parameters
261  ----------
262  header : `Mapping`
263  Header from the dataset.
264  filename : `str`
265  Filename to use for error messages.
266 
267  Returns
268  -------
269  dataset : `RawFileDatasetInfo`
270  The region, dataId, and observation information associated with
271  this dataset.
272  """
273  obsInfo = ObservationInfo(header)
274  dataId = DataCoordinate.standardize(instrument=obsInfo.instrument,
275  exposure=obsInfo.exposure_id,
276  detector=obsInfo.detector_num,
277  universe=self.universe)
278  if obsInfo.instrument != self.instrument.getName():
279  raise ValueError(f"Incorrect instrument (expected {self.instrument.getName()}, "
280  f"got {obsInfo.instrument}) for file {filename}.")
281 
282  FormatterClass = self.instrument.getRawFormatter(dataId)
283  region = self._calculate_region_from_dataset_metadata(obsInfo, header, FormatterClass)
284  return RawFileDatasetInfo(obsInfo=obsInfo, region=region, dataId=dataId)
285 
286  def _calculate_region_from_dataset_metadata(self, obsInfo, header, FormatterClass):
287  """Calculate the sky region covered by the supplied observation
288  information.
289 
290  Parameters
291  ----------
292  obsInfo : `~astro_metadata_translator.ObservationInfo`
293  Summary information of this dataset.
294  header : `Mapping`
295  Header from the dataset.
296  FormatterClass: `type` as subclass of `FitsRawFormatterBase`
297  Formatter class that should be used to compute the spatial region.
298 
299  Returns
300  -------
301  region : `lsst.sphgeom.ConvexPolygon`
302  Region of sky covered by this observation.
303  """
304  if obsInfo.visit_id is not None and obsInfo.tracking_radec is not None:
305  formatter = FormatterClass.fromMetadata(metadata=header, obsInfo=obsInfo)
306  visitInfo = formatter.makeVisitInfo()
307  detector = self.camera[obsInfo.detector_num]
308  wcs = formatter.makeWcs(visitInfo, detector)
309  pixBox = Box2D(detector.getBBox())
310  if self.config.padRegionAmount > 0:
311  pixBox.grow(self.config.padRegionAmount)
312  pixCorners = pixBox.getCorners()
313  sphCorners = [wcs.pixelToSky(point).getVector() for point in pixCorners]
314  region = ConvexPolygon(sphCorners)
315  else:
316  region = None
317  return region
318 
319  def groupByExposure(self, files: Iterable[RawFileData]) -> List[RawExposureData]:
320  """Group an iterable of `RawFileData` by exposure.
321 
322  Parameters
323  ----------
324  files : iterable of `RawFileData`
325  File-level information to group.
326 
327  Returns
328  -------
329  exposures : `list` of `RawExposureData`
330  A list of structures that group the file-level information by
331  exposure. The `RawExposureData.records` attributes of elements
332  will be `None`, but all other fields will be populated. The
333  `RawExposureData.dataId` attributes will be minimal (unexpanded)
334  `DataCoordinate` instances.
335  """
336  exposureDimensions = self.universe["exposure"].graph
337  byExposure = defaultdict(list)
338  for f in files:
339  # Assume that the first dataset is representative for the file
340  byExposure[f.datasets[0].dataId.subset(exposureDimensions)].append(f)
341 
342  return [RawExposureData(dataId=dataId, files=exposureFiles)
343  for dataId, exposureFiles in byExposure.items()]
344 
345  def collectDimensionRecords(self, exposure: RawExposureData) -> RawExposureData:
346  """Collect the `DimensionRecord` instances that must be inserted into
347  the `~lsst.daf.butler.Registry` before an exposure's raw files may be ingested.
348 
349  Parameters
350  ----------
351  exposure : `RawExposureData`
352  A structure containing information about the exposure to be
353  ingested. Should be considered consumed upon return.
354 
355  Returns
356  -------
357  exposure : `RawExposureData`
358  An updated version of the input structure, with
359  `RawExposureData.records` populated.
360  """
361  firstFile = exposure.files[0]
362  firstDataset = firstFile.datasets[0]
363  VisitDetectorRegionRecordClass = self.universe["visit_detector_region"].RecordClass
364  exposure.records = {
365  "exposure": [makeExposureRecordFromObsInfo(firstDataset.obsInfo, self.universe)],
366  }
367  if firstDataset.obsInfo.visit_id is not None:
368  exposure.records["visit_detector_region"] = []
369  visitVertices = []
370  for file in exposure.files:
371  for dataset in file.datasets:
372  if dataset.obsInfo.visit_id != firstDataset.obsInfo.visit_id:
373  raise ValueError(f"Inconsistent visit/exposure relationship for "
374  f"exposure {firstDataset.obsInfo.exposure_id} between "
375  f"{file.filename} and {firstFile.filename}: "
376  f"{dataset.obsInfo.visit_id} != {firstDataset.obsInfo.visit_id}.")
377  if dataset.region is None:
378  self.log.warn("No region found for visit=%s, detector=%s.", dataset.obsInfo.visit_id,
379  dataset.obsInfo.detector_num)
380  continue
381  visitVertices.extend(dataset.region.getVertices())
382  exposure.records["visit_detector_region"].append(
383  VisitDetectorRegionRecordClass.fromDict({
384  "instrument": dataset.obsInfo.instrument,
385  "visit": dataset.obsInfo.visit_id,
386  "detector": dataset.obsInfo.detector_num,
387  "region": dataset.region,
388  })
389  )
390  if visitVertices:
391  visitRegion = ConvexPolygon(visitVertices)
392  else:
393  self.log.warn("No region found for visit=%s.", firstDataset.obsInfo.visit_id)
394  visitRegion = None
395  exposure.records["visit"] = [
396  makeVisitRecordFromObsInfo(firstDataset.obsInfo, self.universe, region=visitRegion)
397  ]
398  return exposure
399 
400  def expandDataIds(self, data: RawExposureData) -> RawExposureData:
401  """Expand the data IDs associated with a raw exposure to include
402  additional metadata records.
403 
404  Parameters
405  ----------
406  data : `RawExposureData`
407  A structure containing information about the exposure to be
408  ingested. Must have `RawExposureData.records` populated. Should
409  be considered consumed upon return.
410 
411  Returns
412  -------
413  exposure : `RawExposureData`
414  An updated version of the input structure, with
415  `RawExposureData.dataId` and nested `RawFileData.dataId` attributes
416  containing `~lsst.daf.butler.ExpandedDataCoordinate` instances.
417  """
418  hasVisit = "visit" in data.records
419  # We start by expanding the exposure-level data ID; we won't use that
420  # directly in file ingest, but this lets us do some database lookups
421  # once per exposure instead of once per file later.
422  data.dataId = self.butler.registry.expandDataId(
423  data.dataId,
424  # We pass in the records we'll be inserting shortly so they aren't
425  # looked up from the database. We do expect instrument and filter
426  # records to be retrieved from the database here (though the
427  # Registry may cache them so there isn't a lookup every time).
428  records={
429  "exposure": data.records["exposure"][0],
430  "visit": data.records["visit"][0] if hasVisit else None,
431  }
432  )
433  # Now we expand the per-file (exposure+detector) data IDs. This time
434  # we pass in the records we just retrieved from the exposure data ID
435  # expansion as well as the visit_detector_region record, if there is
436  # one.
437  vdrRecords = data.records["visit_detector_region"] if hasVisit else itertools.repeat(None)
438  for file, vdrRecord in zip(data.files, vdrRecords):
439  for dataset in file.datasets:
440  dataset.dataId = self.butler.registry.expandDataId(
441  dataset.dataId,
442  records=dict(data.dataId.records, visit_detector_region=vdrRecord)
443  )
444  return data
445 
446  def prep(self, files, pool: Optional[Pool] = None, processes: int = 1) -> Iterator[RawExposureData]:
447  """Perform all ingest preprocessing steps that do not involve actually
448  modifying the database.
449 
450  Parameters
451  ----------
452  files : iterable over `str` or path-like objects
453  Paths to the files to be ingested. Will be made absolute
454  if they are not already.
455  pool : `multiprocessing.Pool`, optional
456  If not `None`, a process pool with which to parallelize some
457  operations.
458  processes : `int`, optional
459  The number of processes to use. Ignored if ``pool`` is not `None`.
460 
461  Yields
462  ------
463  exposure : `RawExposureData`
464  Data structures containing dimension records, filenames, and data
465  IDs to be ingested (one structure for each exposure).
466  """
467  if pool is None and processes > 1:
468  pool = Pool(processes)
469  mapFunc = map if pool is None else pool.imap_unordered
470 
471  # Extract metadata and build per-detector regions.
472  fileData: Iterator[RawFileData] = mapFunc(self.extractMetadata, files)
473 
474  # Use that metadata to group files (and extracted metadata) by
475  # exposure. Never parallelized because it's intrinsically a gather
476  # step.
477  exposureData: List[RawExposureData] = self.groupByExposure(fileData)
478 
479  # The next few operations operate on RawExposureData instances (one at
480  # a time) in-place and then return the modified instance. We call them
481  # as pass-throughs instead of relying on the arguments we pass in to
482  # have been modified because in the parallel case those arguments are
483  # going to be pickled and unpickled, and I'm not certain
484  # multiprocessing is careful enough with that for output arguments to
485  # work. We use the same variable names to reflect the fact that we
486  # consider the arguments to have been consumed/invalidated.
487 
488  # Extract DimensionRecords from the metadata that will need to be
489  # inserted into the Registry before the raw datasets themselves are
490  # ingested.
491  exposureData: Iterator[RawExposureData] = mapFunc(self.collectDimensionRecords, exposureData)
492 
493  # Expand the data IDs to include all dimension metadata; we need this
494  # because we may need to generate path templates that rely on that
495  # metadata.
496  # This is the first step that involves actual database calls (but just
497  # SELECTs), so if there's going to be a problem with connections vs.
498  # multiple processes, or lock contention (in SQLite) slowing things
499  # down, it'll happen here.
500  return mapFunc(self.expandDataIds, exposureData)
501 
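 # A hedged sketch (not part of the original file) of driving the prep()
 # method above with an explicit process pool; ``task`` and ``files`` are
 # assumed to exist already.
 #
 #     from multiprocessing import Pool
 #     with Pool(4) as pool:
 #         for exposure in task.prep(files, pool=pool):
 #             print(exposure.dataId)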
502  def insertDimensionData(self, records: Mapping[str, List[DimensionRecord]]):
503  """Insert dimension records for one or more exposures.
504 
505  Parameters
506  ----------
507  records : `dict` mapping `str` to `list`
508  Dimension records to be inserted, organized as a mapping from
509  dimension name to a list of records for that dimension. This
510  may be a single `RawExposureData.records` dict, or an aggregate
511  for multiple exposures created by concatenating the value lists
512  of those dictionaries.
513 
518  """
519  # TODO: This currently assumes that either duplicate inserts of
520  # visit records are ignored, or there is exactly one visit per
521  # exposure. I expect us to switch up the visit-exposure
522  # relationship and hence rewrite some of this code before that
523  # becomes a practical problem.
524  # Iterate over dimensions explicitly to order for foreign key
525  # relationships.
526  for dimension in ("visit", "exposure", "visit_detector_region"):
527  recordsForDimension = records.get(dimension)
528  if recordsForDimension:
529  # TODO: once Registry has options to ignore or replace
530  # existing dimension records with the same primary keys
531  # instead of aborting on conflicts, add configuration
532  # options and logic to use them.
533  self.butler.registry.insertDimensionData(dimension, *recordsForDimension)
534 
535  def ingestExposureDatasets(self, exposure: RawExposureData, butler: Optional[Butler] = None
536  ) -> List[DatasetRef]:
537  """Ingest all raw files in one exposure.
538 
539  Parameters
540  ----------
541  exposure : `RawExposureData`
542  A structure containing information about the exposure to be
543  ingested. Must have `RawExposureData.records` populated and all
544  data ID attributes expanded.
545  butler : `lsst.daf.butler.Butler`, optional
546  Butler to use for ingest. If not provided, ``self.butler`` will
547  be used.
548 
549  Returns
550  -------
551  refs : `list` of `lsst.daf.butler.DatasetRef`
552  Dataset references for ingested raws.
553  """
554  if butler is None:
555  butler = self.butler
556  datasets = [FileDataset(path=os.path.abspath(file.filename),
557  refs=[DatasetRef(self.datasetType, d.dataId) for d in file.datasets],
558  formatter=file.FormatterClass)
559  for file in exposure.files]
560  butler.ingest(*datasets, transfer=self.config.transfer)
561  return [ref for dataset in datasets for ref in dataset.refs]
562 
563  def run(self, files, pool: Optional[Pool] = None, processes: int = 1):
564  """Ingest files into a Butler data repository.
565 
566  This creates any new exposure or visit Dimension entries needed to
567  identify the ingested files, creates new Dataset entries in the
568  Registry and finally ingests the files themselves into the Datastore.
569  Any needed instrument, detector, and physical_filter Dimension entries
570  must exist in the Registry before `run` is called.
571 
572  Parameters
573  ----------
574  files : iterable over `str` or path-like objects
575  Paths to the files to be ingested. Will be made absolute
576  if they are not already.
577  pool : `multiprocessing.Pool`, optional
578  If not `None`, a process pool with which to parallelize some
579  operations.
580  processes : `int`, optional
581  The number of processes to use. Ignored if ``pool`` is not `None`.
582 
583  Returns
584  -------
585  refs : `list` of `lsst.daf.butler.DatasetRef`
586  Dataset references for ingested raws.
587 
588  Notes
589  -----
590  This method inserts all records (dimensions and datasets) for an
591  exposure within a transaction, guaranteeing that partial exposures
592  are never ingested.
593  """
594  exposureData = self.prep(files, pool=pool, processes=processes)
595  # Up to this point, we haven't modified the data repository at all.
596  # Now we finally do that, with one transaction per exposure. This is
597  # not parallelized at present because the performance of this step is
598  # limited by the database server. That may or may not change in the
599  # future once we increase our usage of bulk inserts and reduce our
600  # usage of savepoints; we've tried to get everything but the database
601  # operations done in advance to reduce the time spent inside
602  # transactions.
603  self.butler.registry.registerDatasetType(self.datasetType)
604  refs = []
605  for exposure in exposureData:
606  with self.butler.transaction():
607  self.insertDimensionData(exposure.records)
608  refs.extend(self.ingestExposureDatasets(exposure))
609  return refs
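# End-to-end usage sketch (an illustration, not part of the original file).
# The repository path and the instrument class name are hypothetical;
# substitute a real Gen3 repository and a concrete Instrument subclass, and
# note that ``config.transfer`` assumes the transfer field defined on
# RawIngestConfig above.
#
#     from lsst.daf.butler import Butler
#     from lsst.obs.base.ingest import RawIngestConfig, RawIngestTask
#
#     butler = Butler("/path/to/repo", run="raw")
#     config = RawIngestConfig()
#     config.instrument = "my_obs_package.MyInstrument"  # hypothetical
#     config.transfer = "symlink"
#     task = RawIngestTask(config=config, butler=butler)
#     refs = task.run(["/path/to/raw_0001.fits", "/path/to/raw_0002.fits"])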