lsst.obs.base  19.0.0-40-gd9b8072
ingest.py
1 # This file is part of obs_base.
2 #
3 # Developed for the LSST Data Management System.
4 # This product includes software developed by the LSST Project
5 # (https://www.lsst.org).
6 # See the COPYRIGHT file at the top-level directory of this distribution
7 # for details of code ownership.
8 #
9 # This program is free software: you can redistribute it and/or modify
10 # it under the terms of the GNU General Public License as published by
11 # the Free Software Foundation, either version 3 of the License, or
12 # (at your option) any later version.
13 #
14 # This program is distributed in the hope that it will be useful,
15 # but WITHOUT ANY WARRANTY; without even the implied warranty of
16 # MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
17 # GNU General Public License for more details.
18 #
19 # You should have received a copy of the GNU General Public License
20 # along with this program. If not, see <http://www.gnu.org/licenses/>.
21 
22 
23 __all__ = ("RawIngestTask", "RawIngestConfig", "makeTransferChoiceField")
24 
25 import os.path
26 import itertools
27 from dataclasses import dataclass
28 from typing import List, Dict, Iterator, Iterable, Type, Optional, Any, Mapping
29 from collections import defaultdict
30 from multiprocessing import Pool
31 
32 from astro_metadata_translator import ObservationInfo, fix_header, merge_headers
33 from lsst.utils import doImport
34 from lsst.afw.fits import readMetadata
35 from lsst.daf.butler import (
36  Butler,
37  DataCoordinate,
38  DatasetRef,
39  DatasetType,
40  DimensionRecord,
41  FileDataset,
42 )
43 from lsst.geom import Box2D
44 from lsst.pex.config import Config, Field, ChoiceField
45 from lsst.pipe.base import Task
46 from lsst.sphgeom import ConvexPolygon
47 
48 from .instrument import makeExposureRecordFromObsInfo, makeVisitRecordFromObsInfo
49 from .fitsRawFormatterBase import FitsRawFormatterBase
50 
51 
52 @dataclass
53 class RawFileDatasetInfo:
54  """Structure that holds information about a single dataset within a
55  raw file.
56  """
57 
58  dataId: DataCoordinate
59  """Data ID for this file (`lsst.daf.butler.DataCoordinate`).
60 
61  This may be a minimal `~lsst.daf.butler.DataCoordinate` base instance, or
62  a complete `~lsst.daf.butler.ExpandedDataCoordinate`.
63  """
64 
65  obsInfo: ObservationInfo
66  """Standardized observation metadata extracted directly from the file
67  headers (`astro_metadata_translator.ObservationInfo`).
68  """
69 
70  region: ConvexPolygon
71  """Region on the sky covered by this file, possibly with padding
72  (`lsst.sphgeom.ConvexPolygon`).
73  """
74 
75 
76 @dataclass
77 class RawFileData:
78  """Structure that holds information about a single raw file, used during
79  ingest.
80  """
81 
82  datasets: List[RawFileDatasetInfo]
83  """The information describing each dataset within this raw file
84  (`list` of `RawFileDatasetInfo`).
85  """
86 
87  filename: str
88  """Name of the file this information was extracted from (`str`).
89 
90  This is the path prior to ingest, not the path after ingest.
91  """
92 
93  FormatterClass: Type[FitsRawFormatterBase]
94  """Formatter class that should be used to ingest this file and compute
95  a spatial region for it (`type`; as subclass of `FitsRawFormatterBase`).
96  """
97 
98 
99 @dataclass
100 class RawExposureData:
101  """Structure that holds information about a complete raw exposure, used
102  during ingest.
103  """
104 
105  dataId: DataCoordinate
106  """Data ID for this exposure (`lsst.daf.butler.DataCoordinate`).
107 
108  This may be a minimal `~lsst.daf.butler.DataCoordinate` base instance, or
109  a complete `~lsst.daf.butler.ExpandedDataCoordinate`.
110  """
111 
112  files: List[RawFileData]
113  """List of structures containing file-level information.
114  """
115 
116  records: Optional[Dict[str, List[DimensionRecord]]] = None
117  """Dictionary containing `DimensionRecord` instances that must be inserted
118  into the `~lsst.daf.butler.Registry` prior to file-level ingest (`dict`).
119 
120  Keys are the names of dimension elements ("exposure" and optionally "visit"
121  and "visit_detector_region"), while values are lists of `DimensionRecord`.
122 
123  May be `None` during some ingest steps.
124  """
125 
126 
127 def makeTransferChoiceField(doc="How to transfer files (None for no transfer).", default=None):
128  """Create a Config field with options for how to transfer files between
129  data repositories.
130 
131  The allowed options for the field are exactly those supported by
132  `lsst.daf.butler.Datastore.ingest`.
133 
134  Parameters
135  ----------
136  doc : `str`
137  Documentation for the configuration field.
138 
139  Returns
140  -------
141  field : `lsst.pex.config.ChoiceField`
142  Configuration field.
143  """
144  return ChoiceField(
145  doc=doc,
146  dtype=str,
147  allowed={"move": "move",
148  "copy": "copy",
149  "auto": "choice will depend on datastore",
150  "link": "hard link falling back to symbolic link",
151  "hardlink": "hard link",
152  "symlink": "symbolic (soft) link",
153  "relsymlink": "relative symbolic link",
154  },
155  optional=True,
156  default=default
157  )
158 
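# A hedged usage sketch of this helper: a config class can declare its own
# transfer option this way (ExampleIngestConfig is illustrative only;
# RawIngestConfig below does the same thing for its ``transfer`` field):
#
#     class ExampleIngestConfig(Config):
#         transfer = makeTransferChoiceField(default="symlink")
#
#     cfg = ExampleIngestConfig()
#     cfg.transfer = "copy"  # must be one of the allowed choices above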
159 
160 class RawIngestConfig(Config):
161  transfer = makeTransferChoiceField()
162  padRegionAmount = Field(
163  dtype=int,
164  default=0,
165  doc="Number of pixels to pad the image by before calculating its sky region"
166  )
167  instrument = Field(
168  doc=("Fully-qualified Python name of the `Instrument` subclass to "
169  "associate with all raws."),
170  dtype=str,
171  optional=False,
172  default=None,
173  )
174 
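# A minimal configuration sketch (the instrument string is a placeholder;
# substitute the fully-qualified name of the Instrument subclass for your
# camera):
#
#     config = RawIngestConfig()
#     config.instrument = "lsst.obs.example.ExampleInstrument"  # placeholder
#     config.transfer = "symlink"
#     config.padRegionAmount = 50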
175 
176 class RawIngestTask(Task):
177  """Driver Task for ingesting raw data into Gen3 Butler repositories.
178 
179  This Task is intended to be runnable from the command-line, but it doesn't
180  meet the other requirements of CmdLineTask or PipelineTask, and wouldn't
181  gain much from being one. It also wouldn't really be appropriate as a
182  subtask of a CmdLineTask or PipelineTask; it's a Task essentially just to
183  leverage the logging and configurability functionality that Task provides.
184 
185  Each instance of `RawIngestTask` writes to the same Butler. Each
186  invocation of `RawIngestTask.run` ingests a list of files.
187 
188  Parameters
189  ----------
190  config : `RawIngestConfig`
191  Configuration for the task.
192  butler : `~lsst.daf.butler.Butler`
193  Butler instance. Ingested Datasets will be created as part of
194  ``butler.run`` and associated with its Collection.
195  kwds
196  Additional keyword arguments are forwarded to the `lsst.pipe.base.Task`
197  constructor.
200  """
201 
202  ConfigClass = RawIngestConfig
203 
204  _DefaultName = "ingest"
205 
206  def getDatasetType(self):
207  """Return the DatasetType of the Datasets ingested by this Task.
208  """
209  return DatasetType("raw", ("instrument", "detector", "exposure"), "Exposure",
210  universe=self.butler.registry.dimensions)
211 
212  def __init__(self, config: Optional[RawIngestConfig] = None, *, butler: Butler, **kwds: Any):
213  config.validate() # Not a CmdLineTask or PipelineTask, so have to validate the config here.
214  super().__init__(config, **kwds)
215  self.butler = butler
216  self.universe = self.butler.registry.dimensions
217  self.instrument = doImport(self.config.instrument)()
218  # For now, we get a nominal Camera from the Instrument.
219  # In the future, we may want to load one from a Butler calibration
220  # collection that's appropriate for the observation timestamp of
221  # the exposure.
222  self.camera = self.instrument.getCamera()
223  self.datasetType = self.getDatasetType()
224 
225  def extractMetadata(self, filename: str) -> RawFileData:
226  """Extract and process metadata from a single raw file.
227 
228  Parameters
229  ----------
230  filename : `str`
231  Path to the file.
232 
233  Returns
234  -------
235  data : `RawFileData`
236  A structure containing the metadata extracted from the file,
237  as well as the original filename. All fields will be populated,
238  but the `RawFileDatasetInfo.dataId` attributes of its datasets will be
239  minimal (unexpanded) `DataCoordinate` instances.
240 
241  Notes
242  -----
243  Assumes that there is a single dataset associated with the given
244  file. Instruments using a single file to store multiple datasets
245  must implement their own version of this method.
246  """
247  # Manually merge the primary and "first data" headers here because we
248  # do not know in general if an input file has set INHERIT=T.
249  phdu = readMetadata(filename, 0)
250  header = merge_headers([phdu, readMetadata(filename)], mode="overwrite")
251  fix_header(header)
252  datasets = [self._calculate_dataset_info(header, filename)]
253 
254  # The data model currently assumes that whilst multiple datasets
255  # can be associated with a single file, they must all share the
256  # same formatter.
257  FormatterClass = self.instrument.getRawFormatter(datasets[0].dataId)
258 
259  return RawFileData(datasets=datasets, filename=filename,
260  FormatterClass=FormatterClass)
261 
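 # A hedged usage sketch: given a constructed task, per-file metadata can be
 # inspected directly ("raw.fits" stands in for a raw file from the
 # configured instrument):
 #
 #     fileData = task.extractMetadata("raw.fits")
 #     print(fileData.datasets[0].dataId, fileData.FormatterClass)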
262  def _calculate_dataset_info(self, header, filename):
263  """Calculate a RawFileDatasetInfo from the supplied information.
264 
265  Parameters
266  ----------
267  header : `Mapping`
268  Header from the dataset.
269  filename : `str`
270  Filename to use for error messages.
271 
272  Returns
273  -------
274  dataset : `RawFileDatasetInfo`
275  The region, dataId, and observation information associated with
276  this dataset.
277  """
278  obsInfo = ObservationInfo(header)
279  dataId = DataCoordinate.standardize(instrument=obsInfo.instrument,
280  exposure=obsInfo.exposure_id,
281  detector=obsInfo.detector_num,
282  universe=self.universe)
283  if obsInfo.instrument != self.instrument.getName():
284  raise ValueError(f"Incorrect instrument (expected {self.instrument.getName()}, "
285  f"got {obsInfo.instrument}) for file {filename}.")
286 
287  FormatterClass = self.instrument.getRawFormatter(dataId)
288  region = self._calculate_region_from_dataset_metadata(obsInfo, header, FormatterClass)
289  return RawFileDatasetInfo(obsInfo=obsInfo, region=region, dataId=dataId)
290 
291  def _calculate_region_from_dataset_metadata(self, obsInfo, header, FormatterClass):
292  """Calculate the sky region covered by the supplied observation
293  information.
294 
295  Parameters
296  ----------
297  obsInfo : `~astro_metadata_translator.ObservationInfo`
298  Summary information of this dataset.
299  header : `Mapping`
300  Header from the dataset.
301  FormatterClass : `type`, a subclass of `FitsRawFormatterBase`
302  Formatter class that should be used to compute the spatial region.
303 
304  Returns
305  -------
306  region : `lsst.sphgeom.ConvexPolygon` or `None`
307  Region of sky covered by this observation, or `None` if it cannot be determined.
308  """
309  if obsInfo.visit_id is not None and obsInfo.tracking_radec is not None:
310  formatter = FormatterClass.fromMetadata(metadata=header, obsInfo=obsInfo)
311  visitInfo = formatter.makeVisitInfo()
312  detector = self.camera[obsInfo.detector_num]
313  wcs = formatter.makeWcs(visitInfo, detector)
314  pixBox = Box2D(detector.getBBox())
315  if self.config.padRegionAmount > 0:
316  pixBox.grow(self.config.padRegionAmount)
317  pixCorners = pixBox.getCorners()
318  sphCorners = [wcs.pixelToSky(point).getVector() for point in pixCorners]
319  region = ConvexPolygon(sphCorners)
320  else:
321  region = None
322  return region
323 
324  def groupByExposure(self, files: Iterable[RawFileData]) -> List[RawExposureData]:
325  """Group an iterable of `RawFileData` by exposure.
326 
327  Parameters
328  ----------
329  files : iterable of `RawFileData`
330  File-level information to group.
331 
332  Returns
333  -------
334  exposures : `list` of `RawExposureData`
335  A list of structures that group the file-level information by
336  exposure. The `RawExposureData.records` attributes of elements
337  will be `None`, but all other fields will be populated. The
338  `RawExposureData.dataId` attributes will be minimal (unexpanded)
339  `DataCoordinate` instances.
340  """
341  exposureDimensions = self.universe["exposure"].graph
342  byExposure = defaultdict(list)
343  for f in files:
344  # Assume that the first dataset is representative for the file
345  byExposure[f.datasets[0].dataId.subset(exposureDimensions)].append(f)
346 
347  return [RawExposureData(dataId=dataId, files=exposureFiles)
348  for dataId, exposureFiles in byExposure.items()]
349 
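 # For example (a sketch): two RawFileData entries whose first datasets share
 # the same exposure-level data ID are grouped into a single RawExposureData:
 #
 #     exposures = task.groupByExposure([fileData1, fileData2])
 #     # -> [RawExposureData(dataId=..., files=[fileData1, fileData2])]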
350  def collectDimensionRecords(self, exposure: RawExposureData) -> RawExposureData:
351  """Collect the `DimensionRecord` instances that must be inserted into
352  the `~lsst.daf.butler.Registry` before an exposure's raw files may be ingested.
353 
354  Parameters
355  ----------
356  exposure : `RawExposureData`
357  A structure containing information about the exposure to be
358  ingested. Should be considered consumed upon return.
359 
360  Returns
361  -------
362  exposure : `RawExposureData`
363  An updated version of the input structure, with
364  `RawExposureData.records` populated.
365  """
366  firstFile = exposure.files[0]
367  firstDataset = firstFile.datasets[0]
368  VisitDetectorRegionRecordClass = self.universe["visit_detector_region"].RecordClass
369  exposure.records = {
370  "exposure": [makeExposureRecordFromObsInfo(firstDataset.obsInfo, self.universe)],
371  }
372  if firstDataset.obsInfo.visit_id is not None:
373  exposure.records["visit_detector_region"] = []
374  visitVertices = []
375  for file in exposure.files:
376  for dataset in file.datasets:
377  if dataset.obsInfo.visit_id != firstDataset.obsInfo.visit_id:
378  raise ValueError(f"Inconsistent visit/exposure relationship for "
379  f"exposure {firstDataset.obsInfo.exposure_id} between "
380  f"{file.filename} and {firstFile.filename}: "
381  f"{dataset.obsInfo.visit_id} != {firstDataset.obsInfo.visit_id}.")
382  if dataset.region is None:
383  self.log.warn("No region found for visit=%s, detector=%s.", dataset.obsInfo.visit_id,
384  dataset.obsInfo.detector_num)
385  continue
386  visitVertices.extend(dataset.region.getVertices())
387  exposure.records["visit_detector_region"].append(
388  VisitDetectorRegionRecordClass.fromDict({
389  "instrument": dataset.obsInfo.instrument,
390  "visit": dataset.obsInfo.visit_id,
391  "detector": dataset.obsInfo.detector_num,
392  "region": dataset.region,
393  })
394  )
395  if visitVertices:
396  visitRegion = ConvexPolygon(visitVertices)
397  else:
398  self.log.warn("No region found for visit=%s.", firstDataset.obsInfo.visit_id)
399  visitRegion = None
400  exposure.records["visit"] = [
401  makeVisitRecordFromObsInfo(firstDataset.obsInfo, self.universe, region=visitRegion)
402  ]
403  return exposure
404 
405  def expandDataIds(self, data: RawExposureData) -> RawExposureData:
406  """Expand the data IDs associated with a raw exposure to include
407  additional metadata records.
408 
409  Parameters
410  ----------
411  exposure : `RawExposureData`
412  A structure containing information about the exposure to be
413  ingested. Must have `RawExposureData.records` populated. Should
414  be considered consumed upon return.
415 
416  Returns
417  -------
418  exposure : `RawExposureData`
419  An updated version of the input structure, with
420  `RawExposureData.dataId` and nested `RawFileData.dataId` attributes
421  containing `~lsst.daf.butler.ExpandedDataCoordinate` instances.
422  """
423  hasVisit = "visit" in data.records
424  # We start by expanding the exposure-level data ID; we won't use that
425  # directly in file ingest, but this lets us do some database lookups
426  # once per exposure instead of once per file later.
427  data.dataId = self.butler.registry.expandDataId(
428  data.dataId,
429  # We pass in the records we'll be inserting shortly so they aren't
430  # looked up from the database. We do expect instrument and filter
431  # records to be retrieved from the database here (though the
432  # Registry may cache them so there isn't a lookup every time).
433  records={
434  "exposure": data.records["exposure"][0],
435  "visit": data.records["visit"][0] if hasVisit else None,
436  }
437  )
438  # Now we expand the per-file (exposure+detector) data IDs. This time
439  # we pass in the records we just retrieved from the exposure data ID
440  # expansion as well as the visit_detector_region record, if there is
441  # one.
442  vdrRecords = data.records["visit_detector_region"] if hasVisit else itertools.repeat(None)
443  for file, vdrRecord in zip(data.files, vdrRecords):
444  for dataset in file.datasets:
445  dataset.dataId = self.butler.registry.expandDataId(
446  dataset.dataId,
447  records=dict(data.dataId.records, visit_detector_region=vdrRecord)
448  )
449  return data
450 
451  def prep(self, files, pool: Optional[Pool] = None, processes: int = 1) -> Iterator[RawExposureData]:
452  """Perform all ingest preprocessing steps that do not involve actually
453  modifying the database.
454 
455  Parameters
456  ----------
457  files : iterable over `str` or path-like objects
458  Paths to the files to be ingested. Will be made absolute
459  if they are not already.
460  pool : `multiprocessing.Pool`, optional
461  If not `None`, a process pool with which to parallelize some
462  operations.
463  processes : `int`, optional
464  The number of processes to use. Ignored if ``pool`` is not `None`.
465 
466  Yields
467  ------
468  exposure : `RawExposureData`
469  Data structures containing dimension records, filenames, and data
470  IDs to be ingested (one structure for each exposure).
471  """
472  if pool is None and processes > 1:
473  pool = Pool(processes)
474  mapFunc = map if pool is None else pool.imap_unordered
475 
476  # Extract metadata and build per-detector regions.
477  fileData: Iterator[RawFileData] = mapFunc(self.extractMetadata, files)
478 
479  # Use that metadata to group files (and extracted metadata) by
480  # exposure. Never parallelized because it's intrinsically a gather
481  # step.
482  exposureData: List[RawExposureData] = self.groupByExposure(fileData)
483 
484  # The next few operations operate on RawExposureData instances (one at
485  # a time) in-place and then return the modified instance. We call them
486  # as pass-throughs instead of relying on the arguments we pass in to
487  # have been modified because in the parallel case those arguments are
488  # going to be pickled and unpickled, and I'm not certain
489  # multiprocessing is careful enough with that for output arguments to
490  # work. We use the same variable names to reflect the fact that we
491  # consider the arguments to have been consumed/invalidated.
492 
493  # Extract DimensionRecords from the metadata that will need to be
494  # inserted into the Registry before the raw datasets themselves are
495  # ingested.
496  exposureData: Iterator[RawExposureData] = mapFunc(self.collectDimensionRecords, exposureData)
497 
498  # Expand the data IDs to include all dimension metadata; we need this
499  # because we may need to generate path templates that rely on that
500  # metadata.
501  # This is the first step that involves actual database calls (but just
502  # SELECTs), so if there's going to be a problem with connections vs.
503  # multiple processes, or lock contention (in SQLite) slowing things
504  # down, it'll happen here.
505  return mapFunc(self.expandDataIds, exposureData)
506 
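 # A hedged sketch of driving ``prep`` with an explicit process pool
 # (``task`` and ``files`` are assumed to already exist):
 #
 #     from multiprocessing import Pool
 #     with Pool(4) as pool:
 #         for exposure in task.prep(files, pool=pool):
 #             ...  # dimension records collected, data IDs expanded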
507  def insertDimensionData(self, records: Mapping[str, List[DimensionRecord]]):
508  """Insert dimension records for one or more exposures.
509 
510  Parameters
511  ----------
512  records : `dict` mapping `str` to `list`
513  Dimension records to be inserted, organized as a mapping from
514  dimension name to a list of records for that dimension. This
515  may be a single `RawExposureData.records` dict, or an aggregate
516  for multiple exposures created by concatenating the value lists
517  of those dictionaries.
518 
523  """
524  # TODO: This currently assumes that either duplicate inserts of
525  # visit records are ignored, or there is exactly one visit per
526  # exposure. I expect us to switch up the visit-exposure
527  # relationship and hence rewrite some of this code before that
528  # becomes a practical problem.
529  # Iterate over dimensions explicitly to order for foreign key
530  # relationships.
531  for dimension in ("visit", "exposure", "visit_detector_region"):
532  recordsForDimension = records.get(dimension)
533  if recordsForDimension:
534  # TODO: once Registry has options to ignore or replace
535  # existing dimension records with the same primary keys
536  # instead of aborting on conflicts, add configuration
537  # options and logic to use them.
538  self.butler.registry.insertDimensionData(dimension, *recordsForDimension)
539 
540  def ingestExposureDatasets(self, exposure: RawExposureData, butler: Optional[Butler] = None
541  ) -> List[DatasetRef]:
542  """Ingest all raw files in one exposure.
543 
544  Parameters
545  ----------
546  exposure : `RawExposureData`
547  A structure containing information about the exposure to be
548  ingested. Must have `RawExposureData.records` populated and all
549  data ID attributes expanded.
550  butler : `lsst.daf.butler.Butler`, optional
551  Butler to use for ingest. If not provided, ``self.butler`` will
552  be used.
553 
554  Returns
555  -------
556  refs : `list` of `lsst.daf.butler.DatasetRef`
557  Dataset references for ingested raws.
558  """
559  if butler is None:
560  butler = self.butler
561  datasets = [FileDataset(path=os.path.abspath(file.filename),
562  refs=[DatasetRef(self.datasetType, d.dataId) for d in file.datasets],
563  formatter=file.FormatterClass)
564  for file in exposure.files]
565  butler.ingest(*datasets, transfer=self.config.transfer)
566  return [ref for dataset in datasets for ref in dataset.refs]
567 
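 # Roughly what one ingest call amounts to for a single-file exposure
 # (a sketch; the path and data ID are placeholders):
 #
 #     fds = FileDataset(path="/abs/path/raw.fits",
 #                       refs=[DatasetRef(self.datasetType, expandedDataId)],
 #                       formatter=file.FormatterClass)
 #     butler.ingest(fds, transfer=self.config.transfer)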
568  def run(self, files, pool: Optional[Pool] = None, processes: int = 1):
569  """Ingest files into a Butler data repository.
570 
571  This creates any new exposure or visit Dimension entries needed to
572  identify the ingested files, creates new Dataset entries in the
573  Registry and finally ingests the files themselves into the Datastore.
574  Any needed instrument, detector, and physical_filter Dimension entries
575  must exist in the Registry before `run` is called.
576 
577  Parameters
578  ----------
579  files : iterable over `str` or path-like objects
580  Paths to the files to be ingested. Will be made absolute
581  if they are not already.
582  pool : `multiprocessing.Pool`, optional
583  If not `None`, a process pool with which to parallelize some
584  operations.
585  processes : `int`, optional
586  The number of processes to use. Ignored if ``pool`` is not `None`.
587 
588  Returns
589  -------
590  refs : `list` of `lsst.daf.butler.DatasetRef`
591  Dataset references for ingested raws.
592 
593  Notes
594  -----
595  This method inserts all records (dimensions and datasets) for an
596  exposure within a transaction, guaranteeing that partial exposures
597  are never ingested.
598  """
599  exposureData = self.prep(files, pool=pool, processes=processes)
600  # Up to this point, we haven't modified the data repository at all.
601  # Now we finally do that, with one transaction per exposure. This is
602  # not parallelized at present because the performance of this step is
603  # limited by the database server. That may or may not change in the
604  # future once we increase our usage of bulk inserts and reduce our
605  # usage of savepoints; we've tried to get everything but the database
606  # operations done in advance to reduce the time spent inside
607  # transactions.
608  self.butler.registry.registerDatasetType(self.datasetType)
609  refs = []
610  for exposure in exposureData:
611  with self.butler.transaction():
612  self.insertDimensionData(exposure.records)
613  refs.extend(self.ingestExposureDatasets(exposure))
614  return refs
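# A minimal end-to-end usage sketch (the repository path, run name,
# instrument class, and file names are placeholders, not part of this
# module):
#
#     from lsst.daf.butler import Butler
#     from lsst.obs.base import RawIngestConfig, RawIngestTask
#
#     config = RawIngestConfig()
#     config.instrument = "lsst.obs.example.ExampleInstrument"  # placeholder
#     config.transfer = "symlink"
#     butler = Butler("/path/to/repo", run="raws/example")
#     task = RawIngestTask(config=config, butler=butler)
#     refs = task.run(["file1.fits", "file2.fits"])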