lsst.obs.base  19.0.0
ingest.py
# This file is part of obs_base.
#
# Developed for the LSST Data Management System.
# This product includes software developed by the LSST Project
# (http://www.lsst.org).
# See the COPYRIGHT file at the top-level directory of this distribution
# for details of code ownership.
#
# This program is free software: you can redistribute it and/or modify
# it under the terms of the GNU General Public License as published by
# the Free Software Foundation, either version 3 of the License, or
# (at your option) any later version.
#
# This program is distributed in the hope that it will be useful,
# but WITHOUT ANY WARRANTY; without even the implied warranty of
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
# GNU General Public License for more details.
#
# You should have received a copy of the GNU General Public License
# along with this program.  If not, see <http://www.gnu.org/licenses/>.


__all__ = ("RawIngestTask", "RawIngestConfig", "makeTransferChoiceField")

import os.path
import itertools
from dataclasses import dataclass
from typing import List, Dict, Iterator, Iterable, Type, Optional, Any, Mapping
from collections import defaultdict
from multiprocessing import Pool

from astro_metadata_translator import ObservationInfo, fix_header, merge_headers
from lsst.utils import doImport
from lsst.afw.fits import readMetadata
from lsst.daf.butler import (
    Butler,
    DataCoordinate,
    DatasetRef,
    DatasetType,
    DimensionRecord,
    FileDataset,
)
from lsst.obs.base.instrument import makeExposureRecordFromObsInfo, makeVisitRecordFromObsInfo
from lsst.geom import Box2D
from lsst.pex.config import Config, Field, ChoiceField
from lsst.pipe.base import Task
from lsst.sphgeom import ConvexPolygon

from .fitsRawFormatterBase import FitsRawFormatterBase


@dataclass
class RawFileData:
    """Structure that holds information about a single raw file, used during
    ingest.
    """

    dataId: DataCoordinate
    """Data ID for this file (`lsst.daf.butler.DataCoordinate`).

    This may be a minimal `~lsst.daf.butler.DataCoordinate` base instance, or
    a complete `~lsst.daf.butler.ExpandedDataCoordinate`.
    """

    obsInfo: ObservationInfo
    """Standardized observation metadata extracted directly from the file
    headers (`astro_metadata_translator.ObservationInfo`).
    """

    region: ConvexPolygon
    """Region on the sky covered by this file, possibly with padding
    (`lsst.sphgeom.ConvexPolygon`).
    """

    filename: str
    """Name of the file this information was extracted from (`str`).

    This is the path prior to ingest, not the path after ingest.
    """

    FormatterClass: Type[FitsRawFormatterBase]
    """Formatter class that should be used to ingest this file and compute
    a spatial region for it (`type`; a subclass of `FitsRawFormatterBase`).
    """


@dataclass
class RawExposureData:
    """Structure that holds information about a complete raw exposure, used
    during ingest.
    """

    dataId: DataCoordinate
    """Data ID for this exposure (`lsst.daf.butler.DataCoordinate`).

    This may be a minimal `~lsst.daf.butler.DataCoordinate` base instance, or
    a complete `~lsst.daf.butler.ExpandedDataCoordinate`.
    """

    files: List[RawFileData]
    """List of structures containing file-level information.
    """

    records: Optional[Dict[str, List[DimensionRecord]]] = None
    """Dictionary containing `DimensionRecord` instances that must be inserted
    into the `~lsst.daf.butler.Registry` prior to file-level ingest (`dict`).

    Keys are the names of dimension elements ("exposure" and optionally
    "visit" and "visit_detector_region"), while values are lists of
    `DimensionRecord`.

    May be `None` during some ingest steps.
    """


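# Illustrative sketch (not part of the original module): the shape of a
# populated ``RawExposureData.records`` dict for a single-detector exposure
# that belongs to a visit.  The literal values below are hypothetical.
#
#     {
#         "exposure": [<DimensionRecord: exposure 12345>],
#         "visit": [<DimensionRecord: visit 12345, region=...>],
#         "visit_detector_region": [<DimensionRecord: visit 12345, detector 0>],
#     }
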
def makeTransferChoiceField(doc="How to transfer files (None for no transfer).", default=None):
    """Create a Config field with options for how to transfer files between
    data repositories.

    The allowed options for the field are exactly those supported by
    `lsst.daf.butler.Datastore.ingest`.

    Parameters
    ----------
    doc : `str`
        Documentation for the configuration field.
    default : `str`, optional
        Default transfer mode for the created field.

    Returns
    -------
    field : `lsst.pex.config.ChoiceField`
        Configuration field.
    """
    return ChoiceField(
        doc=doc,
        dtype=str,
        allowed={"move": "move",
                 "copy": "copy",
                 "hardlink": "hard link",
                 "symlink": "symbolic (soft) link"},
        optional=True,
        default=default,
    )


class RawIngestConfig(Config):
    transfer = makeTransferChoiceField()
    padRegionAmount = Field(
        dtype=int,
        default=0,
        doc="Pad an image with the specified number of pixels before calculating region",
    )
    instrument = Field(
        doc=("Fully-qualified Python name of the `Instrument` subclass to "
             "associate with all raws."),
        dtype=str,
        optional=False,
        default=None,
    )


class RawIngestTask(Task):
    """Driver Task for ingesting raw data into Gen3 Butler repositories.

    This Task is intended to be runnable from the command-line, but it doesn't
    meet the other requirements of CmdLineTask or PipelineTask, and wouldn't
    gain much from being one.  It also wouldn't really be appropriate as a
    subtask of a CmdLineTask or PipelineTask; it's a Task essentially just to
    leverage the logging and configurability functionality that `Task`
    provides.

    Each instance of `RawIngestTask` writes to the same Butler.  Each
    invocation of `RawIngestTask.run` ingests a list of files.

    Parameters
    ----------
    config : `RawIngestConfig`
        Configuration for the task.
    butler : `~lsst.daf.butler.Butler`
        Butler instance.  Ingested Datasets will be created as part of
        ``butler.run`` and associated with its Collection.
    kwds
        Additional keyword arguments are forwarded to the `lsst.pipe.base.Task`
        constructor.
    """

    ConfigClass = RawIngestConfig

    _DefaultName = "ingest"

    def getDatasetType(self):
        """Return the DatasetType of the Datasets ingested by this Task.
        """
        return DatasetType("raw", ("instrument", "detector", "exposure"), "Exposure",
                           universe=self.butler.registry.dimensions)

    def __init__(self, config: Optional[RawIngestConfig] = None, *, butler: Butler, **kwds: Any):
        super().__init__(config, **kwds)
        self.butler = butler
        self.universe = self.butler.registry.dimensions
        self.instrument = doImport(self.config.instrument)()
        # For now, we get a nominal Camera from the Instrument.
        # In the future, we may want to load one from a Butler calibration
        # collection that's appropriate for the observation timestamp of
        # the exposure.
        self.camera = self.instrument.getCamera()
        self.datasetType = self.getDatasetType()

    def extractMetadata(self, filename: str) -> RawFileData:
        """Extract and process metadata from a single raw file.

        Parameters
        ----------
        filename : `str`
            Path to the file.

        Returns
        -------
        data : `RawFileData`
            A structure containing the metadata extracted from the file,
            as well as the original filename.  All fields will be populated,
            but the `RawFileData.dataId` attribute will be a minimal
            (unexpanded) `DataCoordinate` instance.
        """
        phdu = readMetadata(filename, 0)
        header = merge_headers([phdu, readMetadata(filename)], mode="overwrite")
        fix_header(header)
        obsInfo = ObservationInfo(header)
        dataId = DataCoordinate.standardize(instrument=obsInfo.instrument,
                                            exposure=obsInfo.exposure_id,
                                            detector=obsInfo.detector_num,
                                            universe=self.universe)
        if obsInfo.instrument != self.instrument.getName():
            raise ValueError(f"Incorrect instrument (expected {self.instrument.getName()}, "
                             f"got {obsInfo.instrument}) for file {filename}.")
        FormatterClass = self.instrument.getRawFormatter(dataId)
        if obsInfo.visit_id is not None and obsInfo.tracking_radec is not None:
            formatter = FormatterClass.fromMetadata(metadata=header, obsInfo=obsInfo)
            visitInfo = formatter.makeVisitInfo()
            detector = self.camera[obsInfo.detector_num]
            wcs = formatter.makeWcs(visitInfo, detector)
            pixBox = Box2D(detector.getBBox())
            if self.config.padRegionAmount > 0:
                pixBox.grow(self.config.padRegionAmount)
            pixCorners = pixBox.getCorners()
            sphCorners = [wcs.pixelToSky(point).getVector() for point in pixCorners]
            region = ConvexPolygon(sphCorners)
        else:
            region = None
        return RawFileData(obsInfo=obsInfo, region=region, filename=filename,
                           FormatterClass=FormatterClass, dataId=dataId)

    def groupByExposure(self, files: Iterable[RawFileData]) -> List[RawExposureData]:
        """Group an iterable of `RawFileData` by exposure.

        Parameters
        ----------
        files : iterable of `RawFileData`
            File-level information to group.

        Returns
        -------
        exposures : `list` of `RawExposureData`
            A list of structures that group the file-level information by
            exposure.  The `RawExposureData.records` attributes of elements
            will be `None`, but all other fields will be populated.  The
            `RawExposureData.dataId` attributes will be minimal (unexpanded)
            `DataCoordinate` instances.
        """
        exposureDimensions = self.universe["exposure"].graph
        byExposure = defaultdict(list)
        for f in files:
            byExposure[f.dataId.subset(exposureDimensions)].append(f)

        return [RawExposureData(dataId=dataId, files=exposureFiles)
                for dataId, exposureFiles in byExposure.items()]

    def collectDimensionRecords(self, exposure: RawExposureData) -> RawExposureData:
        """Collect the `DimensionRecord` instances that must be inserted into
        the `~lsst.daf.butler.Registry` before an exposure's raw files may be
        ingested.

        Parameters
        ----------
        exposure : `RawExposureData`
            A structure containing information about the exposure to be
            ingested.  Should be considered consumed upon return.

        Returns
        -------
        exposure : `RawExposureData`
            An updated version of the input structure, with
            `RawExposureData.records` populated.
        """
        firstFile = exposure.files[0]
        VisitDetectorRegionRecordClass = self.universe["visit_detector_region"].RecordClass
        exposure.records = {
            "exposure": [makeExposureRecordFromObsInfo(firstFile.obsInfo, self.universe)],
        }
        if firstFile.obsInfo.visit_id is not None:
            exposure.records["visit_detector_region"] = []
            visitVertices = []
            for file in exposure.files:
                if file.obsInfo.visit_id != firstFile.obsInfo.visit_id:
                    raise ValueError(f"Inconsistent visit/exposure relationship for "
                                     f"exposure {firstFile.obsInfo.exposure_id} between "
                                     f"{file.filename} and {firstFile.filename}: "
                                     f"{file.obsInfo.visit_id} != {firstFile.obsInfo.visit_id}.")
                if file.region is None:
                    self.log.warn("No region found for visit=%s, detector=%s.", file.obsInfo.visit_id,
                                  file.obsInfo.detector_num)
                    continue
                visitVertices.extend(file.region.getVertices())
                exposure.records["visit_detector_region"].append(
                    VisitDetectorRegionRecordClass.fromDict({
                        "instrument": file.obsInfo.instrument,
                        "visit": file.obsInfo.visit_id,
                        "detector": file.obsInfo.detector_num,
                        "region": file.region,
                    })
                )
            if visitVertices:
                visitRegion = ConvexPolygon(visitVertices)
            else:
                self.log.warn("No region found for visit=%s.", file.obsInfo.visit_id)
                visitRegion = None
            exposure.records["visit"] = [
                makeVisitRecordFromObsInfo(firstFile.obsInfo, self.universe, region=visitRegion)
            ]
        return exposure

    def expandDataIds(self, data: RawExposureData) -> RawExposureData:
        """Expand the data IDs associated with a raw exposure to include
        additional metadata records.

        Parameters
        ----------
        data : `RawExposureData`
            A structure containing information about the exposure to be
            ingested.  Must have `RawExposureData.records` populated.  Should
            be considered consumed upon return.

        Returns
        -------
        exposure : `RawExposureData`
            An updated version of the input structure, with
            `RawExposureData.dataId` and nested `RawFileData.dataId` attributes
            containing `~lsst.daf.butler.ExpandedDataCoordinate` instances.
        """
        hasVisit = "visit" in data.records
        # We start by expanding the exposure-level data ID; we won't use that
        # directly in file ingest, but this lets us do some database lookups
        # once per exposure instead of once per file later.
        data.dataId = self.butler.registry.expandDataId(
            data.dataId,
            # We pass in the records we'll be inserting shortly so they aren't
            # looked up from the database.  We do expect instrument and filter
            # records to be retrieved from the database here (though the
            # Registry may cache them so there isn't a lookup every time).
            records={
                "exposure": data.records["exposure"][0],
                "visit": data.records["visit"][0] if hasVisit else None,
            }
        )
        # Now we expand the per-file (exposure+detector) data IDs.  This time
        # we pass in the records we just retrieved from the exposure data ID
        # expansion as well as the visit_detector_region record, if there is
        # one.
        vdrRecords = data.records["visit_detector_region"] if hasVisit else itertools.repeat(None)
        for file, vdrRecord in zip(data.files, vdrRecords):
            file.dataId = self.butler.registry.expandDataId(
                file.dataId,
                records=dict(data.dataId.records, visit_detector_region=vdrRecord)
            )
        return data

    def prep(self, files, pool: Optional[Pool] = None, processes: int = 1) -> Iterator[RawExposureData]:
        """Perform all ingest preprocessing steps that do not involve actually
        modifying the database.

        Parameters
        ----------
        files : iterable over `str` or path-like objects
            Paths to the files to be ingested.  Will be made absolute
            if they are not already.
        pool : `multiprocessing.Pool`, optional
            If not `None`, a process pool with which to parallelize some
            operations.
        processes : `int`, optional
            The number of processes to use.  Ignored if ``pool`` is not `None`.

        Yields
        ------
        exposure : `RawExposureData`
            Data structures containing dimension records, filenames, and data
            IDs to be ingested (one structure for each exposure).
        """
        if pool is None and processes > 1:
            pool = Pool(processes)
        mapFunc = map if pool is None else pool.imap_unordered

        # Extract metadata and build per-detector regions.
        fileData: Iterator[RawFileData] = mapFunc(self.extractMetadata, files)

        # Use that metadata to group files (and extracted metadata) by
        # exposure.  Never parallelized because it's intrinsically a gather
        # step.
        exposureData: List[RawExposureData] = self.groupByExposure(fileData)

        # The next few operations operate on RawExposureData instances (one at
        # a time) in-place and then return the modified instance.  We call them
        # as pass-throughs instead of relying on the arguments we pass in to
        # have been modified because in the parallel case those arguments are
        # going to be pickled and unpickled, and I'm not certain
        # multiprocessing is careful enough with that for output arguments to
        # work.  We use the same variable names to reflect the fact that we
        # consider the arguments to have been consumed/invalidated.

        # Extract DimensionRecords from the metadata that will need to be
        # inserted into the Registry before the raw datasets themselves are
        # ingested.
        exposureData: Iterator[RawExposureData] = mapFunc(self.collectDimensionRecords, exposureData)

        # Expand the data IDs to include all dimension metadata; we need this
        # because we may need to generate path templates that rely on that
        # metadata.
        # This is the first step that involves actual database calls (but just
        # SELECTs), so if there's going to be a problem with connections vs.
        # multiple processes, or lock contention (in SQLite) slowing things
        # down, it'll happen here.
        return mapFunc(self.expandDataIds, exposureData)

    def insertDimensionData(self, records: Mapping[str, List[DimensionRecord]]):
        """Insert dimension records for one or more exposures.

        Parameters
        ----------
        records : `dict` mapping `str` to `list`
            Dimension records to be inserted, organized as a mapping from
            dimension name to a list of records for that dimension.  This
            may be a single `RawExposureData.records` dict, or an aggregate
            for multiple exposures created by concatenating the value lists
            of those dictionaries.
        """
        # TODO: This currently assumes that either duplicate inserts of
        # visit records are ignored, or there is exactly one visit per
        # exposure.  I expect us to switch up the visit-exposure
        # relationship and hence rewrite some of this code before that
        # becomes a practical problem.
        # Iterate over dimensions explicitly to order for foreign key
        # relationships.
        for dimension in ("visit", "exposure", "visit_detector_region"):
            recordsForDimension = records.get(dimension)
            if recordsForDimension:
                # TODO: once Registry has options to ignore or replace
                # existing dimension records with the same primary keys
                # instead of aborting on conflicts, add configuration
                # options and logic to use them.
                self.butler.registry.insertDimensionData(dimension, *recordsForDimension)

    def ingestExposureDatasets(self, exposure: RawExposureData, butler: Optional[Butler] = None
                               ) -> List[DatasetRef]:
        """Ingest all raw files in one exposure.

        Parameters
        ----------
        exposure : `RawExposureData`
            A structure containing information about the exposure to be
            ingested.  Must have `RawExposureData.records` populated and all
            data ID attributes expanded.
        butler : `lsst.daf.butler.Butler`, optional
            Butler to use for ingest.  If not provided, ``self.butler`` will
            be used.

        Returns
        -------
        refs : `list` of `lsst.daf.butler.DatasetRef`
            Dataset references for ingested raws.
        """
        if butler is None:
            butler = self.butler
        datasets = [FileDataset(path=os.path.abspath(file.filename),
                                ref=DatasetRef(self.datasetType, file.dataId),
                                formatter=file.FormatterClass)
                    for file in exposure.files]
        butler.ingest(*datasets, transfer=self.config.transfer)
        return [dataset.ref for dataset in datasets]

    def run(self, files, pool: Optional[Pool] = None, processes: int = 1):
        """Ingest files into a Butler data repository.

        This creates any new exposure or visit Dimension entries needed to
        identify the ingested files, creates new Dataset entries in the
        Registry and finally ingests the files themselves into the Datastore.
        Any needed instrument, detector, and physical_filter Dimension entries
        must exist in the Registry before `run` is called.

        Parameters
        ----------
        files : iterable over `str` or path-like objects
            Paths to the files to be ingested.  Will be made absolute
            if they are not already.
        pool : `multiprocessing.Pool`, optional
            If not `None`, a process pool with which to parallelize some
            operations.
        processes : `int`, optional
            The number of processes to use.  Ignored if ``pool`` is not `None`.

        Returns
        -------
        refs : `list` of `lsst.daf.butler.DatasetRef`
            Dataset references for ingested raws.

        Notes
        -----
        This method inserts all records (dimensions and datasets) for an
        exposure within a transaction, guaranteeing that partial exposures
        are never ingested.
        """
        exposureData = self.prep(files, pool=pool, processes=processes)
        # Up to this point, we haven't modified the data repository at all.
        # Now we finally do that, with one transaction per exposure.  This is
        # not parallelized at present because the performance of this step is
        # limited by the database server.  That may or may not change in the
        # future once we increase our usage of bulk inserts and reduce our
        # usage of savepoints; we've tried to get everything but the database
        # operations done in advance to reduce the time spent inside
        # transactions.
        self.butler.registry.registerDatasetType(self.datasetType)
        refs = []
        for exposure in exposureData:
            with self.butler.transaction():
                self.insertDimensionData(exposure.records)
                refs.extend(self.ingestExposureDatasets(exposure))
        return refs
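
A minimal usage sketch, not taken from this module: it constructs the task against an existing Gen3 repository and ingests a few raw files. The repository path, run name, instrument class name, and file list below are hypothetical placeholders; only RawIngestTask, RawIngestConfig, and lsst.daf.butler.Butler come from the code above.

    from lsst.daf.butler import Butler
    from lsst.obs.base.ingest import RawIngestConfig, RawIngestTask

    # Assumed: a Gen3 repository already exists at this (hypothetical) path,
    # and its registry already contains the instrument, detector, and
    # physical_filter dimension records required before RawIngestTask.run.
    butler = Butler("/path/to/repo", run="raw")

    config = RawIngestConfig()
    config.instrument = "lsst.obs.example.ExampleInstrument"  # hypothetical Instrument subclass
    config.transfer = "symlink"

    task = RawIngestTask(config=config, butler=butler)
    refs = task.run(["/data/raw/exp-0001.fits", "/data/raw/exp-0002.fits"])
    print(f"Ingested {len(refs)} raw datasets")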