lsst.obs.base  18.1.0-21-gde80ed3+5
ingest.py
# This file is part of obs_base.
#
# Developed for the LSST Data Management System.
# This product includes software developed by the LSST Project
# (http://www.lsst.org).
# See the COPYRIGHT file at the top-level directory of this distribution
# for details of code ownership.
#
# This program is free software: you can redistribute it and/or modify
# it under the terms of the GNU General Public License as published by
# the Free Software Foundation, either version 3 of the License, or
# (at your option) any later version.
#
# This program is distributed in the hope that it will be useful,
# but WITHOUT ANY WARRANTY; without even the implied warranty of
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
# GNU General Public License for more details.
#
# You should have received a copy of the GNU General Public License
# along with this program. If not, see <http://www.gnu.org/licenses/>.


__all__ = ("RawIngestTask", "RawIngestConfig", "makeTransferChoiceField")

import os.path
import itertools
from dataclasses import dataclass
from typing import List, Dict, Iterator, Iterable, Type, Optional, Any, Mapping
from collections import defaultdict
from multiprocessing import Pool

from astro_metadata_translator import ObservationInfo, fix_header, merge_headers
from lsst.utils import doImport
from lsst.afw.fits import readMetadata
from lsst.daf.butler import (
    Butler,
    DataCoordinate,
    DatasetRef,
    DatasetType,
    DimensionRecord,
)
from lsst.daf.butler.instrument import makeExposureRecordFromObsInfo, makeVisitRecordFromObsInfo
from lsst.geom import Box2D
from lsst.pex.config import Config, Field, ChoiceField
from lsst.pipe.base import Task
from lsst.sphgeom import ConvexPolygon

from .fitsRawFormatterBase import FitsRawFormatterBase


@dataclass
class RawFileData:
    """Structure that holds information about a single raw file, used during
    ingest.
    """

    dataId: DataCoordinate
    """Data ID for this file (`lsst.daf.butler.DataCoordinate`).

    This may be a minimal `~lsst.daf.butler.DataCoordinate` base instance, or
    a complete `~lsst.daf.butler.ExpandedDataCoordinate`.
    """

    obsInfo: ObservationInfo
    """Standardized observation metadata extracted directly from the file
    headers (`astro_metadata_translator.ObservationInfo`).
    """

    region: ConvexPolygon
    """Region on the sky covered by this file, possibly with padding
    (`lsst.sphgeom.ConvexPolygon`).
    """

    filename: str
    """Name of the file this information was extracted from (`str`).

    This is the path prior to ingest, not the path after ingest.
    """

    FormatterClass: Type[FitsRawFormatterBase]
    """Formatter class that should be used to ingest this file and compute
    a spatial region for it (`type`; as subclass of `FitsRawFormatterBase`).
    """


@dataclass
class RawExposureData:
    """Structure that holds information about a complete raw exposure, used
    during ingest.
    """

    dataId: DataCoordinate
    """Data ID for this exposure (`lsst.daf.butler.DataCoordinate`).

    This may be a minimal `~lsst.daf.butler.DataCoordinate` base instance, or
    a complete `~lsst.daf.butler.ExpandedDataCoordinate`.
    """

    files: List[RawFileData]
    """List of structures containing file-level information.
    """

    records: Optional[Dict[str, List[DimensionRecord]]] = None
    """Dictionary containing `DimensionRecord` instances that must be inserted
    into the `~lsst.daf.butler.Registry` prior to file-level ingest (`dict`).

    Keys are the names of dimension elements ("exposure" and optionally "visit"
    and "visit_detector_region"), while values are lists of `DimensionRecord`.

    May be `None` during some ingest steps.
    """


def makeTransferChoiceField(doc="How to transfer files (None for no transfer).", default=None):
    """Create a Config field with options for how to transfer files between
    data repositories.

    The allowed options for the field are exactly those supported by
    `lsst.daf.butler.Datastore.ingest`.

    Parameters
    ----------
    doc : `str`
        Documentation for the configuration field.
    default : `str`, optional
        Default transfer mode for the created field.

    Returns
    -------
    field : `lsst.pex.config.ChoiceField`
        Configuration field.
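
    Examples
    --------
    A minimal sketch of use inside a `lsst.pex.config.Config` subclass; the
    class name is illustrative only::

        class ExampleIngestConfig(Config):
            transfer = makeTransferChoiceField(default="symlink")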
    """
    return ChoiceField(
        doc=doc,
        dtype=str,
        allowed={"move": "move",
                 "copy": "copy",
                 "hardlink": "hard link",
                 "symlink": "symbolic (soft) link"},
        optional=True,
        default=default
    )


class RawIngestConfig(Config):
    transfer = makeTransferChoiceField()
    padRegionAmount = Field(
        dtype=int,
        default=0,
        doc="Pad an image with specified number of pixels before calculating region"
    )
    instrument = Field(
        doc=("Fully-qualified Python name of the `Instrument` subclass to "
             "associate with all raws."),
        dtype=str,
        optional=False,
        default=None,
    )


class RawIngestTask(Task):
    """Driver Task for ingesting raw data into Gen3 Butler repositories.

    This Task is intended to be runnable from the command-line, but it doesn't
    meet the other requirements of CmdLineTask or PipelineTask, and wouldn't
    gain much from being one. It also wouldn't really be appropriate as a
    subtask of a CmdLineTask or PipelineTask; it's a Task essentially just to
    leverage the logging and configurability support that `Task` provides.

    Each instance of `RawIngestTask` writes to the same Butler. Each
    invocation of `RawIngestTask.run` ingests a list of files.

    Parameters
    ----------
    config : `RawIngestConfig`
        Configuration for the task.
    butler : `~lsst.daf.butler.Butler`
        Butler instance. Ingested Datasets will be created as part of
        ``butler.run`` and associated with its Collection.
    kwds
        Additional keyword arguments are forwarded to the `lsst.pipe.base.Task`
        constructor.
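
    Examples
    --------
    A minimal usage sketch. The repository path, run name, file names, and
    the fully-qualified ``Instrument`` class are illustrative only::

        from lsst.daf.butler import Butler

        butler = Butler("/path/to/repo", run="raw")
        config = RawIngestTask.ConfigClass()
        config.instrument = "mypackage.MyInstrument"  # hypothetical Instrument subclass
        config.transfer = "symlink"
        task = RawIngestTask(config=config, butler=butler)
        refs = task.run(["raw_0001.fits", "raw_0002.fits"])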
    """

    ConfigClass = RawIngestConfig

    _DefaultName = "ingest"

    def getDatasetType(self):
        """Return the DatasetType of the Datasets ingested by this Task.
        """
        return DatasetType("raw", ("instrument", "detector", "exposure"), "Exposure",
                           universe=self.butler.registry.dimensions)

    def __init__(self, config: Optional[RawIngestConfig] = None, *, butler: Butler, **kwds: Any):
        super().__init__(config, **kwds)
        self.butler = butler
        self.universe = self.butler.registry.dimensions
        self.instrument = doImport(self.config.instrument)()
        # For now, we get a nominal Camera from the Instrument.
        # In the future, we may want to load one from a Butler calibration
        # collection that's appropriate for the observation timestamp of
        # the exposure.
        self.camera = self.instrument.getCamera()
        self.datasetType = self.getDatasetType()

    def extractMetadata(self, filename: str) -> RawFileData:
        """Extract and process metadata from a single raw file.

        Parameters
        ----------
        filename : `str`
            Path to the file.

        Returns
        -------
        data : `RawFileData`
            A structure containing the metadata extracted from the file,
            as well as the original filename. All fields will be populated,
            but the `RawFileData.dataId` attribute will be a minimal
            (unexpanded) `DataCoordinate` instance.
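
        Examples
        --------
        A sketch of standalone use on a single file, assuming ``task`` is a
        constructed `RawIngestTask`; the file name is illustrative only::

            data = task.extractMetadata("raw_0001.fits")
            print(data.obsInfo.exposure_id, data.dataId)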
        """
        phdu = readMetadata(filename, 0)
        header = merge_headers([phdu, readMetadata(filename)], mode="overwrite")
        fix_header(header)
        obsInfo = ObservationInfo(header)
        dataId = DataCoordinate.standardize(instrument=obsInfo.instrument,
                                            exposure=obsInfo.exposure_id,
                                            detector=obsInfo.detector_num,
                                            universe=self.universe)
        if obsInfo.instrument != self.instrument.getName():
            raise ValueError(f"Incorrect instrument (expected {self.instrument.getName()}, "
                             f"got {obsInfo.instrument}) for file {filename}.")
        FormatterClass = self.instrument.getRawFormatter(dataId)
        if obsInfo.visit_id is not None and obsInfo.tracking_radec is not None:
            formatter = FormatterClass.fromMetadata(metadata=header, obsInfo=obsInfo)
            visitInfo = formatter.makeVisitInfo()
            detector = self.camera[obsInfo.detector_num]
            wcs = formatter.makeWcs(visitInfo, detector)
            pixBox = Box2D(detector.getBBox())
            if self.config.padRegionAmount > 0:
                pixBox.grow(self.config.padRegionAmount)
            pixCorners = pixBox.getCorners()
            sphCorners = [wcs.pixelToSky(point).getVector() for point in pixCorners]
            region = ConvexPolygon(sphCorners)
        else:
            region = None
        return RawFileData(obsInfo=obsInfo, region=region, filename=filename,
                           FormatterClass=FormatterClass, dataId=dataId)

    def groupByExposure(self, files: Iterable[RawFileData]) -> List[RawExposureData]:
        """Group an iterable of `RawFileData` by exposure.

        Parameters
        ----------
        files : iterable of `RawFileData`
            File-level information to group.

        Returns
        -------
        exposures : `list` of `RawExposureData`
            A list of structures that group the file-level information by
            exposure. The `RawExposureData.records` attributes of elements
            will be `None`, but all other fields will be populated. The
            `RawExposureData.dataId` attributes will be minimal (unexpanded)
            `DataCoordinate` instances.
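
        Examples
        --------
        A sketch using the output of `extractMetadata`, assuming ``task`` is
        a constructed `RawIngestTask`; file names are illustrative only::

            fileData = [task.extractMetadata(f) for f in ("a.fits", "b.fits")]
            exposures = task.groupByExposure(fileData)
            for exposure in exposures:
                print(exposure.dataId, len(exposure.files))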
        """
        exposureDimensions = self.universe["exposure"].graph
        byExposure = defaultdict(list)
        for f in files:
            byExposure[f.dataId.subset(exposureDimensions)].append(f)

        return [RawExposureData(dataId=dataId, files=exposureFiles)
                for dataId, exposureFiles in byExposure.items()]

    def collectDimensionRecords(self, exposure: RawExposureData) -> RawExposureData:
        """Collect the `DimensionRecord` instances that must be inserted into
        the `~lsst.daf.butler.Registry` before an exposure's raw files may be
        ingested.

        Parameters
        ----------
        exposure : `RawExposureData`
            A structure containing information about the exposure to be
            ingested. Should be considered consumed upon return.

        Returns
        -------
        exposure : `RawExposureData`
            An updated version of the input structure, with
            `RawExposureData.records` populated.
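
        Examples
        --------
        A sketch continuing from `groupByExposure`, assuming ``task`` is a
        constructed `RawIngestTask` and ``exposures`` is a list of
        `RawExposureData`::

            exposure = task.collectDimensionRecords(exposures[0])
            print(sorted(exposure.records.keys()))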
        """
        firstFile = exposure.files[0]
        VisitDetectorRegionRecordClass = self.universe["visit_detector_region"].RecordClass
        exposure.records = {
            "exposure": [makeExposureRecordFromObsInfo(firstFile.obsInfo, self.universe)],
        }
        if firstFile.obsInfo.visit_id is not None:
            exposure.records["visit_detector_region"] = []
            visitVertices = []
            for file in exposure.files:
                if file.obsInfo.visit_id != firstFile.obsInfo.visit_id:
                    raise ValueError(f"Inconsistent visit/exposure relationship for "
                                     f"exposure {firstFile.obsInfo.exposure_id} between "
                                     f"{file.filename} and {firstFile.filename}: "
                                     f"{file.obsInfo.visit_id} != {firstFile.obsInfo.visit_id}.")
                if file.region is None:
                    self.log.warn("No region found for visit=%s, detector=%s.", file.obsInfo.visit_id,
                                  file.obsInfo.detector_num)
                    continue
                visitVertices.extend(file.region.getVertices())
                exposure.records["visit_detector_region"].append(
                    VisitDetectorRegionRecordClass.fromDict({
                        "instrument": file.obsInfo.instrument,
                        "visit": file.obsInfo.visit_id,
                        "detector": file.obsInfo.detector_num,
                        "region": file.region,
                    })
                )
            if visitVertices:
                visitRegion = ConvexPolygon(visitVertices)
            else:
                self.log.warn("No region found for visit=%s.", file.obsInfo.visit_id)
                visitRegion = None
            exposure.records["visit"] = [
                makeVisitRecordFromObsInfo(firstFile.obsInfo, self.universe, region=visitRegion)
            ]
        return exposure

    def expandDataIds(self, data: RawExposureData) -> RawExposureData:
        """Expand the data IDs associated with a raw exposure to include
        additional metadata records.

        Parameters
        ----------
        data : `RawExposureData`
            A structure containing information about the exposure to be
            ingested. Must have `RawExposureData.records` populated. Should
            be considered consumed upon return.

        Returns
        -------
        exposure : `RawExposureData`
            An updated version of the input structure, with
            `RawExposureData.dataId` and nested `RawFileData.dataId` attributes
            containing `~lsst.daf.butler.ExpandedDataCoordinate` instances.
        """
        hasVisit = "visit" in data.records
        # We start by expanding the exposure-level data ID; we won't use that
        # directly in file ingest, but this lets us do some database lookups
        # once per exposure instead of once per file later.
        data.dataId = self.butler.registry.expandDataId(
            data.dataId,
            # We pass in the records we'll be inserting shortly so they aren't
            # looked up from the database. We do expect instrument and filter
            # records to be retrieved from the database here (though the
            # Registry may cache them so there isn't a lookup every time).
            records={
                "exposure": data.records["exposure"][0],
                "visit": data.records["visit"][0] if hasVisit else None,
            }
        )
        # Now we expand the per-file (exposure+detector) data IDs. This time
        # we pass in the records we just retrieved from the exposure data ID
        # expansion as well as the visit_detector_region record, if there is
        # one.
        vdrRecords = data.records["visit_detector_region"] if hasVisit else itertools.repeat(None)
        for file, vdrRecord in zip(data.files, vdrRecords):
            file.dataId = self.butler.registry.expandDataId(
                file.dataId,
                records=dict(data.dataId.records, visit_detector_region=vdrRecord)
            )
        return data

    def prep(self, files, pool: Optional[Pool] = None, processes: int = 1) -> Iterator[RawExposureData]:
        """Perform all ingest preprocessing steps that do not involve actually
        modifying the database.

        Parameters
        ----------
        files : iterable over `str` or path-like objects
            Paths to the files to be ingested. Will be made absolute
            if they are not already.
        pool : `multiprocessing.Pool`, optional
            If not `None`, a process pool with which to parallelize some
            operations.
        processes : `int`, optional
            The number of processes to use. Ignored if ``pool`` is not `None`.

        Yields
        ------
        exposure : `RawExposureData`
            Data structures containing dimension records, filenames, and data
            IDs to be ingested (one structure for each exposure).
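
        Examples
        --------
        A sketch of inspecting the prepared exposures without ingesting them,
        assuming ``task`` is a constructed `RawIngestTask`; file names are
        illustrative only::

            for exposure in task.prep(["a.fits", "b.fits"], processes=4):
                print(exposure.dataId, len(exposure.files))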
        """
        if pool is None and processes > 1:
            pool = Pool(processes)
        mapFunc = map if pool is None else pool.imap_unordered

        # Extract metadata and build per-detector regions.
        fileData: Iterator[RawFileData] = mapFunc(self.extractMetadata, files)

        # Use that metadata to group files (and extracted metadata) by
        # exposure. Never parallelized because it's intrinsically a gather
        # step.
        exposureData: List[RawExposureData] = self.groupByExposure(fileData)

        # The next few operations operate on RawExposureData instances (one at
        # a time) in-place and then return the modified instance. We call them
        # as pass-throughs instead of relying on the arguments we pass in to
        # have been modified because in the parallel case those arguments are
        # going to be pickled and unpickled, and I'm not certain
        # multiprocessing is careful enough with that for output arguments to
        # work. We use the same variable names to reflect the fact that we
        # consider the arguments to have been consumed/invalidated.

        # Extract DimensionRecords from the metadata that will need to be
        # inserted into the Registry before the raw datasets themselves are
        # ingested.
        exposureData: Iterator[RawExposureData] = mapFunc(self.collectDimensionRecords, exposureData)

        # Expand the data IDs to include all dimension metadata; we need this
        # because we may need to generate path templates that rely on that
        # metadata.
        # This is the first step that involves actual database calls (but just
        # SELECTs), so if there's going to be a problem with connections vs.
        # multiple processes, or lock contention (in SQLite) slowing things
        # down, it'll happen here.
        return mapFunc(self.expandDataIds, exposureData)

    def insertDimensionData(self, records: Mapping[str, List[DimensionRecord]]):
        """Insert dimension records for one or more exposures.

        Parameters
        ----------
        records : `dict` mapping `str` to `list`
            Dimension records to be inserted, organized as a mapping from
            dimension name to a list of records for that dimension. This
            may be a single `RawExposureData.records` dict, or an aggregate
            for multiple exposures created by concatenating the value lists
            of those dictionaries.
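
        Examples
        --------
        A sketch of aggregating the records of several exposures into a
        single insert, assuming ``task`` is a constructed `RawIngestTask` and
        ``exposures`` is a list of `RawExposureData` with ``records``
        populated::

            from collections import defaultdict

            combined = defaultdict(list)
            for exposure in exposures:
                for dimension, recordList in exposure.records.items():
                    combined[dimension].extend(recordList)
            task.insertDimensionData(combined)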
        """
        # TODO: This currently assumes that either duplicate inserts of
        # visit records are ignored, or there is exactly one visit per
        # exposure. I expect us to switch up the visit-exposure
        # relationship and hence rewrite some of this code before that
        # becomes a practical problem.
        # Iterate over dimensions explicitly to order for foreign key
        # relationships.
        for dimension in ("visit", "exposure", "visit_detector_region"):
            recordsForDimension = records.get(dimension)
            if recordsForDimension:
                # TODO: once Registry has options to ignore or replace
                # existing dimension records with the same primary keys
                # instead of aborting on conflicts, add configuration
                # options and logic to use them.
                self.butler.registry.insertDimensionData(dimension, *recordsForDimension)

    def ingestExposureDatasets(self, exposure: RawExposureData, butler: Optional[Butler] = None
                               ) -> List[DatasetRef]:
        """Ingest all raw files in one exposure.

        Parameters
        ----------
        exposure : `RawExposureData`
            A structure containing information about the exposure to be
            ingested. Must have `RawExposureData.records` populated and all
            data ID attributes expanded.
        butler : `lsst.daf.butler.Butler`, optional
            Butler to use for ingest. If not provided, ``self.butler`` will
            be used.

        Returns
        -------
        refs : `list` of `lsst.daf.butler.DatasetRef`
            Dataset references for ingested raws.
        """
        # TODO: once Butler has the ability to do bulk inserts of
        # dataset rows (or at least avoid per-dataset savepoints),
        # use that.
        refs = []
        if butler is None:
            butler = self.butler
        for file in exposure.files:
            path = os.path.abspath(file.filename)
            ref = butler.ingest(path, self.datasetType, file.dataId,
                                transfer=self.config.transfer,
                                formatter=file.FormatterClass)
            refs.append(ref)
        return refs

    def run(self, files, pool: Optional[Pool] = None, processes: int = 1):
        """Ingest files into a Butler data repository.

        This creates any new exposure or visit Dimension entries needed to
        identify the ingested files, creates new Dataset entries in the
        Registry and finally ingests the files themselves into the Datastore.
        Any needed instrument, detector, and physical_filter Dimension entries
        must exist in the Registry before `run` is called.

        Parameters
        ----------
        files : iterable over `str` or path-like objects
            Paths to the files to be ingested. Will be made absolute
            if they are not already.
        pool : `multiprocessing.Pool`, optional
            If not `None`, a process pool with which to parallelize some
            operations.
        processes : `int`, optional
            The number of processes to use. Ignored if ``pool`` is not `None`.

        Returns
        -------
        refs : `list` of `lsst.daf.butler.DatasetRef`
            Dataset references for ingested raws.

        Notes
        -----
        This method inserts all records (dimensions and datasets) for an
        exposure within a transaction, guaranteeing that partial exposures
        are never ingested.
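
        Examples
        --------
        A minimal sketch, parallelizing metadata extraction over four
        processes and assuming ``task`` is a constructed `RawIngestTask`;
        file names are illustrative only::

            refs = task.run(["raw_0001.fits", "raw_0002.fits"], processes=4)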
        """
        exposureData = self.prep(files, pool=pool, processes=processes)
        # Up to this point, we haven't modified the data repository at all.
        # Now we finally do that, with one transaction per exposure. This is
        # not parallelized at present because the performance of this step is
        # limited by the database server. That may or may not change in the
        # future once we increase our usage of bulk inserts and reduce our
        # usage of savepoints; we've tried to get everything but the database
        # operations done in advance to reduce the time spent inside
        # transactions.
        self.butler.registry.registerDatasetType(self.datasetType)
        refs = []
        for exposure in exposureData:
            with self.butler.transaction():
                self.insertDimensionData(exposure.records)
                refs.extend(self.ingestExposureDatasets(exposure))
        return refs