__all__ = ("RawIngestTask", "RawIngestConfig", "makeTransferChoiceField")
import os
import itertools
from dataclasses import dataclass
from typing import List, Dict, Iterator, Iterable, Type, Optional, Any, Mapping
from collections import defaultdict
from multiprocessing import Pool

from astro_metadata_translator import ObservationInfo, fix_header, merge_headers

from lsst.afw.fits import readMetadata
from lsst.daf.butler import (
    Butler,
    DataCoordinate,
    DatasetRef,
    DatasetType,
    DimensionRecord,
    FileDataset,
)
from lsst.geom import Box2D
from lsst.pex.config import Config, Field, ChoiceField
from lsst.pipe.base import Task
from lsst.sphgeom import ConvexPolygon

from .fitsRawFormatterBase import FitsRawFormatterBase
54 """Structure that hold information about a single dataset within a
58 dataId: DataCoordinate
59 """Data ID for this file (`lsst.daf.butler.DataCoordinate`).
61 This may be a minimal `~lsst.daf.butler.DataCoordinate` base instance, or
62 a complete `~lsst.daf.butler.ExpandedDataCoordinate`.
65 obsInfo: ObservationInfo
66 """Standardized observation metadata extracted directly from the file
67 headers (`astro_metadata_translator.ObservationInfo`).
71 """Region on the sky covered by this file, possibly with padding
72 (`lsst.sphgeom.ConvexPolygon`).
78 """Structure that holds information about a single raw file, used during
82 datasets: List[RawFileDatasetInfo]
83 """The information describing each dataset within this raw file.
84 (`list` of `RawFileDatasetInfo`)
88 """Name of the file this information was extracted from (`str`).
90 This is the path prior to ingest, not the path after ingest.
93 FormatterClass: Type[FitsRawFormatterBase]
94 """Formatter class that should be used to ingest this file and compute
95 a spatial region for it (`type`; as subclass of `FitsRawFormatterBase`).
101 """Structure that holds information about a complete raw exposure, used
105 dataId: DataCoordinate
106 """Data ID for this exposure (`lsst.daf.butler.DataCoordinate`).
108 This may be a minimal `~lsst.daf.butler.DataCoordinate` base instance, or
109 a complete `~lsst.daf.butler.ExpandedDataCoordinate`.
112 files: List[RawFileData]
113 """List of structures containing file-level information.
116 records: Optional[Dict[str, List[DimensionRecord]]] =
None
117 """Dictionary containing `DimensionRecord` instances that must be inserted
118 into the `~lsst.daf.butler.Registry` prior to file-level ingest (`dict`).
120 Keys are the names of dimension elements ("exposure" and optionally "visit"
121 and "visit_detector_region"), while values are lists of `DimensionRecord`.
123 May be `None` during some ingest steps.
128 """Create a Config field with options for how to transfer files between
131 The allowed options for the field are exactly those supported by
132 `lsst.daf.butler.Datastore.ingest`.
137 Documentation for the configuration field.
141 field : `lsst.pex.config.ChoiceField`
147 allowed={
"move":
"move",
149 "hardlink":
"hard link",
150 "symlink":
"symbolic (soft) link"},
class RawIngestConfig(Config):
    transfer = makeTransferChoiceField()
    padRegionAmount = Field(
        dtype=int,
        default=0,
        doc="Pad an image with specified number of pixels before calculating region",
    )
    instrument = Field(
        dtype=str,
        doc=("Fully-qualified Python name of the `Instrument` subclass to "
             "associate with all raws."),
    )
173 """Driver Task for ingesting raw data into Gen3 Butler repositories.
175 This Task is intended to be runnable from the command-line, but it doesn't
176 meet the other requirements of CmdLineTask or PipelineTask, and wouldn't
177 gain much from being one. It also wouldn't really be appropriate as a
178 subtask of a CmdLineTask or PipelineTask; it's a Task essentially just to
179 leverage the logging and configurability functionality that provides.
181 Each instance of `RawIngestTask` writes to the same Butler. Each
182 invocation of `RawIngestTask.run` ingests a list of files.
186 config : `RawIngestConfig`
187 Configuration for the task.
188 butler : `~lsst.daf.butler.Butler`
189 Butler instance. Ingested Datasets will be created as part of
190 ``butler.run`` and associated with its Collection.
192 Additional keyword arguments are forwarded to the `lsst.pipe.base.Task`
195 Other keyword arguments are forwarded to the Task base class constructor.
    ConfigClass = RawIngestConfig
    _DefaultName = "ingest"
203 """Return the DatasetType of the Datasets ingested by this Task.
205 return DatasetType(
"raw", (
"instrument",
"detector",
"exposure"),
"Exposure",
206 universe=self.
butler.registry.dimensions)
    def __init__(self, config: Optional[RawIngestConfig] = None, *, butler: Butler, **kwds: Any):
221 """Extract and process metadata from a single raw file.
231 A structure containing the metadata extracted from the file,
232 as well as the original filename. All fields will be populated,
233 but the `RawFileData.dataId` attribute will be a minimal
234 (unexpanded) `DataCoordinate` instance.
238 Assumes that there is a single dataset associated with the given
239 file. Instruments using a single file to store multiple datasets
240 must implement their own version of this method.
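        # Combine the primary-HDU header with the default header returned by
        # readMetadata so the translator sees a complete set of keywords.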
        phdu = readMetadata(filename, 0)
        header = merge_headers([phdu, readMetadata(filename)], mode="overwrite")
        fix_header(header)
        datasets = [self._calculate_dataset_info(header, filename)]
        FormatterClass = self.instrument.getRawFormatter(datasets[0].dataId)
        return RawFileData(datasets=datasets, filename=filename,
                           FormatterClass=FormatterClass)
    def _calculate_dataset_info(self, header, filename):
        """Calculate a RawFileDatasetInfo from the supplied information.

        Parameters
        ----------
        header : `Mapping`
            Header from the dataset.
        filename : `str`
            Filename to use for error messages.

        Returns
        -------
        dataset : `RawFileDatasetInfo`
            The region, dataId, and observation information associated with
            this dataset.
        """
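        # Translate the raw header into standardized observation metadata and
        # build a minimal {instrument, exposure, detector} data ID from it.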
        obsInfo = ObservationInfo(header)
        dataId = DataCoordinate.standardize(instrument=obsInfo.instrument,
                                            exposure=obsInfo.exposure_id,
                                            detector=obsInfo.detector_num,
                                            universe=self.universe)
        if obsInfo.instrument != self.instrument.getName():
            raise ValueError(f"Incorrect instrument (expected {self.instrument.getName()}, "
                             f"got {obsInfo.instrument}) for file {filename}.")
        FormatterClass = self.instrument.getRawFormatter(dataId)
        region = self._calculate_region_from_dataset_metadata(obsInfo, header, FormatterClass)
        return RawFileDatasetInfo(obsInfo=obsInfo, region=region, dataId=dataId)
    def _calculate_region_from_dataset_metadata(self, obsInfo, header, FormatterClass):
        """Calculate the sky region covered by the supplied observation
        information.

        Parameters
        ----------
        obsInfo : `~astro_metadata_translator.ObservationInfo`
            Summary information of this dataset.
        header : `Mapping`
            Header from the dataset.
        FormatterClass : `type`, a subclass of `FitsRawFormatterBase`
            Formatter class that should be used to compute the spatial region.

        Returns
        -------
        region : `lsst.sphgeom.ConvexPolygon`
            Region of sky covered by this observation.
        """
        if obsInfo.visit_id is not None and obsInfo.tracking_radec is not None:
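            # Use the formatter to reconstruct the WCS for this detector, pad
            # the detector bounding box if configured, and convert its corners
            # to sky coordinates to form a convex polygon.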
            formatter = FormatterClass.fromMetadata(metadata=header, obsInfo=obsInfo)
            visitInfo = formatter.makeVisitInfo()
            detector = self.camera[obsInfo.detector_num]
            wcs = formatter.makeWcs(visitInfo, detector)
            pixBox = Box2D(detector.getBBox())
            if self.config.padRegionAmount > 0:
                pixBox.grow(self.config.padRegionAmount)
            pixCorners = pixBox.getCorners()
            sphCorners = [wcs.pixelToSky(point).getVector() for point in pixCorners]
            region = ConvexPolygon(sphCorners)
        else:
            region = None
        return region
320 """Group an iterable of `RawFileData` by exposure.
324 files : iterable of `RawFileData`
325 File-level information to group.
329 exposures : `list` of `RawExposureData`
330 A list of structures that group the file-level information by
331 exposure. The `RawExposureData.records` attributes of elements
332 will be `None`, but all other fields will be populated. The
333 `RawExposureData.dataId` attributes will be minimal (unexpanded)
334 `DataCoordinate` instances.
336 exposureDimensions = self.
universe[
"exposure"].graph
        byExposure = defaultdict(list)
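        # Group files by the exposure-dimension subset of their data IDs.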
        for f in files:
            byExposure[f.datasets[0].dataId.subset(exposureDimensions)].append(f)
        return [RawExposureData(dataId=dataId, files=exposureFiles)
                for dataId, exposureFiles in byExposure.items()]
346 """Collect the `DimensionRecord` instances that must be inserted into
347 the `~lsst.daf.butler.Registry` before an exposure's raw files may be.
351 exposure : `RawExposureData`
352 A structure containing information about the exposure to be
353 ingested. Should be considered consumed upon return.
357 exposure : `RawExposureData`
358 An updated version of the input structure, with
359 `RawExposureData.records` populated.
361 firstFile = exposure.files[0]
362 firstDataset = firstFile.datasets[0]
363 VisitDetectorRegionRecordClass = self.
universe[
"visit_detector_region"].RecordClass
        if firstDataset.obsInfo.visit_id is not None:
            exposure.records["visit_detector_region"] = []
            visitVertices = []
            for file in exposure.files:
                for dataset in file.datasets:
                    if dataset.obsInfo.visit_id != firstDataset.obsInfo.visit_id:
                        raise ValueError(
                            f"Inconsistent visit/exposure relationship for "
                            f"exposure {firstDataset.obsInfo.exposure_id} between "
                            f"{file.filename} and {firstFile.filename}: "
                            f"{dataset.obsInfo.visit_id} != {firstDataset.obsInfo.visit_id}."
                        )
                    if dataset.region is None:
                        self.log.warn("No region found for visit=%s, detector=%s.",
                                      dataset.obsInfo.visit_id, dataset.obsInfo.detector_num)
                        continue
                    visitVertices.extend(dataset.region.getVertices())
                    exposure.records["visit_detector_region"].append(
                        VisitDetectorRegionRecordClass.fromDict({
                            "instrument": dataset.obsInfo.instrument,
                            "visit": dataset.obsInfo.visit_id,
                            "detector": dataset.obsInfo.detector_num,
                            "region": dataset.region,
                        })
                    )
            if visitVertices:
                visitRegion = ConvexPolygon(visitVertices)
            else:
                self.log.warn("No region found for visit=%s.", firstDataset.obsInfo.visit_id)
                visitRegion = None
            exposure.records["visit"] = [
401 """Expand the data IDs associated with a raw exposure to include
402 additional metadata records.
406 exposure : `RawExposureData`
407 A structure containing information about the exposure to be
408 ingested. Must have `RawExposureData.records` populated. Should
409 be considered consumed upon return.
413 exposure : `RawExposureData`
414 An updated version of the input structure, with
415 `RawExposureData.dataId` and nested `RawFileData.dataId` attributes
416 containing `~lsst.daf.butler.ExpandedDataCoordinate` instances.
418 hasVisit =
"visit" in data.records
        data.dataId = self.butler.registry.expandDataId(
            data.dataId,
            records={
                "exposure": data.records["exposure"][0],
                "visit": data.records["visit"][0] if hasVisit else None,
            }
        )
        vdrRecords = data.records["visit_detector_region"] if hasVisit else itertools.repeat(None)
        for file, vdrRecord in zip(data.files, vdrRecords):
            for dataset in file.datasets:
                dataset.dataId = self.butler.registry.expandDataId(
                    dataset.dataId,
                    records=dict(data.dataId.records, visit_detector_region=vdrRecord)
                )
        return data
    def prep(self, files, pool: Optional[Pool] = None, processes: int = 1
             ) -> Iterator[RawExposureData]:
        """Perform all ingest preprocessing steps that do not involve actually
        modifying the database.

        Parameters
        ----------
        files : iterable over `str` or path-like objects
            Paths to the files to be ingested.  Will be made absolute
            if they are not already.
        pool : `multiprocessing.Pool`, optional
            If not `None`, a process pool with which to parallelize some
            of this work.
        processes : `int`, optional
            The number of processes to use.  Ignored if ``pool`` is not `None`.

        Yields
        ------
        exposure : `RawExposureData`
            Data structures containing dimension records, filenames, and data
            IDs to be ingested (one structure for each exposure).
        """
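        # Run serially with the builtin ``map`` unless a pool is available, in
        # which case use its unordered ``imap`` so results stream back as soon
        # as they complete.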
        if pool is None and processes > 1:
            pool = Pool(processes)
        mapFunc = map if pool is None else pool.imap_unordered
503 """Insert dimension records for one or more exposures.
507 records : `dict` mapping `str` to `list`
508 Dimension records to be inserted, organized as a mapping from
509 dimension name to a list of records for that dimension. This
510 may be a single `RawExposureData.records` dict, or an aggregate
511 for multiple exposures created by concatenating the value lists
512 of those dictionaries.
516 refs : `list` of `lsst.daf.butler.DatasetRef`
517 Dataset references for ingested raws.
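        # Insert records dimension by dimension, skipping dimensions with no
        # records (e.g. "visit" when the instrument does not define visits).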
        for dimension in ("visit", "exposure", "visit_detector_region"):
            recordsForDimension = records.get(dimension)
            if recordsForDimension:
                self.butler.registry.insertDimensionData(dimension, *recordsForDimension)
536 ) -> List[DatasetRef]:
537 """Ingest all raw files in one exposure.
541 exposure : `RawExposureData`
542 A structure containing information about the exposure to be
543 ingested. Must have `RawExposureData.records` populated and all
544 data ID attributes expanded.
545 butler : `lsst.daf.butler.Butler`, optional
546 Butler to use for ingest. If not provided, ``self.butler`` will
551 refs : `list` of `lsst.daf.butler.DatasetRef`
552 Dataset references for ingested raws.
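        # Build one FileDataset per raw file, each carrying a DatasetRef for
        # every dataset it contains, and hand them to the Butler in a single
        # ingest call.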
        if butler is None:
            butler = self.butler
        datasets = [FileDataset(path=os.path.abspath(file.filename),
                                refs=[DatasetRef(self.datasetType, d.dataId) for d in file.datasets],
                                formatter=file.FormatterClass)
                    for file in exposure.files]
        butler.ingest(*datasets, transfer=self.config.transfer)
        return [ref for dataset in datasets for ref in dataset.refs]
    def run(self, files, pool: Optional[Pool] = None, processes: int = 1):
        """Ingest files into a Butler data repository.

        This creates any new exposure or visit Dimension entries needed to
        identify the ingested files, creates new Dataset entries in the
        Registry and finally ingests the files themselves into the Datastore.
        Any needed instrument, detector, and physical_filter Dimension entries
        must exist in the Registry before `run` is called.

        Parameters
        ----------
        files : iterable over `str` or path-like objects
            Paths to the files to be ingested.  Will be made absolute
            if they are not already.
        pool : `multiprocessing.Pool`, optional
            If not `None`, a process pool with which to parallelize some
            of this work.
        processes : `int`, optional
            The number of processes to use.  Ignored if ``pool`` is not `None`.

        Returns
        -------
        refs : `list` of `lsst.daf.butler.DatasetRef`
            Dataset references for ingested raws.

        Notes
        -----
        This method inserts all records (dimensions and datasets) for an
        exposure within a transaction, guaranteeing that partial exposures
        are never ingested.
        """
        exposureData = self.prep(files, pool=pool, processes=processes)
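        # ``prep`` yields one RawExposureData per exposure, with expanded data
        # IDs and the dimension records needed for ingest; each exposure is
        # then ingested inside its own transaction.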
        for exposure in exposureData:
            with self.butler.transaction():