__all__ = ("RawIngestTask", "RawIngestConfig", "makeTransferChoiceField")
import os
import itertools
from dataclasses import dataclass
from typing import List, Dict, Iterator, Iterable, Type, Optional, Any, Mapping
from collections import defaultdict
from multiprocessing import Pool

from astro_metadata_translator import ObservationInfo, fix_header, merge_headers

from lsst.afw.fits import readMetadata
from lsst.daf.butler import (
    Butler,
    DataCoordinate,
    DatasetRef,
    DatasetType,
    DimensionRecord,
    FileDataset,
)
from lsst.geom import Box2D
from lsst.pex.config import Config, Field, ChoiceField
from lsst.pipe.base import Task
from lsst.sphgeom import ConvexPolygon

from .instrument import makeExposureRecordFromObsInfo, makeVisitRecordFromObsInfo
from .fitsRawFormatterBase import FitsRawFormatterBase
54 """Structure that hold information about a single dataset within a 58 dataId: DataCoordinate
59 """Data ID for this file (`lsst.daf.butler.DataCoordinate`). 61 This may be a minimal `~lsst.daf.butler.DataCoordinate` base instance, or 62 a complete `~lsst.daf.butler.ExpandedDataCoordinate`. 65 obsInfo: ObservationInfo
66 """Standardized observation metadata extracted directly from the file 67 headers (`astro_metadata_translator.ObservationInfo`). 71 """Region on the sky covered by this file, possibly with padding 72 (`lsst.sphgeom.ConvexPolygon`). 78 """Structure that holds information about a single raw file, used during 82 datasets: List[RawFileDatasetInfo]
83 """The information describing each dataset within this raw file. 84 (`list` of `RawFileDatasetInfo`) 88 """Name of the file this information was extracted from (`str`). 90 This is the path prior to ingest, not the path after ingest. 93 FormatterClass: Type[FitsRawFormatterBase]
94 """Formatter class that should be used to ingest this file and compute 95 a spatial region for it (`type`; as subclass of `FitsRawFormatterBase`). 101 """Structure that holds information about a complete raw exposure, used 105 dataId: DataCoordinate
106 """Data ID for this exposure (`lsst.daf.butler.DataCoordinate`). 108 This may be a minimal `~lsst.daf.butler.DataCoordinate` base instance, or 109 a complete `~lsst.daf.butler.ExpandedDataCoordinate`. 112 files: List[RawFileData]
113 """List of structures containing file-level information. 116 records: Optional[Dict[str, List[DimensionRecord]]] =
None 117 """Dictionary containing `DimensionRecord` instances that must be inserted 118 into the `~lsst.daf.butler.Registry` prior to file-level ingest (`dict`). 120 Keys are the names of dimension elements ("exposure" and optionally "visit" 121 and "visit_detector_region"), while values are lists of `DimensionRecord`. 123 May be `None` during some ingest steps. 128 """Create a Config field with options for how to transfer files between 131 The allowed options for the field are exactly those supported by 132 `lsst.daf.butler.Datastore.ingest`. 137 Documentation for the configuration field. 141 field : `lsst.pex.config.ChoiceField` 147 allowed={
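

# How these structures nest during ingest: a RawExposureData describes one
# exposure and holds one RawFileData per raw file belonging to it, and each
# RawFileData holds one RawFileDatasetInfo per dataset found in that file.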
"move":
"move",
149 "hardlink":
"hard link",
150 "symlink":
"symbolic (soft) link"},


class RawIngestConfig(Config):
    transfer = makeTransferChoiceField()
    padRegionAmount = Field(
        dtype=int,
        default=0,
        doc="Pad an image with specified number of pixels before calculating region",
    )
    instrument = Field(
        dtype=str,
        doc=("Fully-qualified Python name of the `Instrument` subclass to "
             "associate with all raws."),
    )
173 """Driver Task for ingesting raw data into Gen3 Butler repositories. 175 This Task is intended to be runnable from the command-line, but it doesn't 176 meet the other requirements of CmdLineTask or PipelineTask, and wouldn't 177 gain much from being one. It also wouldn't really be appropriate as a 178 subtask of a CmdLineTask or PipelineTask; it's a Task essentially just to 179 leverage the logging and configurability functionality that provides. 181 Each instance of `RawIngestTask` writes to the same Butler. Each 182 invocation of `RawIngestTask.run` ingests a list of files. 186 config : `RawIngestConfig` 187 Configuration for the task. 188 butler : `~lsst.daf.butler.Butler` 189 Butler instance. Ingested Datasets will be created as part of 190 ``butler.run`` and associated with its Collection. 192 Additional keyword arguments are forwarded to the `lsst.pipe.base.Task` 195 Other keyword arguments are forwarded to the Task base class constructor. 198 ConfigClass = RawIngestConfig
200 _DefaultName =
"ingest" 203 """Return the DatasetType of the Datasets ingested by this Task. 205 return DatasetType(
"raw", (
"instrument",
"detector",
"exposure"),
"Exposure",
206 universe=self.
butler.registry.dimensions)

    def __init__(self, config: Optional[RawIngestConfig] = None, *, butler: Butler, **kwds: Any):
222 """Extract and process metadata from a single raw file. 232 A structure containing the metadata extracted from the file, 233 as well as the original filename. All fields will be populated, 234 but the `RawFileData.dataId` attribute will be a minimal 235 (unexpanded) `DataCoordinate` instance. 239 Assumes that there is a single dataset associated with the given 240 file. Instruments using a single file to store multiple datasets 241 must implement their own version of this method. 245 phdu = readMetadata(filename, 0)
246 header = merge_headers([phdu, readMetadata(filename)], mode=
"overwrite")
253 FormatterClass = self.
instrument.getRawFormatter(datasets[0].dataId)
255 return RawFileData(datasets=datasets, filename=filename,
256 FormatterClass=FormatterClass)

    def _calculate_dataset_info(self, header, filename):
        """Calculate a RawFileDatasetInfo from the supplied information.

        Parameters
        ----------
        header : `Mapping`
            Header from the dataset.
        filename : `str`
            Filename to use for error messages.

        Returns
        -------
        dataset : `RawFileDatasetInfo`
            The region, dataId, and observation information associated with
            this dataset.
        """
        obsInfo = ObservationInfo(header)
        dataId = DataCoordinate.standardize(instrument=obsInfo.instrument,
                                            exposure=obsInfo.exposure_id,
                                            detector=obsInfo.detector_num,
                                            universe=self.universe)
        if obsInfo.instrument != self.instrument.getName():
            raise ValueError(f"Incorrect instrument (expected {self.instrument.getName()}, "
                             f"got {obsInfo.instrument}) for file {filename}.")

        FormatterClass = self.instrument.getRawFormatter(dataId)
        region = self._calculate_region_from_dataset_metadata(obsInfo, header, FormatterClass)
        return RawFileDatasetInfo(obsInfo=obsInfo, region=region, dataId=dataId)

    def _calculate_region_from_dataset_metadata(self, obsInfo, header, FormatterClass):
        """Calculate the sky region covered by the supplied observation
        information.

        Parameters
        ----------
        obsInfo : `~astro_metadata_translator.ObservationInfo`
            Summary information of this dataset.
        header : `Mapping`
            Header from the dataset.
        FormatterClass : `type` as subclass of `FitsRawFormatterBase`
            Formatter class that should be used to compute the spatial region.

        Returns
        -------
        region : `lsst.sphgeom.ConvexPolygon`
            Region of sky covered by this observation.
        """
        if obsInfo.visit_id is not None and obsInfo.tracking_radec is not None:
            formatter = FormatterClass.fromMetadata(metadata=header, obsInfo=obsInfo)
            visitInfo = formatter.makeVisitInfo()
            detector = self.camera[obsInfo.detector_num]
            wcs = formatter.makeWcs(visitInfo, detector)
            pixBox = Box2D(detector.getBBox())
            if self.config.padRegionAmount > 0:
                pixBox.grow(self.config.padRegionAmount)
            pixCorners = pixBox.getCorners()
            sphCorners = [wcs.pixelToSky(point).getVector() for point in pixCorners]
            region = ConvexPolygon(sphCorners)
        else:
            region = None
        return region
321 """Group an iterable of `RawFileData` by exposure. 325 files : iterable of `RawFileData` 326 File-level information to group. 330 exposures : `list` of `RawExposureData` 331 A list of structures that group the file-level information by 332 exposure. The `RawExposureData.records` attributes of elements 333 will be `None`, but all other fields will be populated. The 334 `RawExposureData.dataId` attributes will be minimal (unexpanded) 335 `DataCoordinate` instances. 337 exposureDimensions = self.
universe[
"exposure"].graph
338 byExposure = defaultdict(list)
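        # The grouping key below is each file's data ID restricted (via
        # ``subset``) to the exposure dimensions, so all files from the same
        # exposure share a key regardless of which detector they came from.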
        for f in files:
            byExposure[f.datasets[0].dataId.subset(exposureDimensions)].append(f)
        return [RawExposureData(dataId=dataId, files=exposureFiles)
                for dataId, exposureFiles in byExposure.items()]
347 """Collect the `DimensionRecord` instances that must be inserted into 348 the `~lsst.daf.butler.Registry` before an exposure's raw files may be. 352 exposure : `RawExposureData` 353 A structure containing information about the exposure to be 354 ingested. Should be considered consumed upon return. 358 exposure : `RawExposureData` 359 An updated version of the input structure, with 360 `RawExposureData.records` populated. 362 firstFile = exposure.files[0]
363 firstDataset = firstFile.datasets[0]
364 VisitDetectorRegionRecordClass = self.
universe[
"visit_detector_region"].RecordClass
368 if firstDataset.obsInfo.visit_id
is not None:
369 exposure.records[
"visit_detector_region"] = []
371 for file
in exposure.files:
372 for dataset
in file.datasets:
373 if dataset.obsInfo.visit_id != firstDataset.obsInfo.visit_id:
374 raise ValueError(f
"Inconsistent visit/exposure relationship for " 375 f
"exposure {firstDataset.obsInfo.exposure_id} between " 376 f
"{file.filename} and {firstFile.filename}: " 377 f
"{dataset.obsInfo.visit_id} != {firstDataset.obsInfo.visit_id}.")
378 if dataset.region
is None:
379 self.log.warn(
"No region found for visit=%s, detector=%s.", dataset.obsInfo.visit_id,
380 dataset.obsInfo.detector_num)
382 visitVertices.extend(dataset.region.getVertices())
383 exposure.records[
"visit_detector_region"].append(
384 VisitDetectorRegionRecordClass.fromDict({
385 "instrument": dataset.obsInfo.instrument,
386 "visit": dataset.obsInfo.visit_id,
387 "detector": dataset.obsInfo.detector_num,
388 "region": dataset.region,
392 visitRegion = ConvexPolygon(visitVertices)
394 self.log.warn(
"No region found for visit=%s.", firstDataset.obsInfo.visit_id)
396 exposure.records[
"visit"] = [
402 """Expand the data IDs associated with a raw exposure to include 403 additional metadata records. 407 exposure : `RawExposureData` 408 A structure containing information about the exposure to be 409 ingested. Must have `RawExposureData.records` populated. Should 410 be considered consumed upon return. 414 exposure : `RawExposureData` 415 An updated version of the input structure, with 416 `RawExposureData.dataId` and nested `RawFileData.dataId` attributes 417 containing `~lsst.daf.butler.ExpandedDataCoordinate` instances. 419 hasVisit =
"visit" in data.records
423 data.dataId = self.
butler.registry.expandDataId(
430 "exposure": data.records[
"exposure"][0],
431 "visit": data.records[
"visit"][0]
if hasVisit
else None,
438 vdrRecords = data.records[
"visit_detector_region"]
if hasVisit
else itertools.repeat(
None)
439 for file, vdrRecord
in zip(data.files, vdrRecords):
440 for dataset
in file.datasets:
441 dataset.dataId = self.
butler.registry.expandDataId(
443 records=dict(data.dataId.records, visit_detector_region=vdrRecord)

    def prep(self, files, pool: Optional[Pool] = None, processes: int = 1) -> Iterator[RawExposureData]:
        """Perform all ingest preprocessing steps that do not involve actually
        modifying the database.

        Parameters
        ----------
        files : iterable over `str` or path-like objects
            Paths to the files to be ingested.  Will be made absolute
            if they are not already.
        pool : `multiprocessing.Pool`, optional
            If not `None`, a process pool with which to parallelize some
            steps.
        processes : `int`, optional
            The number of processes to use.  Ignored if ``pool`` is not `None`.

        Yields
        ------
        exposure : `RawExposureData`
            Data structures containing dimension records, filenames, and data
            IDs to be ingested (one structure for each exposure).
        """
        if pool is None and processes > 1:
            pool = Pool(processes)
        mapFunc = map if pool is None else pool.imap_unordered
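        # Sketch (assumption, not verbatim from this module): ``mapFunc`` is
        # intended to drive the per-file metadata extraction step, e.g.
        #
        #     fileData = mapFunc(self.extractMetadata, files)
        #
        # with the results then grouped by exposure and augmented with
        # dimension records and expanded data IDs before being yielded.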
504 """Insert dimension records for one or more exposures. 508 records : `dict` mapping `str` to `list` 509 Dimension records to be inserted, organized as a mapping from 510 dimension name to a list of records for that dimension. This 511 may be a single `RawExposureData.records` dict, or an aggregate 512 for multiple exposures created by concatenating the value lists 513 of those dictionaries. 517 refs : `list` of `lsst.daf.butler.DatasetRef` 518 Dataset references for ingested raws. 527 for dimension
in (
"visit",
"exposure",
"visit_detector_region"):
528 recordsForDimension = records.get(dimension)
529 if recordsForDimension:
534 self.
butler.registry.insertDimensionData(dimension, *recordsForDimension)
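        # Sketch (illustrative record names): an aggregated ``records``
        # argument covering several exposures could look like
        #
        #     {"exposure": [exp1, exp2],
        #      "visit": [visit1, visit2],
        #      "visit_detector_region": [vdr1a, vdr1b, vdr2a, vdr2b]}
        #
        # i.e. the per-exposure `RawExposureData.records` dicts with their
        # value lists concatenated, as described in the docstring above.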

    def ingestExposureDatasets(self, exposure: RawExposureData, *, butler: Optional[Butler] = None
                               ) -> List[DatasetRef]:
        """Ingest all raw files in one exposure.

        Parameters
        ----------
        exposure : `RawExposureData`
            A structure containing information about the exposure to be
            ingested.  Must have `RawExposureData.records` populated and all
            data ID attributes expanded.
        butler : `lsst.daf.butler.Butler`, optional
            Butler to use for ingest.  If not provided, ``self.butler`` will
            be used.

        Returns
        -------
        refs : `list` of `lsst.daf.butler.DatasetRef`
            Dataset references for ingested raws.
        """
        if butler is None:
            butler = self.butler
        datasets = [FileDataset(path=os.path.abspath(file.filename),
                                refs=[DatasetRef(self.datasetType, d.dataId) for d in file.datasets],
                                formatter=file.FormatterClass)
                    for file in exposure.files]
        butler.ingest(*datasets, transfer=self.config.transfer)
        return [ref for dataset in datasets for ref in dataset.refs]

    def run(self, files, pool: Optional[Pool] = None, processes: int = 1):
        """Ingest files into a Butler data repository.

        This creates any new exposure or visit Dimension entries needed to
        identify the ingested files, creates new Dataset entries in the
        Registry and finally ingests the files themselves into the Datastore.
        Any needed instrument, detector, and physical_filter Dimension entries
        must exist in the Registry before `run` is called.

        Parameters
        ----------
        files : iterable over `str` or path-like objects
            Paths to the files to be ingested.  Will be made absolute
            if they are not already.
        pool : `multiprocessing.Pool`, optional
            If not `None`, a process pool with which to parallelize some
            steps.
        processes : `int`, optional
            The number of processes to use.  Ignored if ``pool`` is not `None`.

        Returns
        -------
        refs : `list` of `lsst.daf.butler.DatasetRef`
            Dataset references for ingested raws.

        Notes
        -----
        This method inserts all records (dimensions and datasets) for an
        exposure within a transaction, guaranteeing that partial exposures
        are never ingested.
        """
        exposureData = self.prep(files, pool=pool, processes=processes)
        for exposure in exposureData:
            with self.butler.transaction():
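                # Sketch (assumption): inside this per-exposure transaction
                # the dimension records collected for the exposure are
                # inserted and its files are then ingested, so that a failure
                # rolls back both, as described in the Notes section above.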
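

# Example usage sketch (repository path, run name, and file list are
# hypothetical):
#
#     from lsst.daf.butler import Butler
#     from lsst.obs.base import RawIngestTask, RawIngestConfig
#
#     butler = Butler("/path/to/repo", run="raw")
#     config = RawIngestConfig()
#     config.transfer = "symlink"
#     task = RawIngestTask(config=config, butler=butler)
#     refs = task.run(["raw_0001.fits", "raw_0002.fits"], processes=4)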