__all__ = ("RawIngestTask", "RawIngestConfig", "makeTransferChoiceField")

import os.path
import itertools

from dataclasses import dataclass
from typing import List, Dict, Iterator, Iterable, Type, Optional, Any, Mapping
from collections import defaultdict
from multiprocessing import Pool

from astro_metadata_translator import ObservationInfo, fix_header, merge_headers

from lsst.afw.fits import readMetadata
from lsst.daf.butler import (
    Butler,
    DataCoordinate,
    DatasetRef,
    DatasetType,
    DimensionRecord,
    FileDataset,
)
from lsst.geom import Box2D
from lsst.pex.config import Config, Field, ChoiceField
from lsst.pipe.base import Task
from lsst.sphgeom import ConvexPolygon
from lsst.utils import doImport

from .instrument import makeExposureRecordFromObsInfo, makeVisitRecordFromObsInfo
from .fitsRawFormatterBase import FitsRawFormatterBase
54 """Structure that hold information about a single dataset within a 58 dataId: DataCoordinate
59 """Data ID for this file (`lsst.daf.butler.DataCoordinate`). 61 This may be a minimal `~lsst.daf.butler.DataCoordinate` base instance, or 62 a complete `~lsst.daf.butler.ExpandedDataCoordinate`. 65 obsInfo: ObservationInfo
66 """Standardized observation metadata extracted directly from the file 67 headers (`astro_metadata_translator.ObservationInfo`). 71 """Region on the sky covered by this file, possibly with padding 72 (`lsst.sphgeom.ConvexPolygon`). 78 """Structure that holds information about a single raw file, used during 82 datasets: List[RawFileDatasetInfo]
83 """The information describing each dataset within this raw file. 84 (`list` of `RawFileDatasetInfo`) 88 """Name of the file this information was extracted from (`str`). 90 This is the path prior to ingest, not the path after ingest. 93 FormatterClass: Type[FitsRawFormatterBase]
94 """Formatter class that should be used to ingest this file and compute 95 a spatial region for it (`type`; as subclass of `FitsRawFormatterBase`). 101 """Structure that holds information about a complete raw exposure, used 105 dataId: DataCoordinate
106 """Data ID for this exposure (`lsst.daf.butler.DataCoordinate`). 108 This may be a minimal `~lsst.daf.butler.DataCoordinate` base instance, or 109 a complete `~lsst.daf.butler.ExpandedDataCoordinate`. 112 files: List[RawFileData]
113 """List of structures containing file-level information. 116 records: Optional[Dict[str, List[DimensionRecord]]] =
None 117 """Dictionary containing `DimensionRecord` instances that must be inserted 118 into the `~lsst.daf.butler.Registry` prior to file-level ingest (`dict`). 120 Keys are the names of dimension elements ("exposure" and optionally "visit" 121 and "visit_detector_region"), while values are lists of `DimensionRecord`. 123 May be `None` during some ingest steps. 128 """Create a Config field with options for how to transfer files between 131 The allowed options for the field are exactly those supported by 132 `lsst.daf.butler.Datastore.ingest`. 137 Documentation for the configuration field. 141 field : `lsst.pex.config.ChoiceField` 147 allowed={
"move":
"move",
149 "auto":
"choice will depend on datastore",
150 "link":
"hard link falling back to symbolic link",
151 "hardlink":
"hard link",
152 "symlink":
"symbolic (soft) link",
153 "relsymlink":
"relative symbolic link",


class RawIngestConfig(Config):
    transfer = makeTransferChoiceField()
    padRegionAmount = Field(
        dtype=int,
        default=0,
        doc="Pad an image with specified number of pixels before calculating region"
    )
    instrument = Field(
        dtype=str,
        doc=("Fully-qualified Python name of the `Instrument` subclass to "
             "associate with all raws."),
    )
177 """Driver Task for ingesting raw data into Gen3 Butler repositories. 179 This Task is intended to be runnable from the command-line, but it doesn't 180 meet the other requirements of CmdLineTask or PipelineTask, and wouldn't 181 gain much from being one. It also wouldn't really be appropriate as a 182 subtask of a CmdLineTask or PipelineTask; it's a Task essentially just to 183 leverage the logging and configurability functionality that provides. 185 Each instance of `RawIngestTask` writes to the same Butler. Each 186 invocation of `RawIngestTask.run` ingests a list of files. 190 config : `RawIngestConfig` 191 Configuration for the task. 192 butler : `~lsst.daf.butler.Butler` 193 Butler instance. Ingested Datasets will be created as part of 194 ``butler.run`` and associated with its Collection. 196 Additional keyword arguments are forwarded to the `lsst.pipe.base.Task` 199 Other keyword arguments are forwarded to the Task base class constructor. 202 ConfigClass = RawIngestConfig
204 _DefaultName =
"ingest" 207 """Return the DatasetType of the Datasets ingested by this Task. 209 return DatasetType(
"raw", (
"instrument",
"detector",
"exposure"),
"Exposure",
210 universe=self.
butler.registry.dimensions)
212 def __init__(self, config: Optional[RawIngestConfig] =
None, *, butler: Butler, **kwds: Any):
226 """Extract and process metadata from a single raw file. 236 A structure containing the metadata extracted from the file, 237 as well as the original filename. All fields will be populated, 238 but the `RawFileData.dataId` attribute will be a minimal 239 (unexpanded) `DataCoordinate` instance. 243 Assumes that there is a single dataset associated with the given 244 file. Instruments using a single file to store multiple datasets 245 must implement their own version of this method. 249 phdu = readMetadata(filename, 0)
250 header = merge_headers([phdu, readMetadata(filename)], mode=
"overwrite")
257 FormatterClass = self.
instrument.getRawFormatter(datasets[0].dataId)
259 return RawFileData(datasets=datasets, filename=filename,
260 FormatterClass=FormatterClass)

    def _calculate_dataset_info(self, header, filename):
        """Calculate a RawFileDatasetInfo from the supplied information.

        Parameters
        ----------
        header
            Header from the dataset.
        filename : `str`
            Filename to use for error messages.

        Returns
        -------
        dataset : `RawFileDatasetInfo`
            The region, dataId, and observation information associated with
            this dataset.
        """
        obsInfo = ObservationInfo(header)
        dataId = DataCoordinate.standardize(instrument=obsInfo.instrument,
                                            exposure=obsInfo.exposure_id,
                                            detector=obsInfo.detector_num,
                                            universe=self.universe)
        if obsInfo.instrument != self.instrument.getName():
            raise ValueError(f"Incorrect instrument (expected {self.instrument.getName()}, "
                             f"got {obsInfo.instrument}) for file {filename}.")

        FormatterClass = self.instrument.getRawFormatter(dataId)
        region = self._calculate_region_from_dataset_metadata(obsInfo, header, FormatterClass)
        return RawFileDatasetInfo(obsInfo=obsInfo, region=region, dataId=dataId)

    def _calculate_region_from_dataset_metadata(self, obsInfo, header, FormatterClass):
        """Calculate the sky region covered by the supplied observation
        information.

        Parameters
        ----------
        obsInfo : `~astro_metadata_translator.ObservationInfo`
            Summary information of this dataset.
        header
            Header from the dataset.
        FormatterClass : `type` as subclass of `FitsRawFormatterBase`
            Formatter class that should be used to compute the spatial region.

        Returns
        -------
        region : `lsst.sphgeom.ConvexPolygon` or `None`
            Region of sky covered by this observation.
        """
        if obsInfo.visit_id is not None and obsInfo.tracking_radec is not None:
            formatter = FormatterClass.fromMetadata(metadata=header, obsInfo=obsInfo)
            visitInfo = formatter.makeVisitInfo()
            detector = self.camera[obsInfo.detector_num]
            wcs = formatter.makeWcs(visitInfo, detector)
            pixBox = Box2D(detector.getBBox())
            if self.config.padRegionAmount > 0:
                pixBox.grow(self.config.padRegionAmount)
            pixCorners = pixBox.getCorners()
            sphCorners = [wcs.pixelToSky(point).getVector() for point in pixCorners]
            region = ConvexPolygon(sphCorners)
        else:
            region = None
        return region
325 """Group an iterable of `RawFileData` by exposure. 329 files : iterable of `RawFileData` 330 File-level information to group. 334 exposures : `list` of `RawExposureData` 335 A list of structures that group the file-level information by 336 exposure. The `RawExposureData.records` attributes of elements 337 will be `None`, but all other fields will be populated. The 338 `RawExposureData.dataId` attributes will be minimal (unexpanded) 339 `DataCoordinate` instances. 341 exposureDimensions = self.
universe[
"exposure"].graph
342 byExposure = defaultdict(list)
345 byExposure[f.datasets[0].dataId.subset(exposureDimensions)].append(f)
348 for dataId, exposureFiles
in byExposure.items()]
351 """Collect the `DimensionRecord` instances that must be inserted into 352 the `~lsst.daf.butler.Registry` before an exposure's raw files may be. 356 exposure : `RawExposureData` 357 A structure containing information about the exposure to be 358 ingested. Should be considered consumed upon return. 362 exposure : `RawExposureData` 363 An updated version of the input structure, with 364 `RawExposureData.records` populated. 366 firstFile = exposure.files[0]
367 firstDataset = firstFile.datasets[0]
368 VisitDetectorRegionRecordClass = self.
universe[
"visit_detector_region"].RecordClass
372 if firstDataset.obsInfo.visit_id
is not None:
373 exposure.records[
"visit_detector_region"] = []
375 for file
in exposure.files:
376 for dataset
in file.datasets:
377 if dataset.obsInfo.visit_id != firstDataset.obsInfo.visit_id:
378 raise ValueError(f
"Inconsistent visit/exposure relationship for " 379 f
"exposure {firstDataset.obsInfo.exposure_id} between " 380 f
"{file.filename} and {firstFile.filename}: " 381 f
"{dataset.obsInfo.visit_id} != {firstDataset.obsInfo.visit_id}.")
382 if dataset.region
is None:
383 self.log.warn(
"No region found for visit=%s, detector=%s.", dataset.obsInfo.visit_id,
384 dataset.obsInfo.detector_num)
386 visitVertices.extend(dataset.region.getVertices())
387 exposure.records[
"visit_detector_region"].append(
388 VisitDetectorRegionRecordClass.fromDict({
389 "instrument": dataset.obsInfo.instrument,
390 "visit": dataset.obsInfo.visit_id,
391 "detector": dataset.obsInfo.detector_num,
392 "region": dataset.region,
396 visitRegion = ConvexPolygon(visitVertices)
398 self.log.warn(
"No region found for visit=%s.", firstDataset.obsInfo.visit_id)
400 exposure.records[
"visit"] = [
406 """Expand the data IDs associated with a raw exposure to include 407 additional metadata records. 411 exposure : `RawExposureData` 412 A structure containing information about the exposure to be 413 ingested. Must have `RawExposureData.records` populated. Should 414 be considered consumed upon return. 418 exposure : `RawExposureData` 419 An updated version of the input structure, with 420 `RawExposureData.dataId` and nested `RawFileData.dataId` attributes 421 containing `~lsst.daf.butler.ExpandedDataCoordinate` instances. 423 hasVisit =
"visit" in data.records
427 data.dataId = self.
butler.registry.expandDataId(
434 "exposure": data.records[
"exposure"][0],
435 "visit": data.records[
"visit"][0]
if hasVisit
else None,
442 vdrRecords = data.records[
"visit_detector_region"]
if hasVisit
else itertools.repeat(
None)
443 for file, vdrRecord
in zip(data.files, vdrRecords):
444 for dataset
in file.datasets:
445 dataset.dataId = self.
butler.registry.expandDataId(
447 records=dict(data.dataId.records, visit_detector_region=vdrRecord)

    def prep(self, files, pool: Optional[Pool] = None, processes: int = 1) -> Iterator[RawExposureData]:
        """Perform all ingest preprocessing steps that do not involve actually
        modifying the database.

        Parameters
        ----------
        files : iterable over `str` or path-like objects
            Paths to the files to be ingested.  Will be made absolute
            if they are not already.
        pool : `multiprocessing.Pool`, optional
            If not `None`, a process pool with which to parallelize some
            operations.
        processes : `int`, optional
            The number of processes to use.  Ignored if ``pool`` is not `None`.

        Yields
        ------
        exposure : `RawExposureData`
            Data structures containing dimension records, filenames, and data
            IDs to be ingested (one structure for each exposure).
        """
        if pool is None and processes > 1:
            pool = Pool(processes)
        mapFunc = map if pool is None else pool.imap_unordered
        # Extract metadata from each file (possibly in parallel), group the
        # results by exposure, then attach dimension records and expanded
        # data IDs to each group.
        fileData = mapFunc(self.extractMetadata, files)
        exposureData = self.groupByExposure(fileData)
        return (self.expandDataIds(self.collectDimensionRecords(exposure))
                for exposure in exposureData)
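
    # Illustrative call pattern for the preprocessing stage (the file names
    # and process count are hypothetical):
    #
    #     exposures = list(task.prep(["raw_0001.fits", "raw_0002.fits"], processes=4))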
508 """Insert dimension records for one or more exposures. 512 records : `dict` mapping `str` to `list` 513 Dimension records to be inserted, organized as a mapping from 514 dimension name to a list of records for that dimension. This 515 may be a single `RawExposureData.records` dict, or an aggregate 516 for multiple exposures created by concatenating the value lists 517 of those dictionaries. 521 refs : `list` of `lsst.daf.butler.DatasetRef` 522 Dataset references for ingested raws. 531 for dimension
in (
"visit",
"exposure",
"visit_detector_region"):
532 recordsForDimension = records.get(dimension)
533 if recordsForDimension:
538 self.
butler.registry.insertDimensionData(dimension, *recordsForDimension)
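
    # Sketch of aggregating records from several exposures for a single bulk
    # insert, as the docstring above allows (illustrative; ``exposures`` is a
    # hypothetical list of RawExposureData with ``records`` populated):
    #
    #     aggregate = defaultdict(list)
    #     for exp in exposures:
    #         for element, recs in exp.records.items():
    #             aggregate[element].extend(recs)
    #     task.insertDimensionData(aggregate)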
541 ) -> List[DatasetRef]:
542 """Ingest all raw files in one exposure. 546 exposure : `RawExposureData` 547 A structure containing information about the exposure to be 548 ingested. Must have `RawExposureData.records` populated and all 549 data ID attributes expanded. 550 butler : `lsst.daf.butler.Butler`, optional 551 Butler to use for ingest. If not provided, ``self.butler`` will 556 refs : `list` of `lsst.daf.butler.DatasetRef` 557 Dataset references for ingested raws. 561 datasets = [FileDataset(path=os.path.abspath(file.filename),
562 refs=[DatasetRef(self.
datasetType, d.dataId)
for d
in file.datasets],
563 formatter=file.FormatterClass)
564 for file
in exposure.files]
565 butler.ingest(*datasets, transfer=self.config.transfer)
566 return [ref
for dataset
in datasets
for ref
in dataset.refs]

    def run(self, files, pool: Optional[Pool] = None, processes: int = 1):
        """Ingest files into a Butler data repository.

        This creates any new exposure or visit Dimension entries needed to
        identify the ingested files, creates new Dataset entries in the
        Registry and finally ingests the files themselves into the Datastore.
        Any needed instrument, detector, and physical_filter Dimension entries
        must exist in the Registry before `run` is called.

        Parameters
        ----------
        files : iterable over `str` or path-like objects
            Paths to the files to be ingested.  Will be made absolute
            if they are not already.
        pool : `multiprocessing.Pool`, optional
            If not `None`, a process pool with which to parallelize some
            operations.
        processes : `int`, optional
            The number of processes to use.  Ignored if ``pool`` is not `None`.

        Returns
        -------
        refs : `list` of `lsst.daf.butler.DatasetRef`
            Dataset references for ingested raws.

        Notes
        -----
        This method inserts all records (dimensions and datasets) for an
        exposure within a transaction, guaranteeing that partial exposures
        are never ingested.
        """
        exposureData = self.prep(files, pool=pool, processes=processes)
        refs = []
        for exposure in exposureData:
            with self.butler.transaction():
                self.insertDimensionData(exposure.records)
                refs.extend(self.ingestExposureDatasets(exposure))
        return refs
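

# A minimal end-to-end usage sketch (illustrative; the repository path, run
# name, instrument class, and file names are hypothetical, and the repository
# must already contain the instrument, detector, and physical_filter dimension
# records):
#
#     from lsst.daf.butler import Butler
#
#     butler = Butler("/path/to/repo", run="raw/all")
#     config = RawIngestConfig()
#     config.instrument = "mypackage.MyInstrument"
#     config.transfer = "symlink"
#     task = RawIngestTask(config=config, butler=butler)
#     refs = task.run(["raw_0001.fits", "raw_0002.fits"])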