__all__ = ("RawIngestTask", "RawIngestConfig", "makeTransferChoiceField")
import os.path
import itertools

from dataclasses import dataclass
from typing import List, Dict, Iterator, Iterable, Type, Optional, Any, Mapping
from collections import defaultdict
from multiprocessing import Pool

from astro_metadata_translator import ObservationInfo, fix_header, merge_headers

from lsst.afw.fits import readMetadata
from lsst.daf.butler import (
    Butler,
    DataCoordinate,
    DatasetRef,
    DatasetType,
    DimensionRecord,
    FileDataset,
)
from lsst.geom import Box2D
from lsst.pex.config import Config, Field, ChoiceField
from lsst.pipe.base import Task
from lsst.sphgeom import ConvexPolygon
from lsst.utils import doImport

from .instrument import makeExposureRecordFromObsInfo, makeVisitRecordFromObsInfo
from .fitsRawFormatterBase import FitsRawFormatterBase
54 """Structure that hold information about a single dataset within a 58 dataId: DataCoordinate
59 """Data ID for this file (`lsst.daf.butler.DataCoordinate`). 61 This may be a minimal `~lsst.daf.butler.DataCoordinate` base instance, or 62 a complete `~lsst.daf.butler.ExpandedDataCoordinate`. 65 obsInfo: ObservationInfo
66 """Standardized observation metadata extracted directly from the file 67 headers (`astro_metadata_translator.ObservationInfo`). 71 """Region on the sky covered by this file, possibly with padding 72 (`lsst.sphgeom.ConvexPolygon`). 78 """Structure that holds information about a single raw file, used during 82 datasets: List[RawFileDatasetInfo]
83 """The information describing each dataset within this raw file. 84 (`list` of `RawFileDatasetInfo`) 88 """Name of the file this information was extracted from (`str`). 90 This is the path prior to ingest, not the path after ingest. 93 FormatterClass: Type[FitsRawFormatterBase]
94 """Formatter class that should be used to ingest this file and compute 95 a spatial region for it (`type`; as subclass of `FitsRawFormatterBase`). 101 """Structure that holds information about a complete raw exposure, used 105 dataId: DataCoordinate
106 """Data ID for this exposure (`lsst.daf.butler.DataCoordinate`). 108 This may be a minimal `~lsst.daf.butler.DataCoordinate` base instance, or 109 a complete `~lsst.daf.butler.ExpandedDataCoordinate`. 112 files: List[RawFileData]
113 """List of structures containing file-level information. 116 records: Optional[Dict[str, List[DimensionRecord]]] =
None 117 """Dictionary containing `DimensionRecord` instances that must be inserted 118 into the `~lsst.daf.butler.Registry` prior to file-level ingest (`dict`). 120 Keys are the names of dimension elements ("exposure" and optionally "visit" 121 and "visit_detector_region"), while values are lists of `DimensionRecord`. 123 May be `None` during some ingest steps. 128 """Create a Config field with options for how to transfer files between 131 The allowed options for the field are exactly those supported by 132 `lsst.daf.butler.Datastore.ingest`. 137 Documentation for the configuration field. 141 field : `lsst.pex.config.ChoiceField` 147 allowed={
"move":
"move",
149 "auto":
"choice will depend on datastore",
150 "link":
"hard link falling back to symbolic link",
151 "hardlink":
"hard link",
152 "symlink":
"symbolic (soft) link"},


class RawIngestConfig(Config):
    transfer = makeTransferChoiceField()
    padRegionAmount = Field(
        dtype=int,
        default=0,
        doc="Pad an image with specified number of pixels before calculating region",
    )
    instrument = Field(
        dtype=str,
        doc=("Fully-qualified Python name of the `Instrument` subclass to "
             "associate with all raws."),
    )
175 """Driver Task for ingesting raw data into Gen3 Butler repositories. 177 This Task is intended to be runnable from the command-line, but it doesn't 178 meet the other requirements of CmdLineTask or PipelineTask, and wouldn't 179 gain much from being one. It also wouldn't really be appropriate as a 180 subtask of a CmdLineTask or PipelineTask; it's a Task essentially just to 181 leverage the logging and configurability functionality that provides. 183 Each instance of `RawIngestTask` writes to the same Butler. Each 184 invocation of `RawIngestTask.run` ingests a list of files. 188 config : `RawIngestConfig` 189 Configuration for the task. 190 butler : `~lsst.daf.butler.Butler` 191 Butler instance. Ingested Datasets will be created as part of 192 ``butler.run`` and associated with its Collection. 194 Additional keyword arguments are forwarded to the `lsst.pipe.base.Task` 197 Other keyword arguments are forwarded to the Task base class constructor. 200 ConfigClass = RawIngestConfig
202 _DefaultName =
"ingest" 205 """Return the DatasetType of the Datasets ingested by this Task. 207 return DatasetType(
"raw", (
"instrument",
"detector",
"exposure"),
"Exposure",
208 universe=self.
butler.registry.dimensions)
210 def __init__(self, config: Optional[RawIngestConfig] =
None, *, butler: Butler, **kwds: Any):
224 """Extract and process metadata from a single raw file. 234 A structure containing the metadata extracted from the file, 235 as well as the original filename. All fields will be populated, 236 but the `RawFileData.dataId` attribute will be a minimal 237 (unexpanded) `DataCoordinate` instance. 241 Assumes that there is a single dataset associated with the given 242 file. Instruments using a single file to store multiple datasets 243 must implement their own version of this method. 247 phdu = readMetadata(filename, 0)
248 header = merge_headers([phdu, readMetadata(filename)], mode=
"overwrite")
255 FormatterClass = self.
instrument.getRawFormatter(datasets[0].dataId)
257 return RawFileData(datasets=datasets, filename=filename,
258 FormatterClass=FormatterClass)
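
    # NOTE: extractMetadata reads the file headers only; the resulting
    # RawFileData.datasets[0].dataId is a minimal (instrument, exposure,
    # detector) data ID that is expanded later by expandDataIds.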

    def _calculate_dataset_info(self, header, filename):
        """Calculate a RawFileDatasetInfo from the supplied information.

        Parameters
        ----------
        header : `~lsst.daf.base.PropertyList`
            Header from the dataset.
        filename : `str`
            Filename to use for error messages.

        Returns
        -------
        dataset : `RawFileDatasetInfo`
            The region, dataId, and observation information associated with
            this dataset.
        """
        obsInfo = ObservationInfo(header)
        dataId = DataCoordinate.standardize(instrument=obsInfo.instrument,
                                            exposure=obsInfo.exposure_id,
                                            detector=obsInfo.detector_num,
                                            universe=self.universe)
        if obsInfo.instrument != self.instrument.getName():
            raise ValueError(f"Incorrect instrument (expected {self.instrument.getName()}, "
                             f"got {obsInfo.instrument}) for file {filename}.")

        FormatterClass = self.instrument.getRawFormatter(dataId)
        region = self._calculate_region_from_dataset_metadata(obsInfo, header, FormatterClass)
        return RawFileDatasetInfo(obsInfo=obsInfo, region=region, dataId=dataId)

    def _calculate_region_from_dataset_metadata(self, obsInfo, header, FormatterClass):
        """Calculate the sky region covered by the supplied observation
        information.

        Parameters
        ----------
        obsInfo : `~astro_metadata_translator.ObservationInfo`
            Summary information of this dataset.
        header : `~lsst.daf.base.PropertyList`
            Header from the dataset.
        FormatterClass : `type` as subclass of `FitsRawFormatterBase`
            Formatter class that should be used to compute the spatial region.

        Returns
        -------
        region : `lsst.sphgeom.ConvexPolygon`
            Region of sky covered by this observation, or `None` if the
            observation has no visit or no tracking coordinates.
        """
        if obsInfo.visit_id is not None and obsInfo.tracking_radec is not None:
            formatter = FormatterClass.fromMetadata(metadata=header, obsInfo=obsInfo)
            visitInfo = formatter.makeVisitInfo()
            detector = self.camera[obsInfo.detector_num]
            wcs = formatter.makeWcs(visitInfo, detector)
            # Pad the detector bounding box (in pixels) before converting its
            # corners to sky coordinates.
            pixBox = Box2D(detector.getBBox())
            if self.config.padRegionAmount > 0:
                pixBox.grow(self.config.padRegionAmount)
            pixCorners = pixBox.getCorners()
            sphCorners = [wcs.pixelToSky(point).getVector() for point in pixCorners]
            region = ConvexPolygon(sphCorners)
        else:
            region = None
        return region
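
    # NOTE: ``pixBox.grow`` above pads the detector bounding box on every side
    # by ``padRegionAmount`` pixels before its corners are mapped to sky
    # coordinates, so the resulting polygon slightly over-covers the detector
    # footprint.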
323 """Group an iterable of `RawFileData` by exposure. 327 files : iterable of `RawFileData` 328 File-level information to group. 332 exposures : `list` of `RawExposureData` 333 A list of structures that group the file-level information by 334 exposure. The `RawExposureData.records` attributes of elements 335 will be `None`, but all other fields will be populated. The 336 `RawExposureData.dataId` attributes will be minimal (unexpanded) 337 `DataCoordinate` instances. 339 exposureDimensions = self.
universe[
"exposure"].graph
340 byExposure = defaultdict(list)
343 byExposure[f.datasets[0].dataId.subset(exposureDimensions)].append(f)
346 for dataId, exposureFiles
in byExposure.items()]
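
    # NOTE: grouping is keyed purely on the exposure-dimension subset of each
    # file's data ID, so a multi-detector exposure yields a single
    # RawExposureData holding all of its per-detector files.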
349 """Collect the `DimensionRecord` instances that must be inserted into 350 the `~lsst.daf.butler.Registry` before an exposure's raw files may be. 354 exposure : `RawExposureData` 355 A structure containing information about the exposure to be 356 ingested. Should be considered consumed upon return. 360 exposure : `RawExposureData` 361 An updated version of the input structure, with 362 `RawExposureData.records` populated. 364 firstFile = exposure.files[0]
365 firstDataset = firstFile.datasets[0]
366 VisitDetectorRegionRecordClass = self.
universe[
"visit_detector_region"].RecordClass
370 if firstDataset.obsInfo.visit_id
is not None:
371 exposure.records[
"visit_detector_region"] = []
373 for file
in exposure.files:
374 for dataset
in file.datasets:
375 if dataset.obsInfo.visit_id != firstDataset.obsInfo.visit_id:
376 raise ValueError(f
"Inconsistent visit/exposure relationship for " 377 f
"exposure {firstDataset.obsInfo.exposure_id} between " 378 f
"{file.filename} and {firstFile.filename}: " 379 f
"{dataset.obsInfo.visit_id} != {firstDataset.obsInfo.visit_id}.")
380 if dataset.region
is None:
381 self.log.warn(
"No region found for visit=%s, detector=%s.", dataset.obsInfo.visit_id,
382 dataset.obsInfo.detector_num)
384 visitVertices.extend(dataset.region.getVertices())
385 exposure.records[
"visit_detector_region"].append(
386 VisitDetectorRegionRecordClass.fromDict({
387 "instrument": dataset.obsInfo.instrument,
388 "visit": dataset.obsInfo.visit_id,
389 "detector": dataset.obsInfo.detector_num,
390 "region": dataset.region,
394 visitRegion = ConvexPolygon(visitVertices)
396 self.log.warn(
"No region found for visit=%s.", firstDataset.obsInfo.visit_id)
398 exposure.records[
"visit"] = [
404 """Expand the data IDs associated with a raw exposure to include 405 additional metadata records. 409 exposure : `RawExposureData` 410 A structure containing information about the exposure to be 411 ingested. Must have `RawExposureData.records` populated. Should 412 be considered consumed upon return. 416 exposure : `RawExposureData` 417 An updated version of the input structure, with 418 `RawExposureData.dataId` and nested `RawFileData.dataId` attributes 419 containing `~lsst.daf.butler.ExpandedDataCoordinate` instances. 421 hasVisit =
"visit" in data.records
425 data.dataId = self.
butler.registry.expandDataId(
432 "exposure": data.records[
"exposure"][0],
433 "visit": data.records[
"visit"][0]
if hasVisit
else None,
440 vdrRecords = data.records[
"visit_detector_region"]
if hasVisit
else itertools.repeat(
None)
441 for file, vdrRecord
in zip(data.files, vdrRecords):
442 for dataset
in file.datasets:
443 dataset.dataId = self.
butler.registry.expandDataId(
445 records=dict(data.dataId.records, visit_detector_region=vdrRecord)
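
    # NOTE: after expandDataIds, both the exposure-level and per-dataset data
    # IDs are ExpandedDataCoordinate instances carrying their associated
    # dimension records (including visit_detector_region, when a visit is
    # present).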

    def prep(self, files, pool: Optional[Pool] = None, processes: int = 1) -> Iterator[RawExposureData]:
        """Perform all ingest preprocessing steps that do not involve actually
        modifying the database.

        Parameters
        ----------
        files : iterable over `str` or path-like objects
            Paths to the files to be ingested.  Will be made absolute
            if they are not already.
        pool : `multiprocessing.Pool`, optional
            If not `None`, a process pool with which to parallelize some
            operations.
        processes : `int`, optional
            The number of processes to use.  Ignored if ``pool`` is not `None`.

        Yields
        ------
        exposure : `RawExposureData`
            Data structures containing dimension records, filenames, and data
            IDs to be ingested (one structure for each exposure).
        """
        if pool is None and processes > 1:
            pool = Pool(processes)
        mapFunc = map if pool is None else pool.imap_unordered
        # Extract metadata (and per-detector regions) from each file, in
        # parallel if a pool is available, then group the results by exposure.
        fileData = mapFunc(self.extractMetadata, files)
        exposureData = self.groupByExposure(fileData)
        # Attach the dimension records and expand the data IDs for each
        # exposure before handing it off for database work.
        for exposure in exposureData:
            yield self.expandDataIds(self.collectDimensionRecords(exposure))
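
    # NOTE: prep() parallelizes only the preprocessing (header reading and
    # record construction); all Registry and Datastore writes happen later, in
    # insertDimensionData and ingestExposureDatasets.  With a pool,
    # ``imap_unordered`` means exposures may be produced in a different order
    # than the input files.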
506 """Insert dimension records for one or more exposures. 510 records : `dict` mapping `str` to `list` 511 Dimension records to be inserted, organized as a mapping from 512 dimension name to a list of records for that dimension. This 513 may be a single `RawExposureData.records` dict, or an aggregate 514 for multiple exposures created by concatenating the value lists 515 of those dictionaries. 519 refs : `list` of `lsst.daf.butler.DatasetRef` 520 Dataset references for ingested raws. 529 for dimension
in (
"visit",
"exposure",
"visit_detector_region"):
530 recordsForDimension = records.get(dimension)
531 if recordsForDimension:
536 self.
butler.registry.insertDimensionData(dimension, *recordsForDimension)

    def ingestExposureDatasets(self, exposure: RawExposureData, butler: Optional[Butler] = None
                               ) -> List[DatasetRef]:
        """Ingest all raw files in one exposure.

        Parameters
        ----------
        exposure : `RawExposureData`
            A structure containing information about the exposure to be
            ingested.  Must have `RawExposureData.records` populated and all
            data ID attributes expanded.
        butler : `lsst.daf.butler.Butler`, optional
            Butler to use for ingest.  If not provided, ``self.butler`` will
            be used.

        Returns
        -------
        refs : `list` of `lsst.daf.butler.DatasetRef`
            Dataset references for ingested raws.
        """
        if butler is None:
            butler = self.butler
        datasets = [FileDataset(path=os.path.abspath(file.filename),
                                refs=[DatasetRef(self.datasetType, d.dataId) for d in file.datasets],
                                formatter=file.FormatterClass)
                    for file in exposure.files]
        butler.ingest(*datasets, transfer=self.config.transfer)
        return [ref for dataset in datasets for ref in dataset.refs]
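
    # NOTE: ``butler.ingest`` performs the actual file transfer according to
    # ``self.config.transfer`` (one of the modes defined by
    # makeTransferChoiceField above).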

    def run(self, files, pool: Optional[Pool] = None, processes: int = 1):
        """Ingest files into a Butler data repository.

        This creates any new exposure or visit Dimension entries needed to
        identify the ingested files, creates new Dataset entries in the
        Registry and finally ingests the files themselves into the Datastore.
        Any needed instrument, detector, and physical_filter Dimension entries
        must exist in the Registry before `run` is called.

        Parameters
        ----------
        files : iterable over `str` or path-like objects
            Paths to the files to be ingested.  Will be made absolute
            if they are not already.
        pool : `multiprocessing.Pool`, optional
            If not `None`, a process pool with which to parallelize some
            operations.
        processes : `int`, optional
            The number of processes to use.  Ignored if ``pool`` is not `None`.

        Returns
        -------
        refs : `list` of `lsst.daf.butler.DatasetRef`
            Dataset references for ingested raws.

        Notes
        -----
        This method inserts all records (dimensions and datasets) for an
        exposure within a transaction, guaranteeing that partial exposures
        are never ingested.
        """
        exposureData = self.prep(files, pool=pool, processes=processes)
        refs = []
        for exposure in exposureData:
            # All database inserts for one exposure happen inside a single
            # transaction, so a failure cannot leave a partially-ingested
            # exposure behind.
            with self.butler.transaction():
                self.insertDimensionData(exposure.records)
                refs.extend(self.ingestExposureDatasets(exposure))
        return refs
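
# Illustrative usage (the repository path, run name, instrument class, and
# file names below are placeholders, not values defined in this module):
#
#     from lsst.daf.butler import Butler
#
#     butler = Butler("/path/to/repo", run="raw")
#     config = RawIngestConfig()
#     config.instrument = "lsst.obs.example.ExampleInstrument"  # hypothetical
#     task = RawIngestTask(config=config, butler=butler)
#     refs = task.run(["exp01.fits", "exp02.fits"], processes=4)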