__all__ = ("RawIngestTask", "RawIngestConfig", "makeTransferChoiceField")

import os

from dataclasses import dataclass, InitVar
from typing import List, Iterator, Iterable, Type, Optional, Any
from collections import defaultdict
from multiprocessing import Pool

from astro_metadata_translator import ObservationInfo, fix_header, merge_headers
from lsst.afw.fits import readMetadata
from lsst.daf.butler import (
    Butler,
    CollectionType,
    DataCoordinate,
    DatasetRef,
    DatasetType,
    DimensionRecord,
    DimensionUniverse,
    FileDataset,
)
from lsst.pex.config import Config, ChoiceField
from lsst.pipe.base import Task

from ._instrument import Instrument, makeExposureRecordFromObsInfo
from ._fitsRawFormatterBase import FitsRawFormatterBase


52 """Structure that holds information about a single dataset within a
56 dataId: DataCoordinate
57 """Data ID for this file (`lsst.daf.butler.DataCoordinate`).
59 This may be a minimal `~lsst.daf.butler.DataCoordinate` base instance, or
60 a complete `~lsst.daf.butler.ExpandedDataCoordinate`.
63 obsInfo: ObservationInfo
64 """Standardized observation metadata extracted directly from the file
65 headers (`astro_metadata_translator.ObservationInfo`).
71 """Structure that holds information about a single raw file, used during
75 datasets: List[RawFileDatasetInfo]
76 """The information describing each dataset within this raw file.
77 (`list` of `RawFileDatasetInfo`)
81 """Name of the file this information was extracted from (`str`).
83 This is the path prior to ingest, not the path after ingest.
86 FormatterClass: Type[FitsRawFormatterBase]
87 """Formatter class that should be used to ingest this file (`type`; as
88 subclass of `FitsRawFormatterBase`).
91 instrumentClass: Type[Instrument]
92 """The `Instrument` class associated with this file."""
97 """Structure that holds information about a complete raw exposure, used
101 dataId: DataCoordinate
102 """Data ID for this exposure (`lsst.daf.butler.DataCoordinate`).
104 This may be a minimal `~lsst.daf.butler.DataCoordinate` base instance, or
105 a complete `~lsst.daf.butler.ExpandedDataCoordinate`.
108 files: List[RawFileData]
109 """List of structures containing file-level information.
112 universe: InitVar[DimensionUniverse]
113 """Set of all known dimensions.
116 record: Optional[DimensionRecord] =
None
117 """The exposure `DimensionRecord` that must be inserted into the
118 `~lsst.daf.butler.Registry` prior to file-level ingest (`DimensionRecord`).
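

# Illustrative sketch of how the three structures above nest for one raw file
# (names and values here are hypothetical; nothing in this module builds the
# literal object below):
#
#     RawExposureData(
#         dataId=minimalExposureDataId,        # instrument + exposure
#         files=[
#             RawFileData(
#                 datasets=[RawFileDatasetInfo(dataId=minimalDetectorDataId,
#                                              obsInfo=obsInfo)],
#                 filename="/data/raw_0001.fits",
#                 FormatterClass=FitsRawFormatterBase,
#                 instrumentClass=Instrument,
#             ),
#         ],
#         universe=butler.registry.dimensions,
#         record=exposureRecord,               # inserted into the Registry
#     )

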
128 """Create a Config field with options for how to transfer files between
131 The allowed options for the field are exactly those supported by
132 `lsst.daf.butler.Datastore.ingest`.
137 Documentation for the configuration field.
141 field : `lsst.pex.config.ChoiceField`
147 allowed={
"move":
"move",
149 "auto":
"choice will depend on datastore",
150 "link":
"hard link falling back to symbolic link",
151 "hardlink":
"hard link",
152 "symlink":
"symbolic (soft) link",
153 "relsymlink":
"relative symbolic link",
165 """Driver Task for ingesting raw data into Gen3 Butler repositories.
169 config : `RawIngestConfig`
170 Configuration for the task.
171 butler : `~lsst.daf.butler.Butler`
172 Writeable butler instance, with ``butler.run`` set to the appropriate
173 `~lsst.daf.butler.CollectionType.RUN` collection for these raw
176 Additional keyword arguments are forwarded to the `lsst.pipe.base.Task`
181 Each instance of `RawIngestTask` writes to the same Butler. Each
182 invocation of `RawIngestTask.run` ingests a list of files.
185 ConfigClass = RawIngestConfig
187 _DefaultName =
"ingest"
190 """Return the DatasetType of the datasets ingested by this Task.
192 return DatasetType(
"raw", (
"instrument",
"detector",
"exposure"),
"Exposure",
193 universe=self.
butler.registry.dimensions)
    def __init__(self, config: Optional[RawIngestConfig] = None, *, butler: Butler, **kwargs: Any):
        super().__init__(config, **kwargs)
        self.butler = butler
        self.universe = self.butler.registry.dimensions
        self.datasetType = self.getDatasetType()
        # Import all known instrument classes up front so they can be looked
        # up by name during metadata extraction.
        Instrument.importAll(self.butler.registry)

207 """Extract and process metadata from a single raw file.
217 A structure containing the metadata extracted from the file,
218 as well as the original filename. All fields will be populated,
219 but the `RawFileData.dataId` attribute will be a minimal
220 (unexpanded) `DataCoordinate` instance.
224 Assumes that there is a single dataset associated with the given
225 file. Instruments using a single file to store multiple datasets
226 must implement their own version of this method.
230 phdu = readMetadata(filename, 0)
231 header = merge_headers([phdu, readMetadata(filename)], mode=
"overwrite")
238 instrument = Instrument.fromName(datasets[0].dataId[
"instrument"], self.
butler.registry)
239 FormatterClass = instrument.getRawFormatter(datasets[0].dataId)
241 return RawFileData(datasets=datasets, filename=filename,
242 FormatterClass=FormatterClass,
243 instrumentClass=instrument)
    def _calculate_dataset_info(self, header, filename):
        """Calculate a RawFileDatasetInfo from the supplied information.

        Parameters
        ----------
        header : `~lsst.daf.base.PropertyList`
            Header from the dataset.
        filename : `str`
            Filename to use for error messages.

        Returns
        -------
        dataset : `RawFileDatasetInfo`
            The dataId and observation information associated with this
            dataset.
        """
        obsInfo = ObservationInfo(header)
        dataId = DataCoordinate.standardize(instrument=obsInfo.instrument,
                                            exposure=obsInfo.exposure_id,
                                            detector=obsInfo.detector_num,
                                            universe=self.universe)
        return RawFileDatasetInfo(obsInfo=obsInfo, dataId=dataId)
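
    # Illustrative result (values hypothetical): the standardized data ID
    # built above is minimal, holding only the key/value pairs that identify
    # the dataset, e.g.
    #
    #     RawFileDatasetInfo(dataId={"instrument": "HSC",
    #                                "exposure": 903334,
    #                                "detector": 16},
    #                        obsInfo=obsInfo)
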
269 """Group an iterable of `RawFileData` by exposure.
273 files : iterable of `RawFileData`
274 File-level information to group.
278 exposures : `list` of `RawExposureData`
279 A list of structures that group the file-level information by
280 exposure. All fields will be populated. The
281 `RawExposureData.dataId` attributes will be minimal (unexpanded)
282 `DataCoordinate` instances.
284 exposureDimensions = self.
universe[
"exposure"].graph
285 byExposure = defaultdict(list)
288 byExposure[f.datasets[0].dataId.subset(exposureDimensions)].append(f)
291 for dataId, exposureFiles
in byExposure.items()]
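
    # Grouping sketch (filenames and exposure IDs hypothetical): files that
    # share an exposure ID land in the same RawExposureData, e.g.
    #
    #     [RawFileData(filename="raw_903334_det00.fits", ...),  # exposure 903334
    #      RawFileData(filename="raw_903334_det01.fits", ...),  # exposure 903334
    #      RawFileData(filename="raw_903336_det00.fits", ...)]  # exposure 903336
    #
    # groups into two RawExposureData structures: one holding the first two
    # files and one holding the third.
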
294 """Expand the data IDs associated with a raw exposure to include
295 additional metadata records.
299 exposure : `RawExposureData`
300 A structure containing information about the exposure to be
301 ingested. Must have `RawExposureData.records` populated. Should
302 be considered consumed upon return.
306 exposure : `RawExposureData`
307 An updated version of the input structure, with
308 `RawExposureData.dataId` and nested `RawFileData.dataId` attributes
309 containing `~lsst.daf.butler.ExpandedDataCoordinate` instances.
314 data.dataId = self.
butler.registry.expandDataId(
321 self.
butler.registry.dimensions[
"exposure"]: data.record,
327 for file
in data.files:
328 for dataset
in file.datasets:
329 dataset.dataId = self.
butler.registry.expandDataId(
331 records=dict(data.dataId.records)
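
    # Expansion sketch (values hypothetical): before expansion a data ID only
    # carries key/value pairs such as {"instrument": "HSC", "exposure": 903334,
    # "detector": 16}; after ``registry.expandDataId`` it also carries the
    # matching dimension records, so ``data.dataId.records`` includes the
    # exposure record attached above.
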
    def prep(self, files, *, pool: Optional[Pool] = None, processes: int = 1) -> Iterator[RawExposureData]:
        """Perform all ingest preprocessing steps that do not involve actually
        modifying the database.

        Parameters
        ----------
        files : iterable over `str` or path-like objects
            Paths to the files to be ingested. Will be made absolute
            if they are not already.
        pool : `multiprocessing.Pool`, optional
            If not `None`, a process pool with which to parallelize some
            operations.
        processes : `int`, optional
            The number of processes to use. Ignored if ``pool`` is not `None`.

        Yields
        ------
        exposure : `RawExposureData`
            Data structures containing dimension records, filenames, and data
            IDs to be ingested (one structure for each exposure).
        """
        if pool is None and processes > 1:
            pool = Pool(processes)
        mapFunc = map if pool is None else pool.imap_unordered
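
        # Hedged sketch of how the remainder of this method (elided here) is
        # expected to use ``mapFunc``: metadata extraction is mapped over the
        # files, the results are grouped by exposure, and each group's data
        # IDs are expanded lazily, roughly
        #
        #     fileData = mapFunc(self.extractMetadata, files)
        #     exposureData = self.groupByExposure(fileData)
        #     return (self.expandDataIds(exposure) for exposure in exposureData)
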
    def ingestExposureDatasets(self, exposure: RawExposureData, *, run: Optional[str] = None
                               ) -> List[DatasetRef]:
        """Ingest all raw files in one exposure.

        Parameters
        ----------
        exposure : `RawExposureData`
            A structure containing information about the exposure to be
            ingested. Must have `RawExposureData.record` populated and all
            data ID attributes expanded.
        run : `str`, optional
            Name of a RUN-type collection to write to, overriding
            the default derived from the instrument name.

        Returns
        -------
        refs : `list` of `lsst.daf.butler.DatasetRef`
            Dataset references for ingested raws.
        """
        datasets = [FileDataset(path=os.path.abspath(file.filename),
                                refs=[DatasetRef(self.datasetType, d.dataId) for d in file.datasets],
                                formatter=file.FormatterClass)
                    for file in exposure.files]
        self.butler.ingest(*datasets, transfer=self.config.transfer, run=run)
        return [ref for dataset in datasets for ref in dataset.refs]
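
    # Illustrative shape of what ``self.butler.ingest`` receives for a
    # single-detector exposure (values hypothetical):
    #
    #     FileDataset(path="/abs/path/raw_903334.fits",
    #                 refs=[DatasetRef(self.datasetType,
    #                                  {"instrument": "HSC",
    #                                   "exposure": 903334,
    #                                   "detector": 16})],
    #                 formatter=FitsRawFormatterBase)
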
    def run(self, files, *, pool: Optional[Pool] = None, processes: int = 1, run: Optional[str] = None):
        """Ingest files into a Butler data repository.

        This creates any new exposure or visit Dimension entries needed to
        identify the ingested files, creates new Dataset entries in the
        Registry and finally ingests the files themselves into the Datastore.
        Any needed instrument, detector, and physical_filter Dimension entries
        must exist in the Registry before `run` is called.

        Parameters
        ----------
        files : iterable over `str` or path-like objects
            Paths to the files to be ingested. Will be made absolute
            if they are not already.
        pool : `multiprocessing.Pool`, optional
            If not `None`, a process pool with which to parallelize some
            operations.
        processes : `int`, optional
            The number of processes to use. Ignored if ``pool`` is not `None`.
        run : `str`, optional
            Name of a RUN-type collection to write to, overriding
            the default derived from the instrument name.

        Returns
        -------
        refs : `list` of `lsst.daf.butler.DatasetRef`
            Dataset references for ingested raws.

        Notes
        -----
        This method inserts all datasets for an exposure within a transaction,
        guaranteeing that partial exposures are never ingested. The exposure
        dimension record is inserted with `Registry.syncDimensionData` first
        (in its own transaction), which inserts only if a record with the same
        primary key does not already exist. This allows different files within
        the same exposure to be ingested in different runs.
        """
        exposureData = self.prep(files, pool=pool, processes=processes)
        # Ingest one exposure at a time so a failure only rolls back that
        # exposure's transaction.
        refs = []
        runs = set()
        for exposure in exposureData:
            self.butler.registry.syncDimensionData("exposure", exposure.record)

            # Use the instrument's default raw-ingest run unless one was
            # specified explicitly.
            if run is None:
                instrumentClass = exposure.files[0].instrumentClass
                this_run = instrumentClass.makeDefaultRawIngestRunName()
            else:
                this_run = run
            if this_run not in runs:
                self.butler.registry.registerCollection(this_run, type=CollectionType.RUN)
                runs.add(this_run)
            with self.butler.transaction():
                refs.extend(self.ingestExposureDatasets(exposure, run=this_run))
        return refs