__all__ = ("RawIngestTask", "RawIngestConfig", "makeTransferChoiceField")

import os.path

from dataclasses import dataclass, InitVar
from typing import List, Iterator, Iterable, Type, Optional, Any
from collections import defaultdict
from multiprocessing import Pool

from astro_metadata_translator import ObservationInfo, fix_header, merge_headers
from lsst.afw.fits import readMetadata
from lsst.daf.butler import (
    Butler,
    CollectionType,
    DataCoordinate,
    DatasetRef,
    DatasetType,
    DimensionRecord,
    DimensionUniverse,
    FileDataset,
)
from lsst.pex.config import Config, ChoiceField
from lsst.pipe.base import Task

from ._instrument import Instrument, makeExposureRecordFromObsInfo
from ._fitsRawFormatterBase import FitsRawFormatterBase
52 """Structure that holds information about a single dataset within a
    dataId: DataCoordinate
    """Data ID for this file (`lsst.daf.butler.DataCoordinate`).
    """

    obsInfo: ObservationInfo
    """Standardized observation metadata extracted directly from the file
    headers (`astro_metadata_translator.ObservationInfo`).
    """
68 """Structure that holds information about a single raw file, used during
    datasets: List[RawFileDatasetInfo]
    """The information describing each dataset within this raw file
    (`list` of `RawFileDatasetInfo`).
    """

78 """Name of the file this information was extracted from (`str`).
80 This is the path prior to ingest, not the path after ingest.
    FormatterClass: Type[FitsRawFormatterBase]
    """Formatter class that should be used to ingest this file (`type`; a
    subclass of `FitsRawFormatterBase`).
    """

    instrumentClass: Type[Instrument]
    """The `Instrument` class associated with this file."""
94 """Structure that holds information about a complete raw exposure, used
    dataId: DataCoordinate
    """Data ID for this exposure (`lsst.daf.butler.DataCoordinate`).
    """

    files: List[RawFileData]
    """List of structures containing file-level information.
    """

    universe: InitVar[DimensionUniverse]
    """Set of all known dimensions.
    """

    record: Optional[DimensionRecord] = None
    """The exposure `DimensionRecord` that must be inserted into the
    `~lsst.daf.butler.Registry` prior to file-level ingest (`DimensionRecord`).
    """
122 """Create a Config field with options for how to transfer files between
125 The allowed options for the field are exactly those supported by
126 `lsst.daf.butler.Datastore.ingest`.
131 Documentation for the configuration field.
135 field : `lsst.pex.config.ChoiceField`
    return ChoiceField(
        doc=doc,
        dtype=str,
        allowed={
            "move": "move",
            "copy": "copy",
            "auto": "choice will depend on datastore",
            "link": "hard link falling back to symbolic link",
            "hardlink": "hard link",
            "symlink": "symbolic (soft) link",
            "relsymlink": "relative symbolic link",
        },
        optional=True,
        default=default,
    )


class RawIngestConfig(Config):
    transfer = makeTransferChoiceField()
159 """Driver Task for ingesting raw data into Gen3 Butler repositories.
163 config : `RawIngestConfig`
164 Configuration for the task.
165 butler : `~lsst.daf.butler.Butler`
166 Writeable butler instance, with ``butler.run`` set to the appropriate
167 `~lsst.daf.butler.CollectionType.RUN` collection for these raw
170 Additional keyword arguments are forwarded to the `lsst.pipe.base.Task`
175 Each instance of `RawIngestTask` writes to the same Butler. Each
176 invocation of `RawIngestTask.run` ingests a list of files.
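
    # Typical usage, as a minimal sketch (the repository path and file names
    # are placeholders; the instrument, detector, and physical_filter
    # dimension records must already exist in the registry, see `run`):
    #
    #     butler = Butler("/path/to/repo", writeable=True)
    #     config = RawIngestConfig()
    #     config.transfer = "symlink"
    #     task = RawIngestTask(config=config, butler=butler)
    #     refs = task.run(["raw_0001.fits", "raw_0002.fits"])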

    ConfigClass = RawIngestConfig

    _DefaultName = "ingest"
184 """Return the DatasetType of the datasets ingested by this Task.
186 return DatasetType(
"raw", (
"instrument",
"detector",
"exposure"),
"Exposure",
187 universe=self.
butler.registry.dimensions)

    def __init__(self, config: Optional[RawIngestConfig] = None, *, butler: Butler, **kwargs: Any):
        super().__init__(config, **kwargs)
        self.butler = butler
        self.universe = self.butler.registry.dimensions
        self.datasetType = self.getDatasetType()
        Instrument.importAll(self.butler.registry)

    @classmethod
    def _makeTask(cls, config: RawIngestConfig, butler: Butler, name: str, parentTask: Task):
        """Construct a RawIngestTask using only positional arguments.

        All parameters are as for `RawIngestTask`.
        """
        return cls(config=config, butler=butler, name=name, parentTask=parentTask)
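
    # Support pickling, e.g. when instances are shipped to worker processes by
    # a multiprocessing Pool; unpickling rebuilds the task via _makeTask.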
    def __reduce__(self):
        return (self._makeTask, (self.config, self.butler, self._name, self._parentTask))
217 """Extract and process metadata from a single raw file.
227 A structure containing the metadata extracted from the file,
228 as well as the original filename. All fields will be populated,
229 but the `RawFileData.dataId` attribute will be a minimal
230 (unexpanded) `DataCoordinate` instance.
234 Assumes that there is a single dataset associated with the given
235 file. Instruments using a single file to store multiple datasets
236 must implement their own version of this method.
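        # The primary header is read first and then merged with the metadata
        # from the default HDU; with mode="overwrite", values from headers
        # later in the list take precedence where keywords conflict.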
        phdu = readMetadata(filename, 0)
        header = merge_headers([phdu, readMetadata(filename)], mode="overwrite")
        fix_header(header)
        datasets = [self._calculate_dataset_info(header, filename)]
        instrument = Instrument.fromName(datasets[0].dataId["instrument"], self.butler.registry)
        FormatterClass = instrument.getRawFormatter(datasets[0].dataId)
        return RawFileData(datasets=datasets, filename=filename,
                           FormatterClass=FormatterClass,
                           instrumentClass=instrument)

    def _calculate_dataset_info(self, header, filename):
        """Calculate a RawFileDatasetInfo from the supplied information.

        Parameters
        ----------
        header
            Header from the dataset.
        filename : `str`
            Filename to use for error messages.

        Returns
        -------
        dataset : `RawFileDatasetInfo`
            The dataId, and observation information associated with this
            dataset.
        """
        obsInfo = ObservationInfo(header)
        dataId = DataCoordinate.standardize(instrument=obsInfo.instrument,
                                            exposure=obsInfo.exposure_id,
                                            detector=obsInfo.detector_num,
                                            universe=self.universe)
        return RawFileDatasetInfo(obsInfo=obsInfo, dataId=dataId)
279 """Group an iterable of `RawFileData` by exposure.
283 files : iterable of `RawFileData`
284 File-level information to group.
288 exposures : `list` of `RawExposureData`
289 A list of structures that group the file-level information by
290 exposure. All fields will be populated. The
291 `RawExposureData.dataId` attributes will be minimal (unexpanded)
292 `DataCoordinate` instances.
        exposureDimensions = self.universe["exposure"].graph
        byExposure = defaultdict(list)
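        # Every dataset in a file belongs to the same exposure, so the first
        # dataset's data ID is sufficient to determine the grouping key.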
        for f in files:
            byExposure[f.datasets[0].dataId.subset(exposureDimensions)].append(f)

        return [RawExposureData(dataId=dataId, files=exposureFiles, universe=self.universe)
                for dataId, exposureFiles in byExposure.items()]
304 """Expand the data IDs associated with a raw exposure to include
305 additional metadata records.
309 exposure : `RawExposureData`
310 A structure containing information about the exposure to be
311 ingested. Must have `RawExposureData.records` populated. Should
312 be considered consumed upon return.
316 exposure : `RawExposureData`
317 An updated version of the input structure, with
318 `RawExposureData.dataId` and nested `RawFileData.dataId` attributes
319 updated to data IDs for which `DataCoordinate.hasRecords` returns
        data.dataId = self.butler.registry.expandDataId(
            data.dataId,
            records={
                self.butler.registry.dimensions["exposure"]: data.record,
            }
        )
        for file in data.files:
            for dataset in file.datasets:
                dataset.dataId = self.butler.registry.expandDataId(
                    dataset.dataId,
                    records=dict(data.dataId.records)
                )
        return data

    def prep(self, files, *, pool: Optional[Pool] = None, processes: int = 1) -> Iterator[RawExposureData]:
347 """Perform all ingest preprocessing steps that do not involve actually
348 modifying the database.
352 files : iterable over `str` or path-like objects
353 Paths to the files to be ingested. Will be made absolute
354 if they are not already.
355 pool : `multiprocessing.Pool`, optional
356 If not `None`, a process pool with which to parallelize some
358 processes : `int`, optional
359 The number of processes to use. Ignored if ``pool`` is not `None`.
363 exposure : `RawExposureData`
364 Data structures containing dimension records, filenames, and data
365 IDs to be ingested (one structure for each exposure).
        if pool is None and processes > 1:
            pool = Pool(processes)
        mapFunc = map if pool is None else pool.imap_unordered
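        # The builtin map runs everything in this process; with a pool,
        # imap_unordered streams results back as each worker finishes, in
        # whatever order they complete.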

    def ingestExposureDatasets(self, exposure: RawExposureData, *, run: Optional[str] = None
                               ) -> List[DatasetRef]:
        """Ingest all raw files in one exposure.

        Parameters
        ----------
        exposure : `RawExposureData`
            A structure containing information about the exposure to be
            ingested. Must have `RawExposureData.record` populated and all
            data ID attributes expanded.
        run : `str`, optional
            Name of a RUN-type collection to write to, overriding
            ``self.butler.run``.

        Returns
        -------
        refs : `list` of `lsst.daf.butler.DatasetRef`
            Dataset references for ingested raws.
        """
        datasets = [FileDataset(path=os.path.abspath(file.filename),
                                refs=[DatasetRef(self.datasetType, d.dataId) for d in file.datasets],
                                formatter=file.FormatterClass)
                    for file in exposure.files]
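        # Butler.ingest both registers the datasets with the registry and asks
        # the datastore to transfer (or link) the files, according to the
        # configured transfer mode.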
        self.butler.ingest(*datasets, transfer=self.config.transfer, run=run)
        return [ref for dataset in datasets for ref in dataset.refs]

    def run(self, files, *, pool: Optional[Pool] = None, processes: int = 1, run: Optional[str] = None):
423 """Ingest files into a Butler data repository.
425 This creates any new exposure or visit Dimension entries needed to
426 identify the ingested files, creates new Dataset entries in the
427 Registry and finally ingests the files themselves into the Datastore.
428 Any needed instrument, detector, and physical_filter Dimension entries
429 must exist in the Registry before `run` is called.

        Parameters
        ----------
        files : iterable over `str` or path-like objects
            Paths to the files to be ingested. Will be made absolute
            if they are not already.
        pool : `multiprocessing.Pool`, optional
            If not `None`, a process pool with which to parallelize some
            of the work.
        processes : `int`, optional
            The number of processes to use. Ignored if ``pool`` is not `None`.
        run : `str`, optional
            Name of a RUN-type collection to write to, overriding
            the default derived from the instrument name.

        Returns
        -------
        refs : `list` of `lsst.daf.butler.DatasetRef`
            Dataset references for ingested raws.

        Notes
        -----
        This method inserts all datasets for an exposure within a transaction,
        guaranteeing that partial exposures are never ingested. The exposure
        dimension record is inserted with `Registry.syncDimensionData` first
        (in its own transaction), which inserts only if a record with the same
        primary key does not already exist. This allows different files within
        the same exposure to be ingested in different runs.
        """
        exposureData = self.prep(files, pool=pool, processes=processes)
        runs = set()
        for exposure in exposureData:
            self.butler.registry.syncDimensionData("exposure", exposure.record)
            instrumentClass = exposure.files[0].instrumentClass
            this_run = instrumentClass.makeDefaultRawIngestRunName()
            if this_run not in runs:
                self.butler.registry.registerCollection(this_run, type=CollectionType.RUN)
                runs.add(this_run)
            with self.butler.transaction():