__all__ = ("RawIngestTask", "RawIngestConfig", "makeTransferChoiceField")
import os

from dataclasses import dataclass, InitVar
from typing import List, Iterator, Iterable, Type, Optional, Any
from collections import defaultdict
from multiprocessing import Pool

from astro_metadata_translator import ObservationInfo, merge_headers
from lsst.afw.fits import readMetadata
from lsst.daf.butler import (
    Butler,
    CollectionType,
    DataCoordinate,
    DatasetRef,
    DatasetType,
    DimensionRecord,
    DimensionUniverse,
    FileDataset,
    Formatter,
)
from lsst.pex.config import Config, ChoiceField
from lsst.pipe.base import Task

from ._instrument import Instrument, makeExposureRecordFromObsInfo
from ._fitsRawFormatterBase import FitsRawFormatterBase
53 """Structure that holds information about a single dataset within a
57 dataId: DataCoordinate
58 """Data ID for this file (`lsst.daf.butler.DataCoordinate`).
61 obsInfo: ObservationInfo
62 """Standardized observation metadata extracted directly from the file
63 headers (`astro_metadata_translator.ObservationInfo`).
69 """Structure that holds information about a single raw file, used during
73 datasets: List[RawFileDatasetInfo]
74 """The information describing each dataset within this raw file.
75 (`list` of `RawFileDatasetInfo`)
79 """Name of the file this information was extracted from (`str`).
81 This is the path prior to ingest, not the path after ingest.
84 FormatterClass: Type[FitsRawFormatterBase]
85 """Formatter class that should be used to ingest this file (`type`; as
86 subclass of `FitsRawFormatterBase`).
89 instrumentClass: Optional[Type[Instrument]]
90 """The `Instrument` class associated with this file. Can be `None`
91 if ``datasets`` is an empty list."""
96 """Structure that holds information about a complete raw exposure, used
100 dataId: DataCoordinate
101 """Data ID for this exposure (`lsst.daf.butler.DataCoordinate`).
104 files: List[RawFileData]
105 """List of structures containing file-level information.
108 universe: InitVar[DimensionUniverse]
109 """Set of all known dimensions.
112 record: Optional[DimensionRecord] =
None
113 """The exposure `DimensionRecord` that must be inserted into the
114 `~lsst.daf.butler.Registry` prior to file-level ingest (`DimensionRecord`).
124 """Create a Config field with options for how to transfer files between
127 The allowed options for the field are exactly those supported by
128 `lsst.daf.butler.Datastore.ingest`.
133 Documentation for the configuration field.
137 field : `lsst.pex.config.ChoiceField`
    return ChoiceField(
        doc=doc,
        dtype=str,
        allowed={"move": "move",
                 "auto": "choice will depend on datastore",
                 "link": "hard link falling back to symbolic link",
                 "hardlink": "hard link",
                 "symlink": "symbolic (soft) link",
                 "relsymlink": "relative symbolic link",
                 },
        optional=True,
        default=default,
    )
161 """Driver Task for ingesting raw data into Gen3 Butler repositories.
165 config : `RawIngestConfig`
166 Configuration for the task.
167 butler : `~lsst.daf.butler.Butler`
168 Writeable butler instance, with ``butler.run`` set to the appropriate
169 `~lsst.daf.butler.CollectionType.RUN` collection for these raw
172 Additional keyword arguments are forwarded to the `lsst.pipe.base.Task`
177 Each instance of `RawIngestTask` writes to the same Butler. Each
178 invocation of `RawIngestTask.run` ingests a list of files.
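
    # A hedged usage sketch (repository path, run collection, and file names
    # below are placeholders, not part of this module):
    #
    #     butler = Butler("/path/to/repo", writeable=True, run="raw/all")
    #     task = RawIngestTask(config=RawIngestConfig(), butler=butler)
    #     task.run(["exp01.fits", "exp02.fits"])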

    ConfigClass = RawIngestConfig

    _DefaultName = "ingest"
186 """Return the DatasetType of the datasets ingested by this Task.
188 return DatasetType(
"raw", (
"instrument",
"detector",
"exposure"),
"Exposure",
189 universe=self.
butler.registry.dimensions)

    def __init__(self, config: Optional[RawIngestConfig] = None, *, butler: Butler, **kwargs: Any):
        super().__init__(config, **kwargs)
        self.butler = butler
        self.universe = self.butler.registry.dimensions
        self.datasetType = self.getDatasetType()
        # Import any instrument classes registered with this repository so
        # that `Instrument.fromName` lookups below can succeed.
        Instrument.importAll(self.butler.registry)

    def _reduce_kwargs(self):
        # Include the butler in the kwargs used to reconstruct this task when
        # it is serialized (e.g. for multiprocessing).
        return dict(**super()._reduce_kwargs(), butler=self.butler)
207 """Extract and process metadata from a single raw file.
217 A structure containing the metadata extracted from the file,
218 as well as the original filename. All fields will be populated,
219 but the `RawFileData.dataId` attribute will be a minimal
220 (unexpanded) `DataCoordinate` instance.
224 Assumes that there is a single dataset associated with the given
225 file. Instruments using a single file to store multiple datasets
226 must implement their own version of this method.
        try:
            # Read the primary header and the default header and merge them,
            # letting the later header take precedence where keys clash.
            phdu = readMetadata(filename, 0)
            header = merge_headers([phdu, readMetadata(filename)], mode="overwrite")
            datasets = [self._calculate_dataset_info(header, filename)]
        except Exception as e:
            self.log.debug("Problem extracting metadata from %s: %s", filename, e)
            # Metadata extraction failed: report no datasets for this file and
            # fall back to the generic Formatter.
            datasets = []
            FormatterClass = Formatter
            instrument = None
        else:
            self.log.debug("Extracted metadata from file %s", filename)
            try:
                instrument = Instrument.fromName(datasets[0].dataId["instrument"],
                                                 self.butler.registry)
            except LookupError:
                self.log.warning("Instrument %s for file %s not known to registry",
                                 datasets[0].dataId["instrument"], filename)
                datasets = []
                FormatterClass = Formatter
                instrument = None
            else:
                FormatterClass = instrument.getRawFormatter(datasets[0].dataId)

        return RawFileData(datasets=datasets, filename=filename,
                           FormatterClass=FormatterClass,
                           instrumentClass=instrument)
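
    # Instruments that store several datasets (e.g. all detectors) in a single
    # raw file are expected to override `extractMetadata`, as noted in its
    # docstring.  A rough, hypothetical sketch; ``read_per_detector_headers``
    # does not exist in this module:
    #
    #     class MultiDetectorRawIngestTask(RawIngestTask):
    #         def extractMetadata(self, filename):
    #             headers = read_per_detector_headers(filename)  # hypothetical
    #             datasets = [self._calculate_dataset_info(h, filename)
    #                         for h in headers]
    #             ...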

    def _calculate_dataset_info(self, header, filename):
        """Calculate a RawFileDatasetInfo from the supplied information.

        Parameters
        ----------
        header : `Mapping`
            Header from the dataset.
        filename : `str`
            Filename to use for error messages.

        Returns
        -------
        dataset : `RawFileDatasetInfo`
            The dataId, and observation information associated with this
            dataset.
        """
286 "altaz_begin":
False,
287 "boresight_rotation_coord":
False,
288 "boresight_rotation_angle":
False,
290 "datetime_begin":
True,
291 "datetime_end":
True,
292 "detector_num":
True,
293 "exposure_group":
False,
295 "exposure_time":
True,
297 "tracking_radec":
False,
299 "observation_counter":
False,
300 "observation_id":
True,
301 "observation_reason":
False,
302 "observation_type":
True,
303 "observing_day":
False,
304 "physical_filter":
True,
305 "science_program":
False,
        obsInfo = ObservationInfo(header, pedantic=False, filename=filename,
                                  required={k for k in ingest_subset if ingest_subset[k]},
                                  subset=set(ingest_subset))
        dataId = DataCoordinate.standardize(instrument=obsInfo.instrument,
                                            exposure=obsInfo.exposure_id,
                                            detector=obsInfo.detector_num,
                                            universe=self.universe)
        return RawFileDatasetInfo(obsInfo=obsInfo, dataId=dataId)
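
    # For illustration: the minimal data ID built above behaves like a mapping
    # with the instrument, exposure, and detector keys (values are made up):
    #
    #     dataId["instrument"]  # e.g. "TestCam"
    #     dataId["exposure"]    # e.g. 2020012800123
    #     dataId["detector"]    # e.g. 12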
320 """Group an iterable of `RawFileData` by exposure.
324 files : iterable of `RawFileData`
325 File-level information to group.
329 exposures : `list` of `RawExposureData`
330 A list of structures that group the file-level information by
331 exposure. All fields will be populated. The
332 `RawExposureData.dataId` attributes will be minimal (unexpanded)
333 `DataCoordinate` instances.
335 exposureDimensions = self.
universe[
"exposure"].graph
336 byExposure = defaultdict(list)
339 byExposure[f.datasets[0].dataId.subset(exposureDimensions)].append(f)
342 for dataId, exposureFiles
in byExposure.items()]
345 """Expand the data IDs associated with a raw exposure to include
346 additional metadata records.
350 exposure : `RawExposureData`
351 A structure containing information about the exposure to be
352 ingested. Must have `RawExposureData.records` populated. Should
353 be considered consumed upon return.
357 exposure : `RawExposureData`
358 An updated version of the input structure, with
359 `RawExposureData.dataId` and nested `RawFileData.dataId` attributes
360 updated to data IDs for which `DataCoordinate.hasRecords` returns
        # Expand the exposure-level data ID first, attaching the exposure
        # record so the per-file expansions below can reuse it.
        data.dataId = self.butler.registry.expandDataId(
            data.dataId,
            records={
                self.butler.registry.dimensions["exposure"]: data.record,
            }
        )
        # Now expand the per-file (exposure + detector) data IDs, passing in
        # the records already attached to the exposure data ID.
        for file in data.files:
            for dataset in file.datasets:
                dataset.dataId = self.butler.registry.expandDataId(
                    dataset.dataId,
                    records=dict(data.dataId.records)
                )
        return data

    def prep(self, files, *, pool: Optional[Pool] = None, processes: int = 1
             ) -> Iterator[RawExposureData]:
        """Perform all ingest preprocessing steps that do not involve actually
        modifying the database.

        Parameters
        ----------
        files : iterable over `str` or path-like objects
            Paths to the files to be ingested. Will be made absolute
            if they are not already.
        pool : `multiprocessing.Pool`, optional
            If not `None`, a process pool with which to parallelize some
            operations.
        processes : `int`, optional
            The number of processes to use. Ignored if ``pool`` is not `None`.

        Returns
        -------
        exposure : `RawExposureData`
            Data structures containing dimension records, filenames, and data
            IDs to be ingested (one structure for each exposure).
        bad_files : `list` of `str`
            List of all the files that could not have metadata extracted.
        """
        if pool is None and processes > 1:
            pool = Pool(processes)
        mapFunc = map if pool is None else pool.imap_unordered

        # Extract metadata from each file, in parallel if a pool is available.
        fileData = mapFunc(self.extractMetadata, files)

        # Separate successful extractions from failures; a file with no
        # datasets could not be read or translated.
        good_files = []
        bad_files = []
        for fileDatum in fileData:
            if not fileDatum.datasets:
                bad_files.append(fileDatum.filename)
            else:
                good_files.append(fileDatum)
        fileData = good_files

        self.log.info("Successfully extracted metadata from %d file%s with %d failure%s",
                      len(fileData), "" if len(fileData) == 1 else "s",
                      len(bad_files), "" if len(bad_files) == 1 else "s")

    def ingestExposureDatasets(self, exposure: RawExposureData, *, run: Optional[str] = None
                               ) -> List[DatasetRef]:
458 """Ingest all raw files in one exposure.
462 exposure : `RawExposureData`
463 A structure containing information about the exposure to be
464 ingested. Must have `RawExposureData.records` populated and all
465 data ID attributes expanded.
466 run : `str`, optional
467 Name of a RUN-type collection to write to, overriding
472 refs : `list` of `lsst.daf.butler.DatasetRef`
473 Dataset references for ingested raws.
        datasets = [FileDataset(path=os.path.abspath(file.filename),
                                refs=[DatasetRef(self.datasetType, d.dataId) for d in file.datasets],
                                formatter=file.FormatterClass)
                    for file in exposure.files]
        self.butler.ingest(*datasets, transfer=self.config.transfer, run=run)
        return [ref for dataset in datasets for ref in dataset.refs]

    def run(self, files, *, pool: Optional[Pool] = None, processes: int = 1, run: Optional[str] = None):
        """Ingest files into a Butler data repository.

        This creates any new exposure or visit Dimension entries needed to
        identify the ingested files, creates new Dataset entries in the
        Registry and finally ingests the files themselves into the Datastore.
        Any needed instrument, detector, and physical_filter Dimension entries
        must exist in the Registry before `run` is called.

        Parameters
        ----------
        files : iterable over `str` or path-like objects
            Paths to the files to be ingested. Will be made absolute
            if they are not already.
        pool : `multiprocessing.Pool`, optional
            If not `None`, a process pool with which to parallelize some
            operations.
        processes : `int`, optional
            The number of processes to use. Ignored if ``pool`` is not `None`.
        run : `str`, optional
            Name of a RUN-type collection to write to, overriding
            the default derived from the instrument name.

        Returns
        -------
        refs : `list` of `lsst.daf.butler.DatasetRef`
            Dataset references for ingested raws.

        Notes
        -----
        This method inserts all datasets for an exposure within a transaction,
        guaranteeing that partial exposures are never ingested. The exposure
        dimension record is inserted with `Registry.syncDimensionData` first
        (in its own transaction), which inserts only if a record with the same
        primary key does not already exist. This allows different files within
        the same exposure to be ingested in different runs.
        """
        exposureData, bad_files = self.prep(files, pool=pool, processes=processes)

        # Accumulate refs and per-exposure success/failure counts as we go.
        refs = []
        runs = set()
        n_exposures = 0
        n_exposures_failed = 0
        n_ingests_failed = 0
        for exposure in exposureData:
            self.log.debug("Attempting to ingest %d file%s from exposure %s:%s",
                           len(exposure.files), "" if len(exposure.files) == 1 else "s",
                           exposure.record.instrument, exposure.record.obs_id)

            try:
                self.butler.registry.syncDimensionData("exposure", exposure.record)
            except Exception as e:
                n_exposures_failed += 1
                self.log.warning("Exposure %s:%s could not be registered: %s",
                                 exposure.record.instrument, exposure.record.obs_id, e)
                continue
            instrumentClass = exposure.files[0].instrumentClass
            this_run = instrumentClass.makeDefaultRawIngestRunName()
            if run is not None:
                # An explicit run overrides the instrument-derived default.
                this_run = run
            if this_run not in runs:
                self.butler.registry.registerCollection(this_run, type=CollectionType.RUN)
                runs.add(this_run)

            try:
                with self.butler.transaction():
                    refs.extend(self.ingestExposureDatasets(exposure, run=this_run))
            except Exception as e:
                n_ingests_failed += 1
                self.log.warning("Failed to ingest the following for reason: %s", e)
                for f in exposure.files:
                    self.log.warning("- %s", f.filename)
                continue

            n_exposures += 1
            self.log.info("Exposure %s:%s ingested successfully",
                          exposure.record.instrument, exposure.record.obs_id)
        had_failure = False

        if bad_files:
            had_failure = True
            self.log.warning("Could not extract observation metadata from the following:")
            for f in bad_files:
                self.log.warning("- %s", f)

        self.log.info("Successfully processed data from %d exposure%s with %d failure%s from exposure"
                      " registration and %d failure%s from file ingest.",
                      n_exposures, "" if n_exposures == 1 else "s",
                      n_exposures_failed, "" if n_exposures_failed == 1 else "s",
                      n_ingests_failed, "" if n_ingests_failed == 1 else "s")
        if n_exposures_failed > 0 or n_ingests_failed > 0:
            had_failure = True
        self.log.info("Ingested %d distinct Butler dataset%s",
                      len(refs), "" if len(refs) == 1 else "s")

        if had_failure:
            raise RuntimeError("Some failures encountered during ingestion")

        return refs