__all__ = ("RawIngestTask", "RawIngestConfig", "makeTransferChoiceField")
import os
from dataclasses import dataclass, InitVar
from typing import List, Iterator, Iterable, Type, Optional, Any
from collections import defaultdict
from multiprocessing import Pool

from astro_metadata_translator import ObservationInfo, fix_header, merge_headers
from lsst.afw.fits import readMetadata
from lsst.daf.butler import (
    Butler,
    CollectionType,
    DataCoordinate,
    DatasetRef,
    DatasetType,
    DimensionRecord,
    DimensionUniverse,
    FileDataset,
    Formatter,
)
from lsst.pex.config import Config, ChoiceField
from lsst.pipe.base import Task

from ._instrument import Instrument, makeExposureRecordFromObsInfo
from ._fitsRawFormatterBase import FitsRawFormatterBase
53 """Structure that holds information about a single dataset within a
57 dataId: DataCoordinate
58 """Data ID for this file (`lsst.daf.butler.DataCoordinate`).
61 obsInfo: ObservationInfo
62 """Standardized observation metadata extracted directly from the file
63 headers (`astro_metadata_translator.ObservationInfo`).
69 """Structure that holds information about a single raw file, used during
73 datasets: List[RawFileDatasetInfo]
74 """The information describing each dataset within this raw file.
75 (`list` of `RawFileDatasetInfo`)
79 """Name of the file this information was extracted from (`str`).
81 This is the path prior to ingest, not the path after ingest.
84 FormatterClass: Type[FitsRawFormatterBase]
85 """Formatter class that should be used to ingest this file (`type`; as
86 subclass of `FitsRawFormatterBase`).
89 instrumentClass: Optional[Type[Instrument]]
90 """The `Instrument` class associated with this file. Can be `None`
91 if ``datasets`` is an empty list."""
96 """Structure that holds information about a complete raw exposure, used
100 dataId: DataCoordinate
101 """Data ID for this exposure (`lsst.daf.butler.DataCoordinate`).
104 files: List[RawFileData]
105 """List of structures containing file-level information.
108 universe: InitVar[DimensionUniverse]
109 """Set of all known dimensions.
112 record: Optional[DimensionRecord] =
None
113 """The exposure `DimensionRecord` that must be inserted into the
114 `~lsst.daf.butler.Registry` prior to file-level ingest (`DimensionRecord`).
124 """Create a Config field with options for how to transfer files between
127 The allowed options for the field are exactly those supported by
128 `lsst.daf.butler.Datastore.ingest`.
133 Documentation for the configuration field.
137 field : `lsst.pex.config.ChoiceField`
143 allowed={
"move":
"move",
145 "auto":
"choice will depend on datastore",
146 "link":
"hard link falling back to symbolic link",
147 "hardlink":
"hard link",
148 "symlink":
"symbolic (soft) link",
149 "relsymlink":
"relative symbolic link",
161 """Driver Task for ingesting raw data into Gen3 Butler repositories.
165 config : `RawIngestConfig`
166 Configuration for the task.
167 butler : `~lsst.daf.butler.Butler`
168 Writeable butler instance, with ``butler.run`` set to the appropriate
169 `~lsst.daf.butler.CollectionType.RUN` collection for these raw
172 Additional keyword arguments are forwarded to the `lsst.pipe.base.Task`
177 Each instance of `RawIngestTask` writes to the same Butler. Each
178 invocation of `RawIngestTask.run` ingests a list of files.
181 ConfigClass = RawIngestConfig
183 _DefaultName =
"ingest"
186 """Return the DatasetType of the datasets ingested by this Task.
188 return DatasetType(
"raw", (
"instrument",
"detector",
"exposure"),
"Exposure",
189 universe=self.
butler.registry.dimensions)
    def __init__(self, config: Optional[RawIngestConfig] = None, *, butler: Butler, **kwargs: Any):
        super().__init__(config, **kwargs)
        self.butler = butler
        self.universe = self.butler.registry.dimensions
        self.datasetType = self.getDatasetType()

        # Import all known instrument classes so that the relevant metadata
        # translators are registered.
        Instrument.importAll(self.butler.registry)
    @classmethod
    def _makeTask(cls, config: RawIngestConfig, butler: Butler, name: str, parentTask: Task):
        """Construct a RawIngestTask using only positional arguments.

        All parameters are as for `RawIngestTask`.
        """
        return cls(config=config, butler=butler, name=name, parentTask=parentTask)

    def __reduce__(self):
        # Support pickling: rebuild the task from picklable positional args.
        return (self._makeTask, (self.config, self.butler, self._name, self._parentTask))
219 """Extract and process metadata from a single raw file.
229 A structure containing the metadata extracted from the file,
230 as well as the original filename. All fields will be populated,
231 but the `RawFileData.dataId` attribute will be a minimal
232 (unexpanded) `DataCoordinate` instance.
236 Assumes that there is a single dataset associated with the given
237 file. Instruments using a single file to store multiple datasets
238 must implement their own version of this method.
        try:
            # Read the primary header and merge in the remaining headers.
            phdu = readMetadata(filename, 0)
            header = merge_headers([phdu, readMetadata(filename)], mode="overwrite")
            fix_header(header)
            datasets = [self._calculate_dataset_info(header, filename)]
        except Exception as e:
            self.log.debug("Problem extracting metadata from %s: %s", filename, e)
            # Indicate to the caller that this file could not be read.
            datasets = []
            FormatterClass = Formatter
            instrument = None
        else:
            self.log.debug("Extracted metadata from file %s", filename)
            instrument = Instrument.fromName(datasets[0].dataId["instrument"], self.butler.registry)
            if instrument is None:
                self.log.warning("Instrument %s for file %s not known to registry",
                                 datasets[0].dataId["instrument"], filename)
                FormatterClass = Formatter
            else:
                FormatterClass = instrument.getRawFormatter(datasets[0].dataId)

        return RawFileData(datasets=datasets, filename=filename,
                           FormatterClass=FormatterClass,
                           instrumentClass=instrument)
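    # The Notes for `extractMetadata` say that instruments packing several
    # datasets into one file must override it.  A hypothetical sketch of such
    # an override (the subclass name, HDU loop, and helper logic are
    # assumptions, not part of this module):
    #
    #     class MultiDatasetRawIngestTask(RawIngestTask):
    #         def extractMetadata(self, filename):
    #             headers = [readMetadata(filename, hdu) for hdu in range(1, n_hdus)]
    #             datasets = [self._calculate_dataset_info(h, filename) for h in headers]
    #             ...  # choose FormatterClass and instrument as in the base class
    #             return RawFileData(datasets=datasets, filename=filename,
    #                                FormatterClass=FormatterClass,
    #                                instrumentClass=instrument)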
    def _calculate_dataset_info(self, header, filename):
        """Calculate a RawFileDatasetInfo from the supplied information.

        Parameters
        ----------
        header : `Mapping`
            Header from the dataset.
        filename : `str`
            Filename to use for error messages.

        Returns
        -------
        dataset : `RawFileDatasetInfo`
            The dataId, and observation information associated with this
            dataset.
        """
        obsInfo = ObservationInfo(header)
        dataId = DataCoordinate.standardize(instrument=obsInfo.instrument,
                                            exposure=obsInfo.exposure_id,
                                            detector=obsInfo.detector_num,
                                            universe=self.universe)
        return RawFileDatasetInfo(obsInfo=obsInfo, dataId=dataId)
302 """Group an iterable of `RawFileData` by exposure.
306 files : iterable of `RawFileData`
307 File-level information to group.
311 exposures : `list` of `RawExposureData`
312 A list of structures that group the file-level information by
313 exposure. All fields will be populated. The
314 `RawExposureData.dataId` attributes will be minimal (unexpanded)
315 `DataCoordinate` instances.
317 exposureDimensions = self.
universe[
"exposure"].graph
318 byExposure = defaultdict(list)
321 byExposure[f.datasets[0].dataId.subset(exposureDimensions)].append(f)
324 for dataId, exposureFiles
in byExposure.items()]
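    # Illustration only (values are hypothetical): three files from two
    # exposures would come back as two entries, e.g.
    #
    #     [RawExposureData(dataId={instrument: "Cam", exposure: 1}, files=[f1, f2], ...),
    #      RawExposureData(dataId={instrument: "Cam", exposure: 2}, files=[f3], ...)]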
327 """Expand the data IDs associated with a raw exposure to include
328 additional metadata records.
332 exposure : `RawExposureData`
333 A structure containing information about the exposure to be
334 ingested. Must have `RawExposureData.records` populated. Should
335 be considered consumed upon return.
339 exposure : `RawExposureData`
340 An updated version of the input structure, with
341 `RawExposureData.dataId` and nested `RawFileData.dataId` attributes
342 updated to data IDs for which `DataCoordinate.hasRecords` returns
348 data.dataId = self.
butler.registry.expandDataId(
355 self.
butler.registry.dimensions[
"exposure"]: data.record,
361 for file
in data.files:
362 for dataset
in file.datasets:
363 dataset.dataId = self.
butler.registry.expandDataId(
365 records=dict(data.dataId.records)
    def prep(self, files, *, pool: Optional[Pool] = None, processes: int = 1) -> Iterator[RawExposureData]:
        """Perform all ingest preprocessing steps that do not involve actually
        modifying the database.

        Parameters
        ----------
        files : iterable over `str` or path-like objects
            Paths to the files to be ingested. Will be made absolute
            if they are not already.
        pool : `multiprocessing.Pool`, optional
            If not `None`, a process pool with which to parallelize some
            operations.
        processes : `int`, optional
            The number of processes to use. Ignored if ``pool`` is not `None`.

        Returns
        -------
        exposure : `RawExposureData`
            Data structures containing dimension records, filenames, and data
            IDs to be ingested (one structure for each exposure).
        bad_files : `list` of `str`
            List of all the files that could not have metadata extracted.
        """
        if pool is None and processes > 1:
            pool = Pool(processes)
        mapFunc = map if pool is None else pool.imap_unordered

        # Extract metadata from each file, in parallel if a pool is available.
        fileData = mapFunc(self.extractMetadata, files)

        # Separate successful reads from failures, keeping the failures for
        # later reporting.
        good_files = []
        bad_files = []
        for fileDatum in fileData:
            if not fileDatum.datasets:
                bad_files.append(fileDatum.filename)
            else:
                good_files.append(fileDatum)
        fileData = good_files

        self.log.info("Successfully extracted metadata from %d file%s with %d failure%s",
                      len(fileData), "" if len(fileData) == 1 else "s",
                      len(bad_files), "" if len(bad_files) == 1 else "s")
    def ingestExposureDatasets(self, exposure: RawExposureData, *, run: Optional[str] = None
                               ) -> List[DatasetRef]:
        """Ingest all raw files in one exposure.

        Parameters
        ----------
        exposure : `RawExposureData`
            A structure containing information about the exposure to be
            ingested. Must have `RawExposureData.record` populated and all
            data ID attributes expanded.
        run : `str`, optional
            Name of a RUN-type collection to write to, overriding
            ``self.butler.run``.

        Returns
        -------
        refs : `list` of `lsst.daf.butler.DatasetRef`
            Dataset references for ingested raws.
        """
        datasets = [FileDataset(path=os.path.abspath(file.filename),
                                refs=[DatasetRef(self.datasetType, d.dataId) for d in file.datasets],
                                formatter=file.FormatterClass)
                    for file in exposure.files]
        self.butler.ingest(*datasets, transfer=self.config.transfer, run=run)
        return [ref for dataset in datasets for ref in dataset.refs]
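    # Illustrative shape of the payload handed to `Butler.ingest` above (the
    # path and data ID values are hypothetical): one `FileDataset` per raw
    # file, carrying one ref per dataset found in that file.
    #
    #     FileDataset(path="/data/raw_0001.fits",
    #                 refs=[DatasetRef(rawType, {"instrument": "Cam", "exposure": 1,
    #                                            "detector": 0})],
    #                 formatter=SomeFitsRawFormatter)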
    def run(self, files, *, pool: Optional[Pool] = None, processes: int = 1, run: Optional[str] = None):
        """Ingest files into a Butler data repository.

        This creates any new exposure or visit Dimension entries needed to
        identify the ingested files, creates new Dataset entries in the
        Registry and finally ingests the files themselves into the Datastore.
        Any needed instrument, detector, and physical_filter Dimension entries
        must exist in the Registry before `run` is called.

        Parameters
        ----------
        files : iterable over `str` or path-like objects
            Paths to the files to be ingested. Will be made absolute
            if they are not already.
        pool : `multiprocessing.Pool`, optional
            If not `None`, a process pool with which to parallelize some
            operations.
        processes : `int`, optional
            The number of processes to use. Ignored if ``pool`` is not `None`.
        run : `str`, optional
            Name of a RUN-type collection to write to, overriding
            the default derived from the instrument name.

        Returns
        -------
        refs : `list` of `lsst.daf.butler.DatasetRef`
            Dataset references for ingested raws.

        Notes
        -----
        This method inserts all datasets for an exposure within a transaction,
        guaranteeing that partial exposures are never ingested. The exposure
        dimension record is inserted with `Registry.syncDimensionData` first
        (in its own transaction), which inserts only if a record with the same
        primary key does not already exist. This allows different files within
        the same exposure to be ingested in different runs.
        """
        exposureData, bad_files = self.prep(files, pool=pool, processes=processes)
        refs = []
        runs = set()
        n_exposures = 0
        n_exposures_failed = 0
        n_ingests_failed = 0
        for exposure in exposureData:

            self.log.debug("Attempting to ingest %d file%s from exposure %s:%s",
                           len(exposure.files), "" if len(exposure.files) == 1 else "s",
                           exposure.record.instrument, exposure.record.name)

            try:
                self.butler.registry.syncDimensionData("exposure", exposure.record)
            except Exception as e:
                n_exposures_failed += 1
                self.log.warning("Exposure %s:%s could not be registered: %s",
                                 exposure.record.instrument, exposure.record.name, e)
                continue

            # Determine the output run, deriving the default from the
            # instrument if no override was supplied.
            if run is None:
                instrumentClass = exposure.files[0].instrumentClass
                this_run = instrumentClass.makeDefaultRawIngestRunName()
            else:
                this_run = run
            if this_run not in runs:
                self.butler.registry.registerCollection(this_run, type=CollectionType.RUN)
                runs.add(this_run)

            try:
                with self.butler.transaction():
                    refs.extend(self.ingestExposureDatasets(exposure, run=this_run))
            except Exception as e:
                n_ingests_failed += 1
                self.log.warning("Failed to ingest the following for reason: %s", e)
                for f in exposure.files:
                    self.log.warning("- %s", f.filename)
                continue

            n_exposures += 1
            self.log.info("Exposure %s:%s ingested successfully",
                          exposure.record.instrument, exposure.record.name)

        had_failure = False

        if bad_files:
            had_failure = True
            self.log.warning("Could not extract observation metadata from the following:")
            for f in bad_files:
                self.log.warning("- %s", f)

        self.log.info("Successfully processed data from %d exposure%s with %d failure%s from exposure"
                      " registration and %d failure%s from file ingest.",
                      n_exposures, "" if n_exposures == 1 else "s",
                      n_exposures_failed, "" if n_exposures_failed == 1 else "s",
                      n_ingests_failed, "" if n_ingests_failed == 1 else "s")
        if n_exposures_failed > 0 or n_ingests_failed > 0:
            had_failure = True
        self.log.info("Ingested %d distinct Butler dataset%s",
                      len(refs), "" if len(refs) == 1 else "s")

        if had_failure:
            raise RuntimeError("Some failures encountered during ingestion")

        return refs