Coverage for python/lsst/obs/base/ingest.py : 27%

# This file is part of obs_base.
#
# Developed for the LSST Data Management System.
# This product includes software developed by the LSST Project
# (https://www.lsst.org).
# See the COPYRIGHT file at the top-level directory of this distribution
# for details of code ownership.
#
# This program is free software: you can redistribute it and/or modify
# it under the terms of the GNU General Public License as published by
# the Free Software Foundation, either version 3 of the License, or
# (at your option) any later version.
#
# This program is distributed in the hope that it will be useful,
# but WITHOUT ANY WARRANTY; without even the implied warranty of
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
# GNU General Public License for more details.
#
# You should have received a copy of the GNU General Public License
# along with this program. If not, see <http://www.gnu.org/licenses/>.

__all__ = ("RawIngestTask", "RawIngestConfig", "makeTransferChoiceField")

import os.path
from dataclasses import dataclass, InitVar
from typing import List, Iterator, Iterable, Type, Optional, Any, Tuple
from collections import defaultdict
from multiprocessing import Pool

from astro_metadata_translator import ObservationInfo, fix_header, merge_headers
from lsst.afw.fits import readMetadata
from lsst.daf.butler import (
    Butler,
    CollectionType,
    DataCoordinate,
    DatasetRef,
    DatasetType,
    DimensionRecord,
    DimensionUniverse,
    FileDataset,
    Formatter,
)
from lsst.pex.config import Config, ChoiceField
from lsst.pipe.base import Task

from ._instrument import Instrument, makeExposureRecordFromObsInfo
from ._fitsRawFormatterBase import FitsRawFormatterBase


@dataclass
class RawFileDatasetInfo:
    """Structure that holds information about a single dataset within a
    raw file.
    """

    dataId: DataCoordinate
    """Data ID for this file (`lsst.daf.butler.DataCoordinate`).
    """

    obsInfo: ObservationInfo
    """Standardized observation metadata extracted directly from the file
    headers (`astro_metadata_translator.ObservationInfo`).
    """


@dataclass
class RawFileData:
    """Structure that holds information about a single raw file, used during
    ingest.
    """

    datasets: List[RawFileDatasetInfo]
    """The information describing each dataset within this raw file
    (`list` of `RawFileDatasetInfo`).
    """

    filename: str
    """Name of the file this information was extracted from (`str`).

    This is the path prior to ingest, not the path after ingest.
    """

    FormatterClass: Type[FitsRawFormatterBase]
    """Formatter class that should be used to ingest this file (`type`; a
    subclass of `FitsRawFormatterBase`).
    """

    instrumentClass: Optional[Type[Instrument]]
    """The `Instrument` class associated with this file. Can be `None`
    if ``datasets`` is an empty list.
    """


@dataclass
class RawExposureData:
    """Structure that holds information about a complete raw exposure, used
    during ingest.
    """

    dataId: DataCoordinate
    """Data ID for this exposure (`lsst.daf.butler.DataCoordinate`).
    """

    files: List[RawFileData]
    """List of structures containing file-level information.
    """

    universe: InitVar[DimensionUniverse]
    """Set of all known dimensions.
    """

    record: Optional[DimensionRecord] = None
    """The exposure `DimensionRecord` that must be inserted into the
    `~lsst.daf.butler.Registry` prior to file-level ingest (`DimensionRecord`).
    """

    def __post_init__(self, universe: DimensionUniverse):
        # We don't care which file or dataset we read metadata from, because
        # we're assuming they'll all be the same; just use the first ones.
        self.record = makeExposureRecordFromObsInfo(self.files[0].datasets[0].obsInfo, universe)


def makeTransferChoiceField(doc="How to transfer files (None for no transfer).", default="auto"):
    """Create a Config field with options for how to transfer files between
    data repositories.

    The allowed options for the field are exactly those supported by
    `lsst.daf.butler.Datastore.ingest`.

    Parameters
    ----------
    doc : `str`
        Documentation for the configuration field.
    default : `str`, optional
        Default transfer mode for the field.

    Returns
    -------
    field : `lsst.pex.config.ChoiceField`
        Configuration field.
    """
    return ChoiceField(
        doc=doc,
        dtype=str,
        allowed={"move": "move",
                 "copy": "copy",
                 "auto": "choice will depend on datastore",
                 "link": "hard link falling back to symbolic link",
                 "hardlink": "hard link",
                 "symlink": "symbolic (soft) link",
                 "relsymlink": "relative symbolic link",
                 },
        optional=True,
        default=default,
    )


class RawIngestConfig(Config):
    transfer = makeTransferChoiceField()
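

# A minimal usage sketch (not part of the original module): overriding the
# transfer mode on a task configuration. The "symlink" value is one of the
# choices defined by makeTransferChoiceField above; the variable names are
# illustrative only.
#
#     config = RawIngestConfig()
#     config.transfer = "symlink"   # validated against the ChoiceField choices
#     config.validate()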


class RawIngestTask(Task):
    """Driver Task for ingesting raw data into Gen3 Butler repositories.

    Parameters
    ----------
    config : `RawIngestConfig`
        Configuration for the task.
    butler : `~lsst.daf.butler.Butler`
        Writeable butler instance, with ``butler.run`` set to the appropriate
        `~lsst.daf.butler.CollectionType.RUN` collection for these raw
        datasets.
    **kwargs
        Additional keyword arguments are forwarded to the `lsst.pipe.base.Task`
        constructor.

    Notes
    -----
    Each instance of `RawIngestTask` writes to the same Butler. Each
    invocation of `RawIngestTask.run` ingests a list of files.
    """

    ConfigClass = RawIngestConfig

    _DefaultName = "ingest"

    def getDatasetType(self):
        """Return the DatasetType of the datasets ingested by this Task.
        """
        return DatasetType("raw", ("instrument", "detector", "exposure"), "Exposure",
                           universe=self.butler.registry.dimensions)

    def __init__(self, config: Optional[RawIngestConfig] = None, *, butler: Butler, **kwargs: Any):
        config.validate()  # Not a CmdlineTask nor PipelineTask, so have to validate the config here.
        super().__init__(config, **kwargs)
        self.butler = butler
        self.universe = self.butler.registry.dimensions
        self.datasetType = self.getDatasetType()

        # Import all the instrument classes to ensure that all the relevant
        # metadata translators have been loaded.
        Instrument.importAll(self.butler.registry)

    def _reduce_kwargs(self):
        # Add extra parameters to pickle.
        return dict(**super()._reduce_kwargs(), butler=self.butler)

    def extractMetadata(self, filename: str) -> RawFileData:
        """Extract and process metadata from a single raw file.

        Parameters
        ----------
        filename : `str`
            Path to the file.

        Returns
        -------
        data : `RawFileData`
            A structure containing the metadata extracted from the file,
            as well as the original filename. All fields will be populated,
            but the data IDs of the contained `RawFileDatasetInfo` instances
            will be minimal (unexpanded) `DataCoordinate` instances.

        Notes
        -----
        Assumes that there is a single dataset associated with the given
        file. Instruments using a single file to store multiple datasets
        must implement their own version of this method.
        """
        # We do not want to stop ingest if we are given a bad file.
        # Instead return a RawFileData with no datasets and allow
        # the caller to report the failure.
        try:
            # Manually merge the primary and "first data" headers here because
            # we do not know in general if an input file has set INHERIT=T.
            phdu = readMetadata(filename, 0)
            header = merge_headers([phdu, readMetadata(filename)], mode="overwrite")
            fix_header(header)
            datasets = [self._calculate_dataset_info(header, filename)]
        except Exception as e:
            self.log.debug("Problem extracting metadata from %s: %s", filename, e)
            # Indicate to the caller that we failed to read.
            datasets = []
            FormatterClass = Formatter
            instrument = None
        else:
            self.log.debug("Extracted metadata from file %s", filename)
            # The data model currently assumes that whilst multiple datasets
            # can be associated with a single file, they must all share the
            # same formatter.
            try:
                instrument = Instrument.fromName(datasets[0].dataId["instrument"], self.butler.registry)
            except LookupError:
                self.log.warning("Instrument %s for file %s not known to registry",
                                 datasets[0].dataId["instrument"], filename)
                datasets = []
                FormatterClass = Formatter
                instrument = None
            else:
                FormatterClass = instrument.getRawFormatter(datasets[0].dataId)

        return RawFileData(datasets=datasets, filename=filename,
                           FormatterClass=FormatterClass,
                           instrumentClass=instrument)
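
    # A minimal usage sketch (not part of the original module): per the Notes
    # above, a failed read is signalled by an empty ``datasets`` list rather
    # than an exception, so callers can check the result like this (the path
    # is illustrative only):
    #
    #     data = task.extractMetadata("/path/to/raw.fits")
    #     if not data.datasets:
    #         print(f"Could not extract metadata from {data.filename}")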

    def _calculate_dataset_info(self, header, filename):
        """Calculate a RawFileDatasetInfo from the supplied information.

        Parameters
        ----------
        header : `Mapping`
            Header from the dataset.
        filename : `str`
            Filename to use for error messages.

        Returns
        -------
        dataset : `RawFileDatasetInfo`
            The dataId and observation information associated with this
            dataset.
        """
        obsInfo = ObservationInfo(header)
        dataId = DataCoordinate.standardize(instrument=obsInfo.instrument,
                                            exposure=obsInfo.exposure_id,
                                            detector=obsInfo.detector_num,
                                            universe=self.universe)
        return RawFileDatasetInfo(obsInfo=obsInfo, dataId=dataId)

    def groupByExposure(self, files: Iterable[RawFileData]) -> List[RawExposureData]:
        """Group an iterable of `RawFileData` by exposure.

        Parameters
        ----------
        files : iterable of `RawFileData`
            File-level information to group.

        Returns
        -------
        exposures : `list` of `RawExposureData`
            A list of structures that group the file-level information by
            exposure. All fields will be populated. The
            `RawExposureData.dataId` attributes will be minimal (unexpanded)
            `DataCoordinate` instances.
        """
        exposureDimensions = self.universe["exposure"].graph
        byExposure = defaultdict(list)
        for f in files:
            # Assume that the first dataset is representative for the file.
            byExposure[f.datasets[0].dataId.subset(exposureDimensions)].append(f)

        return [RawExposureData(dataId=dataId, files=exposureFiles, universe=self.universe)
                for dataId, exposureFiles in byExposure.items()]

    def expandDataIds(self, data: RawExposureData) -> RawExposureData:
        """Expand the data IDs associated with a raw exposure to include
        additional metadata records.

        Parameters
        ----------
        data : `RawExposureData`
            A structure containing information about the exposure to be
            ingested. Must have `RawExposureData.record` populated. Should
            be considered consumed upon return.

        Returns
        -------
        exposure : `RawExposureData`
            An updated version of the input structure, with
            `RawExposureData.dataId` and nested `RawFileDatasetInfo.dataId`
            attributes updated to data IDs for which
            `DataCoordinate.hasRecords` returns `True`.
        """
        # We start by expanding the exposure-level data ID; we won't use that
        # directly in file ingest, but this lets us do some database lookups
        # once per exposure instead of once per file later.
        data.dataId = self.butler.registry.expandDataId(
            data.dataId,
            # We pass in the records we'll be inserting shortly so they aren't
            # looked up from the database. We do expect instrument and filter
            # records to be retrieved from the database here (though the
            # Registry may cache them so there isn't a lookup every time).
            records={
                self.butler.registry.dimensions["exposure"]: data.record,
            }
        )
        # Now we expand the per-file (exposure+detector) data IDs. This time
        # we pass in the records we just retrieved from the exposure data ID
        # expansion.
        for file in data.files:
            for dataset in file.datasets:
                dataset.dataId = self.butler.registry.expandDataId(
                    dataset.dataId,
                    records=dict(data.dataId.records)
                )
        return data
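
    # A minimal usage sketch (not part of the original module), illustrating
    # the docstring's post-condition that expanded data IDs carry their
    # dimension records; ``exposure`` here stands for one of the structures
    # produced by groupByExposure:
    #
    #     exposure = task.expandDataIds(exposure)
    #     assert exposure.dataId.hasRecords()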

    def prep(self, files, *, pool: Optional[Pool] = None, processes: int = 1
             ) -> Tuple[Iterator[RawExposureData], List[str]]:
        """Perform all ingest preprocessing steps that do not involve actually
        modifying the database.

        Parameters
        ----------
        files : iterable over `str` or path-like objects
            Paths to the files to be ingested. Will be made absolute
            if they are not already.
        pool : `multiprocessing.Pool`, optional
            If not `None`, a process pool with which to parallelize some
            operations.
        processes : `int`, optional
            The number of processes to use. Ignored if ``pool`` is not `None`.

        Returns
        -------
        exposures : iterator of `RawExposureData`
            Data structures containing dimension records, filenames, and data
            IDs to be ingested (one structure for each exposure).
        bad_files : `list` of `str`
            List of all the files that could not have metadata extracted.
        """
        if pool is None and processes > 1:
            pool = Pool(processes)
        mapFunc = map if pool is None else pool.imap_unordered

        # Extract metadata and build per-detector regions.
        # This could run in a subprocess so collect all output
        # before looking at failures.
        fileData: Iterator[RawFileData] = mapFunc(self.extractMetadata, files)

        # Filter out all the failed reads and store them for later
        # reporting.
        good_files = []
        bad_files = []
        for fileDatum in fileData:
            if not fileDatum.datasets:
                bad_files.append(fileDatum.filename)
            else:
                good_files.append(fileDatum)
        fileData = good_files

        self.log.info("Successfully extracted metadata from %d file%s with %d failure%s",
                      len(fileData), "" if len(fileData) == 1 else "s",
                      len(bad_files), "" if len(bad_files) == 1 else "s")

        # Use that metadata to group files (and extracted metadata) by
        # exposure. Never parallelized because it's intrinsically a gather
        # step.
        exposureData: List[RawExposureData] = self.groupByExposure(fileData)

        # The next operation operates on RawExposureData instances (one at
        # a time) in-place and then returns the modified instance. We call it
        # as a pass-through instead of relying on the arguments we pass in to
        # have been modified because in the parallel case those arguments are
        # going to be pickled and unpickled, and I'm not certain
        # multiprocessing is careful enough with that for output arguments to
        # work.

        # Expand the data IDs to include all dimension metadata; we need this
        # because we may need to generate path templates that rely on that
        # metadata.
        # This is the first step that involves actual database calls (but just
        # SELECTs), so if there's going to be a problem with connections vs.
        # multiple processes, or lock contention (in SQLite) slowing things
        # down, it'll happen here.
        return mapFunc(self.expandDataIds, exposureData), bad_files
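
    # A minimal usage sketch (not part of the original module): running the
    # metadata-extraction and data-ID-expansion steps with an explicit process
    # pool. The pool size and file list are illustrative only; the returned
    # iterator is drained while the pool is still alive.
    #
    #     from multiprocessing import Pool
    #
    #     with Pool(4) as pool:
    #         exposures, bad_files = task.prep(file_list, pool=pool)
    #         exposures = list(exposures)
    #     if bad_files:
    #         print("Failed to read:", bad_files)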

    def ingestExposureDatasets(self, exposure: RawExposureData, *, run: Optional[str] = None
                               ) -> List[DatasetRef]:
        """Ingest all raw files in one exposure.

        Parameters
        ----------
        exposure : `RawExposureData`
            A structure containing information about the exposure to be
            ingested. Must have `RawExposureData.record` populated and all
            data ID attributes expanded.
        run : `str`, optional
            Name of a RUN-type collection to write to, overriding
            ``self.butler.run``.

        Returns
        -------
        refs : `list` of `lsst.daf.butler.DatasetRef`
            Dataset references for ingested raws.
        """
        datasets = [FileDataset(path=os.path.abspath(file.filename),
                                refs=[DatasetRef(self.datasetType, d.dataId) for d in file.datasets],
                                formatter=file.FormatterClass)
                    for file in exposure.files]
        self.butler.ingest(*datasets, transfer=self.config.transfer, run=run)
        return [ref for dataset in datasets for ref in dataset.refs]

    def run(self, files, *, pool: Optional[Pool] = None, processes: int = 1, run: Optional[str] = None):
        """Ingest files into a Butler data repository.

        This creates any new exposure or visit Dimension entries needed to
        identify the ingested files, creates new Dataset entries in the
        Registry and finally ingests the files themselves into the Datastore.
        Any needed instrument, detector, and physical_filter Dimension entries
        must exist in the Registry before `run` is called.

        Parameters
        ----------
        files : iterable over `str` or path-like objects
            Paths to the files to be ingested. Will be made absolute
            if they are not already.
        pool : `multiprocessing.Pool`, optional
            If not `None`, a process pool with which to parallelize some
            operations.
        processes : `int`, optional
            The number of processes to use. Ignored if ``pool`` is not `None`.
        run : `str`, optional
            Name of a RUN-type collection to write to, overriding
            the default derived from the instrument name.

        Returns
        -------
        refs : `list` of `lsst.daf.butler.DatasetRef`
            Dataset references for ingested raws.

        Notes
        -----
        This method inserts all datasets for an exposure within a transaction,
        guaranteeing that partial exposures are never ingested. The exposure
        dimension record is inserted with `Registry.syncDimensionData` first
        (in its own transaction), which inserts only if a record with the same
        primary key does not already exist. This allows different files within
        the same exposure to be ingested in different runs.
        """
        exposureData, bad_files = self.prep(files, pool=pool, processes=processes)
        # Up to this point, we haven't modified the data repository at all.
        # Now we finally do that, with one transaction per exposure. This is
        # not parallelized at present because the performance of this step is
        # limited by the database server. That may or may not change in the
        # future once we increase our usage of bulk inserts and reduce our
        # usage of savepoints; we've tried to get everything but the database
        # operations done in advance to reduce the time spent inside
        # transactions.
        self.butler.registry.registerDatasetType(self.datasetType)
        refs = []
        runs = set()
        n_exposures = 0
        n_exposures_failed = 0
        n_ingests_failed = 0
        for exposure in exposureData:
            self.log.debug("Attempting to ingest %d file%s from exposure %s:%s",
                           len(exposure.files), "" if len(exposure.files) == 1 else "s",
                           exposure.record.instrument, exposure.record.name)

            try:
                self.butler.registry.syncDimensionData("exposure", exposure.record)
            except Exception as e:
                n_exposures_failed += 1
                self.log.warning("Exposure %s:%s could not be registered: %s",
                                 exposure.record.instrument, exposure.record.name, e)
                continue

            # Use the instrument's default run if none was specified explicitly.
            if run is None:
                instrumentClass = exposure.files[0].instrumentClass
                this_run = instrumentClass.makeDefaultRawIngestRunName()
            else:
                this_run = run
            if this_run not in runs:
                self.butler.registry.registerCollection(this_run, type=CollectionType.RUN)
                runs.add(this_run)
            try:
                with self.butler.transaction():
                    refs.extend(self.ingestExposureDatasets(exposure, run=this_run))
            except Exception as e:
                n_ingests_failed += 1
                self.log.warning("Failed to ingest the following for reason: %s", e)
                for f in exposure.files:
                    self.log.warning("- %s", f.filename)
                continue

            # Success for this exposure.
            n_exposures += 1
            self.log.info("Exposure %s:%s ingested successfully",
                          exposure.record.instrument, exposure.record.name)

        had_failure = False

        if bad_files:
            had_failure = True
            self.log.warning("Could not extract observation metadata from the following:")
            for f in bad_files:
                self.log.warning("- %s", f)

        self.log.info("Successfully processed data from %d exposure%s with %d failure%s from exposure"
                      " registration and %d failure%s from file ingest.",
                      n_exposures, "" if n_exposures == 1 else "s",
                      n_exposures_failed, "" if n_exposures_failed == 1 else "s",
                      n_ingests_failed, "" if n_ingests_failed == 1 else "s")
        if n_exposures_failed > 0 or n_ingests_failed > 0:
            had_failure = True
        self.log.info("Ingested %d distinct Butler dataset%s",
                      len(refs), "" if len(refs) == 1 else "s")

        if had_failure:
            raise RuntimeError("Some failures encountered during ingestion")

        return refs
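

# A minimal end-to-end usage sketch (not part of the original module). The
# repository path and file names are illustrative assumptions; the butler must
# be writeable and the relevant instrument must already be registered in it.
#
#     from lsst.daf.butler import Butler
#
#     butler = Butler("/path/to/repo", writeable=True)
#     config = RawIngestConfig()
#     config.transfer = "copy"
#     task = RawIngestTask(config=config, butler=butler)
#     refs = task.run(["/data/raw_0001.fits", "/data/raw_0002.fits"])
#     print(f"Ingested {len(refs)} datasets")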