Coverage for python/lsst/obs/base/ingest.py : 16%

# This file is part of obs_base.
#
# Developed for the LSST Data Management System.
# This product includes software developed by the LSST Project
# (https://www.lsst.org).
# See the COPYRIGHT file at the top-level directory of this distribution
# for details of code ownership.
#
# This program is free software: you can redistribute it and/or modify
# it under the terms of the GNU General Public License as published by
# the Free Software Foundation, either version 3 of the License, or
# (at your option) any later version.
#
# This program is distributed in the hope that it will be useful,
# but WITHOUT ANY WARRANTY; without even the implied warranty of
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
# GNU General Public License for more details.
#
# You should have received a copy of the GNU General Public License
# along with this program. If not, see <http://www.gnu.org/licenses/>.

__all__ = ("RawIngestTask", "RawIngestConfig", "makeTransferChoiceField")

import os.path
from dataclasses import dataclass, InitVar
from typing import Callable, List, Iterator, Iterable, Tuple, Type, Optional, Any, Union
from collections import defaultdict
from multiprocessing import Pool

from astro_metadata_translator import ObservationInfo, merge_headers
from astro_metadata_translator.indexing import read_sidecar, read_index
from lsst.afw.fits import readMetadata
from lsst.daf.butler import (
    Butler,
    CollectionType,
    DataCoordinate,
    DatasetRef,
    DatasetType,
    DimensionRecord,
    DimensionUniverse,
    FileDataset,
    Formatter,
)
from lsst.pex.config import Config, ChoiceField, Field
from lsst.pipe.base import Task, timeMethod

from ._instrument import Instrument, makeExposureRecordFromObsInfo
from ._fitsRawFormatterBase import FitsRawFormatterBase


def _do_nothing(*args, **kwargs) -> None:
    """Do nothing.

    This is a function that accepts anything and does nothing.
    For use as a default in callback arguments.
    """
    pass


def _log_msg_counter(noun: Union[int, Iterable]) -> Tuple[int, str]:
    """Count the iterable and return the count and plural modifier.

    Parameters
    ----------
    noun : Iterable or `int`
        Thing to count. If given an integer it is assumed to be the count
        to use to calculate the modifier.

    Returns
    -------
    num : `int`
        Number of items found in ``noun``.
    modifier : `str`
        Character to add to the end of a string referring to these items
        to indicate whether it was a single item or not. Returns the empty
        string if there is one item or "s" otherwise.

    Examples
    --------

    .. code-block:: python

        log.warning("Found %d file%s", *_log_msg_counter(nfiles))
    """
    if isinstance(noun, int):
        num = noun
    else:
        num = len(noun)
    return num, "" if num == 1 else "s"


@dataclass
class RawFileDatasetInfo:
    """Information about a single dataset within a raw file."""

    dataId: DataCoordinate
    """Data ID for this file (`lsst.daf.butler.DataCoordinate`)."""

    obsInfo: ObservationInfo
    """Standardized observation metadata extracted directly from the file
    headers (`astro_metadata_translator.ObservationInfo`).
    """


@dataclass
class RawFileData:
    """Information about a single raw file, used during ingest."""

    datasets: List[RawFileDatasetInfo]
    """The information describing each dataset within this raw file
    (`list` of `RawFileDatasetInfo`).
    """

    filename: str
    """Name of the file this information was extracted from (`str`).

    This is the path prior to ingest, not the path after ingest.
    """

    FormatterClass: Type[FitsRawFormatterBase]
    """Formatter class that should be used to ingest this file (`type`; a
    subclass of `FitsRawFormatterBase`).
    """

    instrumentClass: Optional[Type[Instrument]]
    """The `Instrument` class associated with this file. Can be `None`
    if ``datasets`` is an empty list."""


@dataclass
class RawExposureData:
    """Information about a complete raw exposure, used during ingest."""

    dataId: DataCoordinate
    """Data ID for this exposure (`lsst.daf.butler.DataCoordinate`).
    """

    files: List[RawFileData]
    """List of structures containing file-level information.
    """

    universe: InitVar[DimensionUniverse]
    """Set of all known dimensions.
    """

    record: Optional[DimensionRecord] = None
    """The exposure `DimensionRecord` that must be inserted into the
    `~lsst.daf.butler.Registry` prior to file-level ingest (`DimensionRecord`).
    """

    def __post_init__(self, universe: DimensionUniverse):
        # We don't care which file or dataset we read metadata from, because
        # we're assuming they'll all be the same; just use the first ones.
        self.record = makeExposureRecordFromObsInfo(self.files[0].datasets[0].obsInfo, universe)


def makeTransferChoiceField(doc="How to transfer files (None for no transfer).", default="auto"):
    """Create a Config field with options for transferring data between repos.

    The allowed options for the field are exactly those supported by
    `lsst.daf.butler.Datastore.ingest`.

    Parameters
    ----------
    doc : `str`
        Documentation for the configuration field.
    default : `str`, optional
        Default transfer mode for the created field.

    Returns
    -------
    field : `lsst.pex.config.ChoiceField`
        Configuration field.
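
    Examples
    --------
    A minimal sketch of how this field is typically used inside a `Config`
    class (the class name here is only illustrative):

    .. code-block:: python

        class ExampleIngestConfig(Config):
            transfer = makeTransferChoiceField(default="symlink")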
    """
    return ChoiceField(
        doc=doc,
        dtype=str,
        allowed={"move": "move",
                 "copy": "copy",
                 "auto": "choice will depend on datastore",
                 "direct": "use URI to ingested file directly in datastore",
                 "link": "hard link falling back to symbolic link",
                 "hardlink": "hard link",
                 "symlink": "symbolic (soft) link",
                 "relsymlink": "relative symbolic link",
                 },
        optional=True,
        default=default
    )


class RawIngestConfig(Config):
    """Configuration class for RawIngestTask."""

    transfer = makeTransferChoiceField()
    failFast = Field(
        dtype=bool,
        default=False,
        doc="If True, stop ingest as soon as any problem is encountered with any file. "
            "Otherwise problem files will be skipped and logged and a report issued at completion.",
    )


class RawIngestTask(Task):
    """Driver Task for ingesting raw data into Gen3 Butler repositories.

    Parameters
    ----------
    config : `RawIngestConfig`
        Configuration for the task.
    butler : `~lsst.daf.butler.Butler`
        Writeable butler instance, with ``butler.run`` set to the appropriate
        `~lsst.daf.butler.CollectionType.RUN` collection for these raw
        datasets.
    on_success : `Callable`, optional
        A callback invoked when all of the raws associated with an exposure
        are ingested. Will be passed a list of `FileDataset` objects, each
        containing one or more resolved `DatasetRef` objects. If this callback
        raises it will interrupt the entire ingest process, even if
        `RawIngestConfig.failFast` is `False`.
    on_metadata_failure : `Callable`, optional
        A callback invoked when a failure occurs trying to translate the
        metadata for a file. Will be passed the filename and the exception, in
        that order, as positional arguments. Guaranteed to be called in an
        ``except`` block, allowing the callback to re-raise or replace (with
        ``raise ... from``) to override the task's usual error handling (before
        `RawIngestConfig.failFast` logic occurs).
    on_ingest_failure : `Callable`, optional
        A callback invoked when dimension record or dataset insertion into the
        database fails for an exposure. Will be passed a `RawExposureData`
        instance and the exception, in that order, as positional arguments.
        Guaranteed to be called in an ``except`` block, allowing the callback
        to re-raise or replace (with ``raise ... from``) to override the task's
        usual error handling (before `RawIngestConfig.failFast` logic occurs).
    **kwargs
        Additional keyword arguments are forwarded to the `lsst.pipe.base.Task`
        constructor.

    Notes
    -----
    Each instance of `RawIngestTask` writes to the same Butler. Each
    invocation of `RawIngestTask.run` ingests a list of files.
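
    Examples
    --------
    A minimal sketch of typical use; the repository path and filenames are
    placeholders, and the repository is assumed to already exist and be
    writeable:

    .. code-block:: python

        from lsst.daf.butler import Butler
        from lsst.obs.base import RawIngestConfig, RawIngestTask

        butler = Butler("/path/to/repo", writeable=True)
        config = RawIngestConfig()
        config.transfer = "symlink"
        task = RawIngestTask(config=config, butler=butler)
        refs = task.run(["raw_0001.fits", "raw_0002.fits"])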
    """

    ConfigClass = RawIngestConfig

    _DefaultName = "ingest"

    def getDatasetType(self):
        """Return the DatasetType of the datasets ingested by this Task."""
        return DatasetType("raw", ("instrument", "detector", "exposure"), "Exposure",
                           universe=self.butler.registry.dimensions)

    def __init__(self, config: Optional[RawIngestConfig] = None, *, butler: Butler,
                 on_success: Callable[[List[FileDataset]], Any] = _do_nothing,
                 on_metadata_failure: Callable[[str, Exception], Any] = _do_nothing,
                 on_ingest_failure: Callable[[RawExposureData, Exception], Any] = _do_nothing,
                 **kwargs: Any):
        config.validate()  # Not a CmdlineTask nor PipelineTask, so have to validate the config here.
        super().__init__(config, **kwargs)
        self.butler = butler
        self.universe = self.butler.registry.dimensions
        self.datasetType = self.getDatasetType()
        self._on_success = on_success
        self._on_metadata_failure = on_metadata_failure
        self._on_ingest_failure = on_ingest_failure

        # Import all the instrument classes so that we ensure that we
        # have all the relevant metadata translators loaded.
        Instrument.importAll(self.butler.registry)

    def _reduce_kwargs(self):
        # Add extra parameters to pickle.
        return dict(**super()._reduce_kwargs(), butler=self.butler, on_success=self._on_success,
                    on_metadata_failure=self._on_metadata_failure, on_ingest_failure=self._on_ingest_failure)

    def _determine_instrument_formatter(self, dataId, filename):
        """Determine the instrument and formatter class.

        Parameters
        ----------
        dataId : `lsst.daf.butler.DataCoordinate`
            The dataId associated with this dataset.
        filename : `str`
            Filename used for error reporting.

        Returns
        -------
        instrument : `Instrument` or `None`
            Instance of the `Instrument` associated with this dataset. `None`
            indicates that the instrument could not be determined.
        formatterClass : `type`
            Class to be used as the formatter for this dataset.
        """
        # The data model currently assumes that whilst multiple datasets
        # can be associated with a single file, they must all share the
        # same formatter.
        try:
            instrument = Instrument.fromName(dataId["instrument"], self.butler.registry)
        except LookupError as e:
            self._on_metadata_failure(filename, e)
            self.log.warning("Instrument %s for file %s not known to registry",
                             dataId["instrument"], filename)
            if self.config.failFast:
                raise RuntimeError(f"Instrument {dataId['instrument']} for"
                                   f" file {filename} not known to registry") from e
            FormatterClass = Formatter
            # Indicate that we could not work out the instrument.
            instrument = None
        else:
            FormatterClass = instrument.getRawFormatter(dataId)
        return instrument, FormatterClass

    def extractMetadata(self, filename: str) -> RawFileData:
        """Extract and process metadata from a single raw file.

        Parameters
        ----------
        filename : `str`
            Path to the file.

        Returns
        -------
        data : `RawFileData`
            A structure containing the metadata extracted from the file,
            as well as the original filename. All fields will be populated,
            but the `RawFileData.dataId` attribute will be a minimal
            (unexpanded) `~lsst.daf.butler.DataCoordinate` instance. The
            ``instrumentClass`` field will be `None` if there is a problem
            with metadata extraction.

        Notes
        -----
        Assumes that there is a single dataset associated with the given
        file. Instruments using a single file to store multiple datasets
        must implement their own version of this method.

        By default the method will catch all exceptions unless the ``failFast``
        configuration item is `True`. If an error is encountered the
        `_on_metadata_failure()` method will be called. If no exception is
        raised but an error was encountered, the returned object will have
        a null instrument class and no datasets.

        This method supports sidecar JSON files, which can be used to
        extract metadata without having to read the data file itself.
        The sidecar file is always used if found.
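
        Examples
        --------
        A sketch of the sidecar convention, with ``task`` an existing
        `RawIngestTask` and the filename a placeholder:

        .. code-block:: python

            # If "raw_0001.json" exists next to the file, the metadata is
            # read from that sidecar instead of the FITS headers.
            data = task.extractMetadata("raw_0001.fits")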
        """
        sidecar_fail_msg = ""  # Requires prepended space when set.
        try:
            root, ext = os.path.splitext(filename)
            sidecar_file = root + ".json"
            if os.path.exists(sidecar_file):
                header = read_sidecar(sidecar_file)
                sidecar_fail_msg = " (via sidecar)"
            else:
                # Read the metadata from the data file itself.
                # Manually merge the primary and "first data" headers here
                # because we do not know in general if an input file has
                # set INHERIT=T.
                phdu = readMetadata(filename, 0)
                header = merge_headers([phdu, readMetadata(filename)], mode="overwrite")
            datasets = [self._calculate_dataset_info(header, filename)]
        except Exception as e:
            self.log.debug("Problem extracting metadata from %s%s: %s", filename, sidecar_fail_msg, e)
            # Indicate to the caller that we failed to read.
            datasets = []
            formatterClass = Formatter
            instrument = None
            self._on_metadata_failure(filename, e)
            if self.config.failFast:
                raise RuntimeError("Problem extracting metadata for file "
                                   f"{filename}{sidecar_fail_msg}") from e
        else:
            self.log.debug("Extracted metadata for file %s%s", filename, sidecar_fail_msg)
            # The data model currently assumes that whilst multiple datasets
            # can be associated with a single file, they must all share the
            # same formatter.
            instrument, formatterClass = self._determine_instrument_formatter(datasets[0].dataId, filename)
            if instrument is None:
                datasets = []

        return RawFileData(datasets=datasets, filename=filename,
                           FormatterClass=formatterClass,
                           instrumentClass=instrument)

    def _calculate_dataset_info(self, header, filename):
        """Calculate a RawFileDatasetInfo from the supplied information.

        Parameters
        ----------
        header : Mapping or `astro_metadata_translator.ObservationInfo`
            Header from the dataset or previously-translated content.
        filename : `str`
            Filename to use for error messages.

        Returns
        -------
        dataset : `RawFileDatasetInfo`
            The dataId and observation information associated with this
            dataset.
        """
        # To ensure we aren't slowed down for no reason, explicitly
        # list here the properties we need for the schema.
        # Use a dict whose values are booleans, where True indicates
        # that it is required that we calculate this property.
        ingest_subset = {
            "altaz_begin": False,
            "boresight_rotation_coord": False,
            "boresight_rotation_angle": False,
            "dark_time": False,
            "datetime_begin": True,
            "datetime_end": True,
            "detector_num": True,
            "exposure_group": False,
            "exposure_id": True,
            "exposure_time": True,
            "instrument": True,
            "tracking_radec": False,
            "object": False,
            "observation_counter": False,
            "observation_id": True,
            "observation_reason": False,
            "observation_type": True,
            "observing_day": False,
            "physical_filter": True,
            "science_program": False,
            "visit_id": False,
        }

        if isinstance(header, ObservationInfo):
            obsInfo = header
            missing = []
            # Need to check the required properties are present.
            for property, required in ingest_subset.items():
                if not required:
                    continue
                # getattr does not need to be protected because it is using
                # the defined list above containing properties that must exist.
                value = getattr(obsInfo, property)
                if value is None:
                    missing.append(property)
            if missing:
                raise ValueError(f"Requested required properties are missing from file {filename}:"
                                 f" {missing} (via JSON)")
        else:
            obsInfo = ObservationInfo(header, pedantic=False, filename=filename,
                                      required={k for k in ingest_subset if ingest_subset[k]},
                                      subset=set(ingest_subset))

        dataId = DataCoordinate.standardize(instrument=obsInfo.instrument,
                                            exposure=obsInfo.exposure_id,
                                            detector=obsInfo.detector_num,
                                            universe=self.universe)
        return RawFileDatasetInfo(obsInfo=obsInfo, dataId=dataId)

    def locateAndReadIndexFiles(self, files):
        """Given a list of files, look for index files and read them.

        Index files can either be explicitly in the list of files to
        ingest, or else located in the same directory as a file to ingest.
        Index entries are always used if present.

        Parameters
        ----------
        files : iterable over `str` or path-like objects
            Paths to the files to be ingested. Will be made absolute
            if they are not already.

        Returns
        -------
        index : `dict` [`str`, Any]
            Merged contents of all relevant index files found. These can
            be explicitly specified index files or ones found in the
            directory alongside a data file to be ingested.
        updated_files : iterable of `str`
            Updated list of the input files with entries removed that were
            found listed in an index file. Order is not guaranteed to
            match the order of the files given to this routine.
        good_index_files : `set` [`str`]
            Index files that were successfully read and used.
        bad_index_files : `set` [`str`]
            Files that looked like index files but failed to read properly.
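
        Notes
        -----
        Index files must be named ``_index.json`` and live in the same
        directory as the data files they describe. A sketch with placeholder
        paths and ``task`` an existing `RawIngestTask`:

        .. code-block:: python

            # With night1/_index.json on disk alongside the raw files, both
            # files below pick up their metadata from the index file.
            index, remaining, good, bad = task.locateAndReadIndexFiles(
                ["night1/raw_0001.fits", "night1/raw_0002.fits"]
            )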
        """
        # Convert the paths to absolute for easy comparison with index content.
        # Do not convert to real paths since we have to assume that index
        # files are in this location and not the location which it links to.
        files = tuple(os.path.abspath(f) for f in files)

        # Index files must be named this.
        index_root_file = "_index.json"

        # Group the files by directory.
        files_by_directory = defaultdict(set)

        for path in files:
            directory, file_in_dir = os.path.split(path)
            files_by_directory[directory].add(file_in_dir)

        # All the metadata read from index files with keys of full path.
        index_entries = {}

        # Index files we failed to read.
        bad_index_files = set()

        # Any good index files that were found and used.
        good_index_files = set()

        # Look for index files in those directories.
        for directory, files_in_directory in files_by_directory.items():
            possible_index_file = os.path.join(directory, index_root_file)
            if os.path.exists(possible_index_file):
                # If we are explicitly requesting an index file the
                # messages should be different.
                index_msg = "inferred"
                is_implied = True
                if index_root_file in files_in_directory:
                    index_msg = "explicit"
                    is_implied = False

                # Try to read the index file and catch and report any
                # problems.
                try:
                    index = read_index(possible_index_file, force_dict=True)
                except Exception as e:
                    # Only trigger the callback if the index file
                    # was asked for explicitly. Triggering on an implied file
                    # might be surprising.
                    if not is_implied:
                        self._on_metadata_failure(possible_index_file, e)
                    if self.config.failFast:
                        raise RuntimeError(f"Problem reading index file from {index_msg} "
                                           f"location {possible_index_file}") from e
                    bad_index_files.add(possible_index_file)
                    continue

                self.log.debug("Extracted index metadata from %s file %s", index_msg, possible_index_file)
                good_index_files.add(possible_index_file)

                # Go through the index adding entries for files.
                # If we have non-index files in this directory marked for
                # ingest we should only get index information for those.
                # If the index file was explicit we use all entries.
                if is_implied:
                    files_to_ingest = files_in_directory
                else:
                    files_to_ingest = set(index)

                # Copy relevant metadata into a single dict for all index
                # entries.
                for file_in_dir in files_to_ingest:
                    # Skip an explicitly specified index file.
                    # This should never happen because an explicit index
                    # file will force ingest of all files in the index
                    # and not use the explicit file list. If somehow
                    # this is not true we continue. Raising an exception
                    # seems like the wrong thing to do since this is harmless.
                    if file_in_dir == index_root_file:
                        self.log.info("Logic error found scanning directory %s. Please file ticket.",
                                      directory)
                        continue
                    if file_in_dir in index:
                        file = os.path.abspath(os.path.join(directory, file_in_dir))
                        if file in index_entries:
                            # ObservationInfo overrides raw metadata.
                            if isinstance(index[file_in_dir], ObservationInfo) \
                                    and not isinstance(index_entries[file], ObservationInfo):
                                self.log.warning("File %s already specified in an index file but overriding"
                                                 " with ObservationInfo content from %s",
                                                 file, possible_index_file)
                            else:
                                self.log.warning("File %s already specified in an index file, "
                                                 "ignoring content from %s", file, possible_index_file)
                                # Do nothing in this case.
                                continue

                        index_entries[file] = index[file_in_dir]

        # Remove files from the list that have index entries, and also
        # any files that we determined to be explicit index files
        # or any index files that we failed to read.
        filtered = set(files) - set(index_entries) - good_index_files - bad_index_files

        # The filtered list loses the initial order. Retaining the order
        # is good for testing but does have a cost if there are many
        # files when copying the good values out. A dict would have faster
        # lookups (using the files as keys) but use more memory.
        ordered = [f for f in filtered if f in files]

        return index_entries, ordered, good_index_files, bad_index_files

    def processIndexEntries(self, index_entries):
        """Convert index entries to RawFileData.

        Parameters
        ----------
        index_entries : `dict` [`str`, Any]
            Dict indexed by the name of the file to ingest, with values of
            either raw metadata or translated
            `~astro_metadata_translator.ObservationInfo`.

        Returns
        -------
        data : `list` of `RawFileData`
            Structures containing the metadata extracted from each file,
            as well as the original filenames. All fields will be populated,
            but the `RawFileData.dataId` attributes will be minimal
            (unexpanded) `~lsst.daf.butler.DataCoordinate` instances.
        """
        fileData = []
        for filename, metadata in index_entries.items():
            try:
                datasets = [self._calculate_dataset_info(metadata, filename)]
            except Exception as e:
                self.log.debug("Problem extracting metadata for file %s found in index file: %s",
                               filename, e)
                datasets = []
                formatterClass = Formatter
                instrument = None
                self._on_metadata_failure(filename, e)
                if self.config.failFast:
                    raise RuntimeError(f"Problem extracting metadata for file {filename} "
                                       "found in index file") from e
            else:
                instrument, formatterClass = self._determine_instrument_formatter(datasets[0].dataId,
                                                                                  filename)
                if instrument is None:
                    datasets = []
            fileData.append(RawFileData(datasets=datasets, filename=filename,
                                        FormatterClass=formatterClass, instrumentClass=instrument))
        return fileData

    def groupByExposure(self, files: Iterable[RawFileData]) -> List[RawExposureData]:
        """Group an iterable of `RawFileData` by exposure.

        Parameters
        ----------
        files : iterable of `RawFileData`
            File-level information to group.

        Returns
        -------
        exposures : `list` of `RawExposureData`
            A list of structures that group the file-level information by
            exposure. All fields will be populated. The
            `RawExposureData.dataId` attributes will be minimal (unexpanded)
            `~lsst.daf.butler.DataCoordinate` instances.
        """
        exposureDimensions = self.universe["exposure"].graph
        byExposure = defaultdict(list)
        for f in files:
            # Assume that the first dataset is representative for the file.
            byExposure[f.datasets[0].dataId.subset(exposureDimensions)].append(f)

        return [RawExposureData(dataId=dataId, files=exposureFiles, universe=self.universe)
                for dataId, exposureFiles in byExposure.items()]

    def expandDataIds(self, data: RawExposureData) -> RawExposureData:
        """Expand the data IDs associated with a raw exposure.

        This adds the metadata records.

        Parameters
        ----------
        data : `RawExposureData`
            A structure containing information about the exposure to be
            ingested. Must have `RawExposureData.record` populated. Should
            be considered consumed upon return.

        Returns
        -------
        exposure : `RawExposureData`
            An updated version of the input structure, with
            `RawExposureData.dataId` and nested `RawFileData.dataId` attributes
            updated to data IDs for which
            `~lsst.daf.butler.DataCoordinate.hasRecords` returns `True`.
        """
        # We start by expanding the exposure-level data ID; we won't use that
        # directly in file ingest, but this lets us do some database lookups
        # once per exposure instead of once per file later.
        data.dataId = self.butler.registry.expandDataId(
            data.dataId,
            # We pass in the records we'll be inserting shortly so they aren't
            # looked up from the database. We do expect instrument and filter
            # records to be retrieved from the database here (though the
            # Registry may cache them so there isn't a lookup every time).
            records={
                self.butler.registry.dimensions["exposure"]: data.record,
            }
        )
        # Now we expand the per-file (exposure+detector) data IDs. This time
        # we pass in the records we just retrieved from the exposure data ID
        # expansion.
        for file in data.files:
            for dataset in file.datasets:
                dataset.dataId = self.butler.registry.expandDataId(
                    dataset.dataId,
                    records=dict(data.dataId.records)
                )
        return data

    def prep(self, files, *, pool: Optional[Pool] = None, processes: int = 1
             ) -> Tuple[Iterator[RawExposureData], List[str]]:
        """Perform all non-database-updating ingest preprocessing steps.

        Parameters
        ----------
        files : iterable over `str` or path-like objects
            Paths to the files to be ingested. Will be made absolute
            if they are not already.
        pool : `multiprocessing.Pool`, optional
            If not `None`, a process pool with which to parallelize some
            operations.
        processes : `int`, optional
            The number of processes to use. Ignored if ``pool`` is not `None`.

        Returns
        -------
        exposures : `Iterator` [ `RawExposureData` ]
            Data structures containing dimension records, filenames, and data
            IDs to be ingested (one structure for each exposure).
        bad_files : `list` of `str`
            List of all the files that could not have metadata extracted.
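
        Examples
        --------
        A sketch of direct use, with ``task`` an existing `RawIngestTask` and
        a placeholder file list; `run` calls this method internally:

        .. code-block:: python

            exposures, bad_files = task.prep(["raw_0001.fits"], processes=4)
            for exposure in exposures:
                print(exposure.dataId, len(exposure.files))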
        """
        if pool is None and processes > 1:
            pool = Pool(processes)
        mapFunc = map if pool is None else pool.imap_unordered

        def _partition_good_bad(file_data: Iterable[RawFileData]) -> Tuple[List[RawFileData], List[str]]:
            """Filter out bad files and return good with list of bad."""
            good_files = []
            bad_files = []
            for fileDatum in file_data:
                if not fileDatum.datasets:
                    bad_files.append(fileDatum.filename)
                else:
                    good_files.append(fileDatum)
            return good_files, bad_files

        # Look for index files and read them.
        # There should be far fewer index files than data files.
        index_entries, files, good_index_files, bad_index_files = self.locateAndReadIndexFiles(files)
        if bad_index_files:
            self.log.info("Failed to read the following explicitly requested index files:")
            for bad in sorted(bad_index_files):
                self.log.info("- %s", bad)

        # Now convert all the index file entries to standard form for ingest.
        bad_index_file_data = []
        indexFileData = self.processIndexEntries(index_entries)
        if indexFileData:
            indexFileData, bad_index_file_data = _partition_good_bad(indexFileData)
            self.log.info("Successfully extracted metadata for %d file%s found in %d index file%s"
                          " with %d failure%s",
                          *_log_msg_counter(indexFileData),
                          *_log_msg_counter(good_index_files),
                          *_log_msg_counter(bad_index_file_data))

        # Extract metadata and build per-detector regions.
        # This could run in a subprocess so collect all output
        # before looking at failures.
        fileData: Iterator[RawFileData] = mapFunc(self.extractMetadata, files)

        # Filter out all the failed reads and store them for later
        # reporting.
        fileData, bad_files = _partition_good_bad(fileData)
        self.log.info("Successfully extracted metadata from %d file%s with %d failure%s",
                      *_log_msg_counter(fileData),
                      *_log_msg_counter(bad_files))

        # Combine with data from index files.
        fileData.extend(indexFileData)
        bad_files.extend(bad_index_file_data)
        bad_files.extend(bad_index_files)

        # Use that metadata to group files (and extracted metadata) by
        # exposure. Never parallelized because it's intrinsically a gather
        # step.
        exposureData: List[RawExposureData] = self.groupByExposure(fileData)

        # The next operation operates on RawExposureData instances (one at
        # a time) in-place and then returns the modified instance. We call it
        # as a pass-through instead of relying on the arguments we pass in to
        # have been modified because in the parallel case those arguments are
        # going to be pickled and unpickled, and I'm not certain
        # multiprocessing is careful enough with that for output arguments to
        # work.

        # Expand the data IDs to include all dimension metadata; we need this
        # because we may need to generate path templates that rely on that
        # metadata.
        # This is the first step that involves actual database calls (but just
        # SELECTs), so if there's going to be a problem with connections vs.
        # multiple processes, or lock contention (in SQLite) slowing things
        # down, it'll happen here.
        return mapFunc(self.expandDataIds, exposureData), bad_files

    def ingestExposureDatasets(self, exposure: RawExposureData, *, run: Optional[str] = None
                               ) -> List[FileDataset]:
        """Ingest all raw files in one exposure.

        Parameters
        ----------
        exposure : `RawExposureData`
            A structure containing information about the exposure to be
            ingested. Must have `RawExposureData.record` populated and all
            data ID attributes expanded.
        run : `str`, optional
            Name of a RUN-type collection to write to, overriding
            ``self.butler.run``.

        Returns
        -------
        datasets : `list` of `lsst.daf.butler.FileDataset`
            Per-file structures identifying the files ingested and their
            dataset representation in the data repository.
        """
        datasets = [FileDataset(path=os.path.abspath(file.filename),
                                refs=[DatasetRef(self.datasetType, d.dataId) for d in file.datasets],
                                formatter=file.FormatterClass)
                    for file in exposure.files]
        self.butler.ingest(*datasets, transfer=self.config.transfer, run=run)
        return datasets

    @timeMethod
    def run(self, files, *, pool: Optional[Pool] = None, processes: int = 1, run: Optional[str] = None):
        """Ingest files into a Butler data repository.

        This creates any new exposure or visit Dimension entries needed to
        identify the ingested files, creates new Dataset entries in the
        Registry and finally ingests the files themselves into the Datastore.
        Any needed instrument, detector, and physical_filter Dimension entries
        must exist in the Registry before `run` is called.

        Parameters
        ----------
        files : iterable over `str` or path-like objects
            Paths to the files to be ingested. Will be made absolute
            if they are not already.
        pool : `multiprocessing.Pool`, optional
            If not `None`, a process pool with which to parallelize some
            operations.
        processes : `int`, optional
            The number of processes to use. Ignored if ``pool`` is not `None`.
        run : `str`, optional
            Name of a RUN-type collection to write to, overriding
            the default derived from the instrument name.

        Returns
        -------
        refs : `list` of `lsst.daf.butler.DatasetRef`
            Dataset references for ingested raws.

        Notes
        -----
        This method inserts all datasets for an exposure within a transaction,
        guaranteeing that partial exposures are never ingested. The exposure
        dimension record is inserted with `Registry.syncDimensionData` first
        (in its own transaction), which inserts only if a record with the same
        primary key does not already exist. This allows different files within
        the same exposure to be ingested in different runs.
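
        Examples
        --------
        A sketch of a parallelized ingest, with ``task`` an existing
        `RawIngestTask` and placeholder filenames and pool size:

        .. code-block:: python

            from multiprocessing import Pool

            with Pool(4) as pool:
                refs = task.run(["raw_0001.fits", "raw_0002.fits"], pool=pool)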
        """
        exposureData, bad_files = self.prep(files, pool=pool, processes=processes)
        # Up to this point, we haven't modified the data repository at all.
        # Now we finally do that, with one transaction per exposure. This is
        # not parallelized at present because the performance of this step is
        # limited by the database server. That may or may not change in the
        # future once we increase our usage of bulk inserts and reduce our
        # usage of savepoints; we've tried to get everything but the database
        # operations done in advance to reduce the time spent inside
        # transactions.
        self.butler.registry.registerDatasetType(self.datasetType)
        refs = []
        runs = set()
        n_exposures = 0
        n_exposures_failed = 0
        n_ingests_failed = 0
        for exposure in exposureData:

            self.log.debug("Attempting to ingest %d file%s from exposure %s:%s",
                           *_log_msg_counter(exposure.files),
                           exposure.record.instrument, exposure.record.obs_id)

            try:
                self.butler.registry.syncDimensionData("exposure", exposure.record)
            except Exception as e:
                self._on_ingest_failure(exposure, e)
                n_exposures_failed += 1
                self.log.warning("Exposure %s:%s could not be registered: %s",
                                 exposure.record.instrument, exposure.record.obs_id, e)
                if self.config.failFast:
                    raise e
                continue

            # Use the default run derived from the instrument if no run was
            # specified explicitly.
            if run is None:
                instrumentClass = exposure.files[0].instrumentClass
                this_run = instrumentClass.makeDefaultRawIngestRunName()
            else:
                this_run = run
            if this_run not in runs:
                self.butler.registry.registerCollection(this_run, type=CollectionType.RUN)
                runs.add(this_run)
            try:
                with self.butler.transaction():
                    datasets_for_exposure = self.ingestExposureDatasets(exposure, run=this_run)
            except Exception as e:
                self._on_ingest_failure(exposure, e)
                n_ingests_failed += 1
                self.log.warning("Failed to ingest the following for reason: %s", e)
                for f in exposure.files:
                    self.log.warning("- %s", f.filename)
                if self.config.failFast:
                    raise e
                continue
            else:
                self._on_success(datasets_for_exposure)
                for dataset in datasets_for_exposure:
                    refs.extend(dataset.refs)

            # Success for this exposure.
            n_exposures += 1
            self.log.info("Exposure %s:%s ingested successfully",
                          exposure.record.instrument, exposure.record.obs_id)

        had_failure = False

        if bad_files:
            had_failure = True
            self.log.warning("Could not extract observation metadata from the following:")
            for f in bad_files:
                self.log.warning("- %s", f)

        self.log.info("Successfully processed data from %d exposure%s with %d failure%s from exposure"
                      " registration and %d failure%s from file ingest.",
                      *_log_msg_counter(n_exposures),
                      *_log_msg_counter(n_exposures_failed),
                      *_log_msg_counter(n_ingests_failed))
        if n_exposures_failed > 0 or n_ingests_failed > 0:
            had_failure = True
        self.log.info("Ingested %d distinct Butler dataset%s", *_log_msg_counter(refs))

        if had_failure:
            raise RuntimeError("Some failures encountered during ingestion")

        return refs