Coverage for python/lsst/obs/base/ingest.py: 16%
373 statements
coverage.py v7.5.0, created at 2024-05-03 02:54 -0700
1# This file is part of obs_base.
2#
3# Developed for the LSST Data Management System.
4# This product includes software developed by the LSST Project
5# (https://www.lsst.org).
6# See the COPYRIGHT file at the top-level directory of this distribution
7# for details of code ownership.
8#
9# This program is free software: you can redistribute it and/or modify
10# it under the terms of the GNU General Public License as published by
11# the Free Software Foundation, either version 3 of the License, or
12# (at your option) any later version.
13#
14# This program is distributed in the hope that it will be useful,
15# but WITHOUT ANY WARRANTY; without even the implied warranty of
16# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
17# GNU General Public License for more details.
18#
19# You should have received a copy of the GNU General Public License
20# along with this program. If not, see <http://www.gnu.org/licenses/>.
23__all__ = ("RawIngestTask", "RawIngestConfig", "makeTransferChoiceField")
25import json
26import re
27from collections import defaultdict
28from collections.abc import Callable, Iterable, Iterator, MutableMapping, Sized
29from dataclasses import InitVar, dataclass
30from multiprocessing import Pool
31from typing import Any, ClassVar
33from astro_metadata_translator import MetadataTranslator, ObservationInfo, merge_headers
34from astro_metadata_translator.indexing import process_index_data, process_sidecar_data
35from lsst.afw.fits import readMetadata
36from lsst.daf.butler import (
37 Butler,
38 CollectionType,
39 DataCoordinate,
40 DatasetIdGenEnum,
41 DatasetRef,
42 DatasetType,
43 DimensionRecord,
44 DimensionUniverse,
45 FileDataset,
46 Formatter,
47 Progress,
48 Timespan,
49)
50from lsst.pex.config import ChoiceField, Config, Field
51from lsst.pipe.base import Instrument, Task
52from lsst.resources import ResourcePath, ResourcePathExpression
53from lsst.utils.timer import timeMethod
55from ._instrument import makeExposureRecordFromObsInfo
57# multiprocessing.Pool is actually a function, not a type, and the real type
58# isn't exposed, so we can't use it in annotations; we'll just punt on it via
59# this alias instead.
60PoolType = Any
63def _do_nothing(*args: Any, **kwargs: Any) -> None:
64 """Do nothing.
66 This is a function that accepts anything and does nothing.
67 For use as a default in callback arguments.
68 """
69 pass
72def _log_msg_counter(noun: int | Sized) -> tuple[int, str]:
73 """Count the iterable and return the count and plural modifier.
75 Parameters
76 ----------
77 noun : `Sized` or `int`
78 Thing to count. If given an integer it is assumed to be the count
79 to use to calculate modifier.
81 Returns
82 -------
83 num : `int`
84 Number of items found in ``noun``.
85 modifier : `str`
86 Character to add to the end of a string referring to these items
87 to indicate whether it was a single item or not. Returns empty
88 string if there is one item or "s" otherwise.
90 Examples
91 --------
92 .. code-block:: python
94 log.warning("Found %d file%s", *_log_msg_counter(nfiles))
95 """
96 if isinstance(noun, int):
97 num = noun
98 else:
99 num = len(noun)
100 return num, "" if num == 1 else "s"
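# Illustrative sketch (not part of the original module): expected behaviour of
# the helper above for a sized collection and for a bare count.
#
#     >>> _log_msg_counter(["a.fits", "b.fits"])
#     (2, 's')
#     >>> _log_msg_counter(1)
#     (1, '')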
103@dataclass
104class RawFileDatasetInfo:
105 """Information about a single dataset within a raw file."""
107 dataId: DataCoordinate
108 """Data ID for this file (`lsst.daf.butler.DataCoordinate`)."""
110 obsInfo: ObservationInfo
111 """Standardized observation metadata extracted directly from the file
112 headers (`astro_metadata_translator.ObservationInfo`).
113 """
116@dataclass
117class RawFileData:
118 """Information about a single raw file, used during ingest."""
120 datasets: list[RawFileDatasetInfo]
121 """The information describing each dataset within this raw file.
122 (`list` of `RawFileDatasetInfo`)
123 """
125 filename: ResourcePath
126 """URI of the file this information was extracted from (`str`).
128 This is the path prior to ingest, not the path after ingest.
129 """
131 FormatterClass: type[Formatter]
132 """Formatter class that should be used to ingest this file (`type`; as
133 subclass of `~lsst.daf.butler.Formatter`).
134 """
136 instrument: Instrument | None
137 """The `Instrument` instance associated with this file. Can be `None`
138 if ``datasets`` is an empty list."""
141@dataclass
142class RawExposureData:
143 """Information about a complete raw exposure, used during ingest."""
145 dataId: DataCoordinate
146 """Data ID for this exposure (`lsst.daf.butler.DataCoordinate`).
147 """
149 files: list[RawFileData]
150 """List of structures containing file-level information.
151 """
153 universe: InitVar[DimensionUniverse]
154 """Set of all known dimensions.
155 """
157 record: DimensionRecord
158 """The exposure `DimensionRecord` that must be inserted into the
159 `~lsst.daf.butler.Registry` prior to file-level ingest
160 (`~lsst.daf.butler.DimensionRecord`).
161 """
163 dependencyRecords: dict[str, DimensionRecord]
164 """Additional records that must be inserted into the
165 `~lsst.daf.butler.Registry` prior to ingesting the exposure ``record``
166 (e.g., to satisfy foreign key constraints), indexed by the dimension name.
167 """
170def makeTransferChoiceField(
171 doc: str = "How to transfer files (None for no transfer).", default: str = "auto"
172) -> ChoiceField:
173 """Create a Config field with options for transferring data between repos.
175 The allowed options for the field are exactly those supported by
176 `lsst.daf.butler.Datastore.ingest`.
178 Parameters
179 ----------
180 doc : `str`
181 Documentation for the configuration field.
182 default : `str`, optional
183 Default transfer mode for the field.
185 Returns
186 -------
187 field : `lsst.pex.config.ChoiceField`
188 Configuration field.
189 """
190 return ChoiceField(
191 doc=doc,
192 dtype=str,
193 allowed={
194 "move": "move",
195 "copy": "copy",
196 "auto": "choice will depend on datastore",
197 "direct": "use URI to ingested file directly in datastore",
198 "link": "hard link falling back to symbolic link",
199 "hardlink": "hard link",
200 "symlink": "symbolic (soft) link",
201 "relsymlink": "relative symbolic link",
202 },
203 optional=True,
204 default=default,
205 )
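# Illustrative sketch (not part of the original module): a downstream config
# class can reuse makeTransferChoiceField with a different default; the class
# name ExampleIngestConfig is hypothetical.
class ExampleIngestConfig(Config):
    """Hypothetical config demonstrating reuse of the transfer choice field."""

    transfer = makeTransferChoiceField(default="direct")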
208class RawIngestConfig(Config):
209 """Configuration class for RawIngestTask."""
211 transfer = makeTransferChoiceField()
212 failFast: Field[bool] = Field(
213 dtype=bool,
214 default=False,
215 doc="If True, stop ingest as soon as any problem is encountered with any file. "
216 "Otherwise problem files will be skipped and logged and a report issued at completion.",
217 )
220class RawIngestTask(Task):
221 """Driver Task for ingesting raw data into Gen3 Butler repositories.
223 Parameters
224 ----------
225 config : `RawIngestConfig`
226 Configuration for the task.
227 butler : `~lsst.daf.butler.Butler`
228 Writeable butler instance, with ``butler.run`` set to the appropriate
229 `~lsst.daf.butler.CollectionType.RUN` collection for these raw
230 datasets.
231 on_success : `Callable`, optional
232 A callback invoked when all of the raws associated with an exposure
233 are ingested. Will be passed a list of `FileDataset` objects, each
234 containing one or more resolved `DatasetRef` objects. If this callback
235 raises it will interrupt the entire ingest process, even if
236 `RawIngestConfig.failFast` is `False`.
237 on_metadata_failure : `Callable`, optional
238 A callback invoked when a failure occurs trying to translate the
239 metadata for a file. Will be passed the URI and the exception, in
240 that order, as positional arguments. Guaranteed to be called in an
241 ``except`` block, allowing the callback to re-raise or replace (with
242 ``raise ... from``) to override the task's usual error handling (before
243 `RawIngestConfig.failFast` logic occurs).
244 on_ingest_failure : `Callable`, optional
245 A callback invoked when dimension record or dataset insertion into the
246 database fails for an exposure. Will be passed a `RawExposureData`
247 instance and the exception, in that order, as positional arguments.
248 Guaranteed to be called in an ``except`` block, allowing the callback
249 to re-raise or replace (with ``raise ... from``) to override the task's
250 usual error handling (before `RawIngestConfig.failFast` logic occurs).
251 **kwargs
252 Additional keyword arguments are forwarded to the `lsst.pipe.base.Task`
253 constructor.
255 Notes
256 -----
257 Each instance of `RawIngestTask` writes to the same Butler. Each
258 invocation of `RawIngestTask.run` ingests a list of files.
259 """
261 ConfigClass: ClassVar[type[Config]] = RawIngestConfig
263 _DefaultName: ClassVar[str] = "ingest"
265 def getDatasetType(self) -> DatasetType:
266 """Return the default DatasetType of the datasets ingested by this
267 Task.
269 Returns
270 -------
271 datasetType : `DatasetType`
272 The default dataset type to use for the data being ingested. This
273 is only used if the relevant `~lsst.pipe.base.Instrument` does not
274 define an override.
275 """
276 return DatasetType(
277 "raw",
278 ("instrument", "detector", "exposure"),
279 "Exposure",
280 universe=self.butler.dimensions,
281 )
283 # Mypy can not determine that the config passed to super() is this type.
284 config: RawIngestConfig
286 def __init__(
287 self,
288 config: RawIngestConfig,
289 *,
290 butler: Butler,
291 on_success: Callable[[list[FileDataset]], Any] = _do_nothing,
292 on_metadata_failure: Callable[[ResourcePath, Exception], Any] = _do_nothing,
293 on_ingest_failure: Callable[[RawExposureData, Exception], Any] = _do_nothing,
294 **kwargs: Any,
295 ):
296 config.validate() # Not a CmdlineTask nor PipelineTask, so have to validate the config here.
297 super().__init__(config, **kwargs)
298 self.butler = butler
299 self.universe = self.butler.dimensions
300 self.datasetType = self.getDatasetType()
301 self._on_success = on_success
302 self._on_metadata_failure = on_metadata_failure
303 self._on_ingest_failure = on_ingest_failure
304 self.progress = Progress("obs.base.RawIngestTask")
306 # Import all the instrument classes so that we ensure that we
307 # have all the relevant metadata translators loaded.
308 Instrument.importAll(self.butler.registry)
310 # Read all the instrument records into a cache since they will be
311 # needed later to calculate day_obs timespans, if appropriate.
312 self._instrument_records = {
313 rec.name: rec for rec in butler.registry.queryDimensionRecords("instrument")
314 }
316 def _reduce_kwargs(self) -> dict[str, Any]:
317 # Add extra parameters to pickle.
318 return dict(
319 **super()._reduce_kwargs(),
320 butler=self.butler,
321 on_success=self._on_success,
322 on_metadata_failure=self._on_metadata_failure,
323 on_ingest_failure=self._on_ingest_failure,
324 )
326 def _determine_instrument_formatter(
327 self, dataId: DataCoordinate, filename: ResourcePath
328 ) -> tuple[Instrument | None, type[Formatter]]:
329 """Determine the instrument and formatter class.
331 Parameters
332 ----------
333 dataId : `lsst.daf.butler.DataCoordinate`
334 The dataId associated with this dataset.
335 filename : `lsst.resources.ResourcePath`
336 URI of file used for error reporting.
338 Returns
339 -------
340 instrument : `Instrument` or `None`
341 Instance of the `Instrument` associated with this dataset. `None`
342 indicates that the instrument could not be determined.
343 formatterClass : `type`
344 Class to be used as the formatter for this dataset.
345 """
346 # The data model currently assumes that whilst multiple datasets
347 # can be associated with a single file, they must all share the
348 # same formatter.
349 try:
350 instrument = Instrument.fromName(dataId["instrument"], self.butler.registry) # type: ignore
351 except LookupError as e:
352 self._on_metadata_failure(filename, e)
353 self.log.warning(
354 "Instrument %s for file %s not known to registry", dataId["instrument"], filename
355 )
356 if self.config.failFast:
357 raise RuntimeError(
358 f"Instrument {dataId['instrument']} for file {filename} not known to registry"
359 ) from e
360 FormatterClass = Formatter
361 # Indicate that we could not work out the instrument.
362 instrument = None
363 else:
364 assert instrument is not None, "Should be guaranteed by fromName succeeding."
365 FormatterClass = instrument.getRawFormatter(dataId)
366 return instrument, FormatterClass
368 def extractMetadata(self, filename: ResourcePath) -> RawFileData:
369 """Extract and process metadata from a single raw file.
371 Parameters
372 ----------
373 filename : `lsst.resources.ResourcePath`
374 URI to the file.
376 Returns
377 -------
378 data : `RawFileData`
379 A structure containing the metadata extracted from the file,
380 as well as the original filename. All fields will be populated,
381 but the `RawFileData.dataId` attribute will be a minimal
382 (unexpanded) `~lsst.daf.butler.DataCoordinate` instance. The
383 ``instrument`` field will be `None` if there is a problem
384 with metadata extraction.
386 Notes
387 -----
388 Assumes that there is a single dataset associated with the given
389 file. Instruments using a single file to store multiple datasets
390 must implement their own version of this method.
392 By default the method will catch all exceptions unless the ``failFast``
393 configuration item is `True`. If an error is encountered the
394 `_on_metadata_failure()` method will be called. If an error was
395 encountered but no exception was raised, the returned object will
396 have a null instrument and no datasets.
398 This method supports sidecar JSON files which can be used to
399 extract metadata without having to read the data file itself.
400 The sidecar file is always used if found.
401 """
402 sidecar_fail_msg = "" # Requires prepended space when set.
403 try:
404 sidecar_file = filename.updatedExtension(".json")
405 if sidecar_file.exists():
406 content = json.loads(sidecar_file.read())
407 headers = [process_sidecar_data(content)]
408 sidecar_fail_msg = " (via sidecar)"
409 else:
410 # Read the metadata from the data file itself.
412 # For remote files download the entire file to get the
413 # header. This is very inefficient and it would be better
414 # to have some way of knowing where in the file the headers
415 # are and to only download those parts of the file.
416 with filename.as_local() as local_file:
417 # Read the primary. This might be sufficient.
418 header = readMetadata(local_file.ospath, 0)
419 translator_class = None
421 try:
422 # Try to work out a translator class early.
423 translator_class = MetadataTranslator.determine_translator(
424 header, filename=str(filename)
425 )
426 except ValueError:
427 # Primary header was not sufficient (maybe this file
428 # has been compressed or is a MEF with minimal
429 # primary). Read second header and merge with primary.
430 header = merge_headers([header, readMetadata(local_file.ospath, 1)], mode="overwrite")
432 # Try again to work out a translator class, letting this
433 # fail.
434 if translator_class is None:
435 translator_class = MetadataTranslator.determine_translator(
436 header, filename=str(filename)
437 )
439 # Request the headers to use for ingest
440 headers = list(translator_class.determine_translatable_headers(local_file.ospath, header))
442 # Add each header to the dataset list
443 datasets = [self._calculate_dataset_info(h, filename) for h in headers]
445 except Exception as e:
446 self.log.debug("Problem extracting metadata from %s%s: %s", filename, sidecar_fail_msg, e)
447 # Indicate to the caller that we failed to read.
448 datasets = []
449 formatterClass = Formatter
450 instrument = None
451 self._on_metadata_failure(filename, e)
452 if self.config.failFast:
453 raise RuntimeError(
454 f"Problem extracting metadata for file {filename}{sidecar_fail_msg}"
455 ) from e
456 else:
457 self.log.debug("Extracted metadata for file %s%s", filename, sidecar_fail_msg)
458 # The data model currently assumes that whilst multiple datasets
459 # can be associated with a single file, they must all share the
460 # same formatter.
461 instrument, formatterClass = self._determine_instrument_formatter(datasets[0].dataId, filename)
462 if instrument is None:
463 datasets = []
465 return RawFileData(
466 datasets=datasets,
467 filename=filename,
468 # MyPy wants this to be a non-abstract class, which is not true
469 # for the error case where instrument is None and datasets=[].
470 FormatterClass=formatterClass, # type: ignore
471 instrument=instrument,
472 )
474 @classmethod
475 def getObservationInfoSubsets(cls) -> tuple[set, set]:
476 """Return subsets of fields in the `ObservationInfo` that we care
477 about.
479 These fields will be used in constructing an exposure record.
481 Returns
482 -------
483 required : `set`
484 Set of `ObservationInfo` field names that are required.
485 optional : `set`
486 Set of `ObservationInfo` field names we will use if they are
487 available.
488 """
489 # Marking the new properties "group_counter_*" and
490 # "has_simulated_content" as required, assumes that we either
491 # recreate any existing index/sidecar files that include translated
492 # values, or else allow astro_metadata_translator to fill in
493 # defaults.
494 required = {
495 "datetime_begin",
496 "datetime_end",
497 "detector_num",
498 "exposure_group",
499 "exposure_id",
500 "exposure_time",
501 "group_counter_end",
502 "group_counter_start",
503 "has_simulated_content",
504 "instrument",
505 "observation_id",
506 "observation_type",
507 "observing_day",
508 "physical_filter",
509 }
510 optional = {
511 "altaz_begin",
512 "boresight_rotation_coord",
513 "boresight_rotation_angle",
514 "dark_time",
515 "tracking_radec",
516 "object",
517 "observation_counter",
518 "observation_reason",
519 "observing_day_offset",
520 "science_program",
521 "visit_id",
522 "can_see_sky",
523 }
524 return required, optional
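# Illustrative sketch (hypothetical subclass, not part of obs_base): an
# instrument-specific task can widen these subsets by overriding the
# classmethod; "focus_z" is just an assumed extra ObservationInfo field.
#
#     @classmethod
#     def getObservationInfoSubsets(cls) -> tuple[set, set]:
#         required, optional = super().getObservationInfoSubsets()
#         optional |= {"focus_z"}
#         return required, optional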
526 def _calculate_dataset_info(
527 self, header: MutableMapping[str, Any] | ObservationInfo, filename: ResourcePath
528 ) -> RawFileDatasetInfo:
529 """Calculate a RawFileDatasetInfo from the supplied information.
531 Parameters
532 ----------
533 header : Mapping or `astro_metadata_translator.ObservationInfo`
534 Header from the dataset or previously-translated content.
535 filename : `lsst.resources.ResourcePath`
536 Filename to use for error messages.
538 Returns
539 -------
540 dataset : `RawFileDatasetInfo`
541 The dataId, and observation information associated with this
542 dataset.
543 """
544 required, optional = self.getObservationInfoSubsets()
545 if isinstance(header, ObservationInfo):
546 obsInfo = header
547 missing = []
548 # Need to check the required properties are present.
549 for property in required:
550 # getattr does not need to be protected because it is using
551 # the defined list above containing properties that must exist.
552 value = getattr(obsInfo, property)
553 if value is None:
554 missing.append(property)
555 if missing:
556 raise ValueError(
557 f"Requested required properties are missing from file {filename}: {missing} (via JSON)"
558 )
560 else:
561 obsInfo = ObservationInfo(
562 header,
563 pedantic=False,
564 filename=str(filename),
565 required=required,
566 subset=required | optional,
567 )
569 dataId = DataCoordinate.standardize(
570 instrument=obsInfo.instrument,
571 exposure=obsInfo.exposure_id,
572 detector=obsInfo.detector_num,
573 universe=self.universe,
574 )
575 return RawFileDatasetInfo(obsInfo=obsInfo, dataId=dataId)
577 def locateAndReadIndexFiles(
578 self, files: Iterable[ResourcePath]
579 ) -> tuple[dict[ResourcePath, Any], list[ResourcePath], set[ResourcePath], set[ResourcePath]]:
580 """Given a list of files, look for index files and read them.
582 Index files can either be explicitly in the list of files to
583 ingest, or else located in the same directory as a file to ingest.
584 Index entries are always used if present.
586 Parameters
587 ----------
588 files : iterable over `lsst.resources.ResourcePath`
589 URIs to the files to be ingested.
591 Returns
592 -------
593 index : `dict` [`ResourcePath`, Any]
594 Merged contents of all relevant index files found. These can
595 be explicitly specified index files or ones found in the
596 directory alongside a data file to be ingested.
597 updated_files : `list` of `ResourcePath`
598 Updated list of the input files with entries removed that were
599 found listed in an index file. Order is not guaranteed to
600 match the order of the files given to this routine.
601 good_index_files : `set` [ `ResourcePath` ]
602 Index files that were successfully read.
603 bad_index_files : `set` [ `ResourcePath` ]
604 Files that looked like index files but failed to read properly.
605 """
606 # Convert the paths to absolute for easy comparison with index content.
607 # Do not convert to real paths since we have to assume that index
608 # files are in this location and not the location they link to.
609 files = tuple(f.abspath() for f in files)
611 # Index files must be named this.
612 index_root_file = "_index.json"
614 # Group the files by directory.
615 files_by_directory = defaultdict(set)
617 for path in files:
618 directory, file_in_dir = path.split()
619 files_by_directory[directory].add(file_in_dir)
621 # All the metadata read from index files with keys of full path.
622 index_entries: dict[ResourcePath, Any] = {}
624 # Index files we failed to read.
625 bad_index_files = set()
627 # Any good index files that were found and used.
628 good_index_files = set()
630 # Look for index files in those directories.
631 for directory, files_in_directory in files_by_directory.items():
632 possible_index_file = directory.join(index_root_file)
633 if possible_index_file.exists():
634 # If we are explicitly requesting an index file the
635 # messages should be different.
636 index_msg = "inferred"
637 is_implied = True
638 if index_root_file in files_in_directory:
639 index_msg = "explicit"
640 is_implied = False
642 # Try to read the index file and catch and report any
643 # problems.
644 try:
645 content = json.loads(possible_index_file.read())
646 index = process_index_data(content, force_dict=True)
647 # mypy should in theory know that this is a mapping
648 # from the overload type annotation of process_index_data.
649 assert isinstance(index, MutableMapping)
650 except Exception as e:
651 # Only trigger the callback if the index file
652 # was asked for explicitly. Triggering on implied file
653 # might be surprising.
654 if not is_implied:
655 self._on_metadata_failure(possible_index_file, e)
656 if self.config.failFast:
657 raise RuntimeError(
658 f"Problem reading index file from {index_msg} location {possible_index_file}"
659 ) from e
660 bad_index_files.add(possible_index_file)
661 continue
663 self.log.debug("Extracted index metadata from %s file %s", index_msg, possible_index_file)
664 good_index_files.add(possible_index_file)
666 # Go through the index adding entries for files.
667 # If we have non-index files in this directory marked for
668 # ingest we should only get index information for those.
669 # If the index file was explicit we use all entries.
670 if is_implied:
671 files_to_ingest = files_in_directory
672 else:
673 files_to_ingest = set(index)
675 # Copy relevant metadata into a single dict for all index
676 # entries.
677 for file_in_dir in files_to_ingest:
678 # Skip an explicitly specified index file.
679 # This should never happen because an explicit index
680 # file will force ingest of all files in the index
681 # and not use the explicit file list. If somehow
682 # this is not true we continue. Raising an exception
683 # seems like the wrong thing to do since this is harmless.
684 if file_in_dir == index_root_file:
685 self.log.info(
686 "Logic error found scanning directory %s. Please file ticket.", directory
687 )
688 continue
689 if file_in_dir in index:
690 file = directory.join(file_in_dir)
691 if file in index_entries:
692 # ObservationInfo overrides raw metadata
693 if isinstance(index[file_in_dir], ObservationInfo) and not isinstance(
694 index_entries[file], ObservationInfo
695 ):
696 self.log.warning(
697 "File %s already specified in an index file but overriding"
698 " with ObservationInfo content from %s",
699 file,
700 possible_index_file,
701 )
702 else:
703 self.log.warning(
704 "File %s already specified in an index file, ignoring content from %s",
705 file,
706 possible_index_file,
707 )
708 # Do nothing in this case
709 continue
711 index_entries[file] = index[file_in_dir]
713 # Remove files from list that have index entries and also
714 # any files that we determined to be explicit index files
715 # or any index files that we failed to read.
716 filtered = set(files) - set(index_entries) - good_index_files - bad_index_files
718 # The filtered list loses the initial order. Retaining the order
719 # is good for testing but does have a cost if there are many
720 # files when copying the good values out. A dict would have faster
721 # lookups (using the files as keys) but use more memory.
722 ordered = [f for f in files if f in filtered]
724 return index_entries, ordered, good_index_files, bad_index_files
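# Illustrative usage sketch (not part of obs_base; ``task`` and the path are
# placeholders): resolve index files first, then process their entries and
# extract metadata from whatever data files remain.
#
#     files = [ResourcePath("/data/raw/exposure-001.fits")]
#     index, remaining, good, bad = task.locateAndReadIndexFiles(files)
#     file_data = task.processIndexEntries(index)
#     file_data.extend(task.extractMetadata(f) for f in remaining)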
726 def processIndexEntries(self, index_entries: dict[ResourcePath, Any]) -> list[RawFileData]:
727 """Convert index entries to RawFileData.
729 Parameters
730 ----------
731 index_entries : `dict` [`ResourcePath`, Any]
732 Dict indexed by the name of the file to ingest, with values that are
733 either raw metadata or translated
734 `~astro_metadata_translator.ObservationInfo`.
736 Returns
737 -------
738 data : `list` [ `RawFileData` ]
739 Structures containing the metadata extracted from the file,
740 as well as the original filename. All fields will be populated,
741 but the `RawFileData.dataId` attributes will be minimal
742 (unexpanded) `~lsst.daf.butler.DataCoordinate` instances.
743 """
744 fileData = []
745 for filename, metadata in index_entries.items():
746 try:
747 datasets = [self._calculate_dataset_info(metadata, filename)]
748 except Exception as e:
749 self.log.debug("Problem extracting metadata for file %s found in index file: %s", filename, e)
750 datasets = []
751 formatterClass = Formatter
752 instrument = None
753 self._on_metadata_failure(filename, e)
754 if self.config.failFast:
755 raise RuntimeError(
756 f"Problem extracting metadata for file {filename} found in index file"
757 ) from e
758 else:
759 instrument, formatterClass = self._determine_instrument_formatter(
760 datasets[0].dataId, filename
761 )
762 if instrument is None:
763 datasets = []
764 fileData.append(
765 RawFileData(
766 datasets=datasets,
767 filename=filename,
768 # MyPy wants this to be a non-abstract class, which is not
769 # true for the error case where instrument is None and
770 # datasets=[].
771 FormatterClass=formatterClass, # type: ignore
772 instrument=instrument,
773 )
774 )
775 return fileData
777 def groupByExposure(self, files: Iterable[RawFileData]) -> list[RawExposureData]:
778 """Group an iterable of `RawFileData` by exposure.
780 Parameters
781 ----------
782 files : iterable of `RawFileData`
783 File-level information to group.
785 Returns
786 -------
787 exposures : `list` of `RawExposureData`
788 A list of structures that group the file-level information by
789 exposure. All fields will be populated. The
790 `RawExposureData.dataId` attributes will be minimal (unexpanded)
791 `~lsst.daf.butler.DataCoordinate` instances.
792 """
793 exposureDimensions = self.universe["exposure"].graph
794 byExposure = defaultdict(list)
795 for f in files:
796 # Assume that the first dataset is representative for the file.
797 byExposure[f.datasets[0].dataId.subset(exposureDimensions)].append(f)
799 return [
800 RawExposureData(
801 dataId=dataId,
802 files=exposureFiles,
803 universe=self.universe,
804 record=self.makeExposureRecord(exposureFiles[0].datasets[0].obsInfo, self.universe),
805 dependencyRecords=self.makeDependencyRecords(
806 exposureFiles[0].datasets[0].obsInfo, self.universe
807 ),
808 )
809 for dataId, exposureFiles in byExposure.items()
810 ]
812 def makeExposureRecord(
813 self, obsInfo: ObservationInfo, universe: DimensionUniverse, **kwargs: Any
814 ) -> DimensionRecord:
815 """Construct a registry record for an exposure.
817 This is a method that subclasses will often want to customize. This can
818 often be done by calling this base class implementation with additional
819 ``kwargs``.
821 Parameters
822 ----------
823 obsInfo : `ObservationInfo`
824 Observation details for (one of the components of) the exposure.
825 universe : `DimensionUniverse`
826 Set of all known dimensions.
827 **kwargs
828 Additional field values for this record.
830 Returns
831 -------
832 record : `DimensionRecord`
833 The exposure record that must be inserted into the
834 `~lsst.daf.butler.Registry` prior to file-level ingest.
835 """
836 return makeExposureRecordFromObsInfo(obsInfo, universe, **kwargs)
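# Illustrative sketch (hypothetical subclass, not part of obs_base): the usual
# customization is to forward extra column values to this base implementation;
# "extra_column" is a stand-in for a field actually present in the exposure
# record of the subclass's dimension universe.
#
#     def makeExposureRecord(self, obsInfo, universe, **kwargs):
#         return super().makeExposureRecord(
#             obsInfo, universe, extra_column=..., **kwargs
#         )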
838 def makeDependencyRecords(
839 self, obsInfo: ObservationInfo, universe: DimensionUniverse
840 ) -> dict[str, DimensionRecord]:
841 """Construct dependency records.
843 These dependency records will be inserted into the
844 `~lsst.daf.butler.Registry` before the exposure records, because they
845 are dependencies of the exposure. This allows an opportunity to satisfy
846 foreign key constraints that exist because of dimensions related to the
847 exposure.
849 This is a method that subclasses may want to customize, if they've
850 added dimensions that relate to an exposure.
852 Parameters
853 ----------
854 obsInfo : `ObservationInfo`
855 Observation details for (one of the components of) the exposure.
856 universe : `DimensionUniverse`
857 Set of all known dimensions.
859 Returns
860 -------
861 records : `dict` [`str`, `DimensionRecord`]
862 The records to insert, indexed by dimension name.
863 """
864 records: dict[str, DimensionRecord] = {}
865 if "exposure" not in universe:
866 return records
867 exposure = universe["exposure"]
868 if "group" in exposure.implied:
869 records["group"] = universe["group"].RecordClass(
870 name=obsInfo.exposure_group,
871 instrument=obsInfo.instrument,
872 )
873 if "day_obs" in exposure.implied:
874 if (offset := getattr(obsInfo, "observing_day_offset")) is not None:
875 offset_int = round(offset.to_value("s"))
876 timespan = Timespan.from_day_obs(obsInfo.observing_day, offset_int)
877 else:
878 timespan = None
879 records["day_obs"] = universe["day_obs"].RecordClass(
880 instrument=obsInfo.instrument,
881 id=obsInfo.observing_day,
882 timespan=timespan,
883 )
884 return records
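# Illustrative sketch (hypothetical subclass, not part of obs_base): a
# subclass that has added its own exposure-related dimension could extend the
# returned mapping; "my_dimension" and its record fields are assumptions.
#
#     def makeDependencyRecords(self, obsInfo, universe):
#         records = super().makeDependencyRecords(obsInfo, universe)
#         if "my_dimension" in universe:
#             records["my_dimension"] = universe["my_dimension"].RecordClass(
#                 instrument=obsInfo.instrument,
#                 name=obsInfo.science_program,
#             )
#         return records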
886 def expandDataIds(self, data: RawExposureData) -> RawExposureData:
887 """Expand the data IDs associated with a raw exposure.
889 This adds the metadata records.
891 Parameters
892 ----------
893 data : `RawExposureData`
894 A structure containing information about the exposure to be
895 ingested. Must have `RawExposureData.record` populated. Should
896 be considered consumed upon return.
898 Returns
899 -------
900 exposure : `RawExposureData`
901 An updated version of the input structure, with
902 `RawExposureData.dataId` and nested `RawFileData.dataId` attributes
903 updated to data IDs for which
904 `~lsst.daf.butler.DataCoordinate.hasRecords` returns `True`.
905 """
906 # We start by expanding the exposure-level data ID; we won't use that
907 # directly in file ingest, but this lets us do some database lookups
908 # once per exposure instead of once per file later.
909 data.dataId = self.butler.registry.expandDataId(
910 data.dataId,
911 # We pass in the records we'll be inserting shortly so they aren't
912 # looked up from the database. We do expect instrument and filter
913 # records to be retrieved from the database here (though the
914 # Registry may cache them so there isn't a lookup every time).
915 records={"exposure": data.record},
916 )
917 # Now we expand the per-file (exposure+detector) data IDs. This time
918 # we pass in the records we just retrieved from the exposure data ID
919 # expansion.
920 for file in data.files:
921 for dataset in file.datasets:
922 dataset.dataId = self.butler.registry.expandDataId(
923 dataset.dataId,
924 records={k: data.dataId.records[k] for k in data.dataId.dimensions.elements},
925 )
926 return data
928 def prep(
929 self, files: Iterable[ResourcePath], *, pool: PoolType | None = None
930 ) -> tuple[Iterator[RawExposureData], list[ResourcePath]]:
931 """Perform all non-database-updating ingest preprocessing steps.
933 Parameters
934 ----------
935 files : iterable over `lsst.resources.ResourcePath`
936 Paths to the files to be ingested. Will be made absolute
937 if they are not already.
938 pool : `multiprocessing.Pool`, optional
939 If not `None`, a process pool with which to parallelize some
940 operations.
942 Returns
943 -------
944 exposures : `Iterator` [ `RawExposureData` ]
945 Data structures containing dimension records, filenames, and data
946 IDs to be ingested (one structure for each exposure).
947 bad_files : `list` of `lsst.resources.ResourcePath`
948 List of all the files that could not have metadata extracted.
949 """
950 mapFunc = map if pool is None else pool.imap_unordered
952 def _partition_good_bad(
953 file_data: Iterable[RawFileData],
954 ) -> tuple[list[RawFileData], list[ResourcePath]]:
955 """Filter out bad files and return good with list of bad."""
956 good_files = []
957 bad_files = []
958 for fileDatum in self.progress.wrap(file_data, desc="Reading image metadata"):
959 if not fileDatum.datasets:
960 bad_files.append(fileDatum.filename)
961 else:
962 good_files.append(fileDatum)
963 return good_files, bad_files
965 # Look for index files and read them.
966 # There should be far fewer index files than data files.
967 index_entries, files, good_index_files, bad_index_files = self.locateAndReadIndexFiles(files)
968 if bad_index_files:
969 self.log.info("Failed to read the following explicitly requested index files:")
970 for bad in sorted(bad_index_files):
971 self.log.info("- %s", bad)
973 # Now convert all the index file entries to standard form for ingest.
974 processed_bad_index_files: list[ResourcePath] = []
975 indexFileData = self.processIndexEntries(index_entries)
976 if indexFileData:
977 indexFileData, processed_bad_index_files = _partition_good_bad(indexFileData)
978 self.log.info(
979 "Successfully extracted metadata for %d file%s found in %d index file%s with %d failure%s",
980 *_log_msg_counter(indexFileData),
981 *_log_msg_counter(good_index_files),
982 *_log_msg_counter(processed_bad_index_files),
983 )
985 # Extract metadata and build per-detector regions.
986 # This could run in a subprocess so collect all output
987 # before looking at failures.
988 fileData: Iterator[RawFileData] = mapFunc(self.extractMetadata, files)
990 # Filter out all the failed reads and store them for later
991 # reporting.
992 good_file_data, bad_files = _partition_good_bad(fileData)
993 self.log.info(
994 "Successfully extracted metadata from %d file%s with %d failure%s",
995 *_log_msg_counter(good_file_data),
996 *_log_msg_counter(bad_files),
997 )
999 # Combine with data from index files.
1000 good_file_data.extend(indexFileData)
1001 bad_files.extend(processed_bad_index_files)
1002 bad_files.extend(bad_index_files)
1004 # Use that metadata to group files (and extracted metadata) by
1005 # exposure. Never parallelized because it's intrinsically a gather
1006 # step.
1007 exposureData: list[RawExposureData] = self.groupByExposure(good_file_data)
1009 # The next operation operates on RawExposureData instances (one at
1010 # a time) in-place and then returns the modified instance. We call it
1011 # as a pass-through instead of relying on the arguments we pass in to
1012 # have been modified because in the parallel case those arguments are
1013 # going to be pickled and unpickled, and I'm not certain
1014 # multiprocessing is careful enough with that for output arguments to
1015 # work.
1017 # Expand the data IDs to include all dimension metadata; we need this
1018 # because we may need to generate path templates that rely on that
1019 # metadata.
1020 # This is the first step that involves actual database calls (but just
1021 # SELECTs), so if there's going to be a problem with connections vs.
1022 # multiple processes, or lock contention (in SQLite) slowing things
1023 # down, it'll happen here.
1024 return mapFunc(self.expandDataIds, exposureData), bad_files
1026 def ingestExposureDatasets(
1027 self,
1028 exposure: RawExposureData,
1029 datasetType: DatasetType,
1030 *,
1031 run: str,
1032 skip_existing_exposures: bool = False,
1033 track_file_attrs: bool = True,
1034 ) -> list[FileDataset]:
1035 """Ingest all raw files in one exposure.
1037 Parameters
1038 ----------
1039 exposure : `RawExposureData`
1040 A structure containing information about the exposure to be
1041 ingested. Must have `RawExposureData.record` populated and all
1042 data ID attributes expanded.
1043 datasetType : `DatasetType`
1044 The dataset type associated with this exposure.
1045 run : `str`
1046 Name of a RUN-type collection to write to.
1047 skip_existing_exposures : `bool`, optional
1048 If `True` (`False` is default), skip raws that have already been
1049 ingested (i.e. raws for which we already have a dataset with the
1050 same data ID in the target collection, even if from another file).
1051 Note that this is much slower than just not passing
1052 already-ingested files as inputs, because we still need to read and
1053 process metadata to identify which exposures to search for. It
1054 also will not work reliably if multiple processes are attempting to
1055 ingest raws from the same exposure concurrently, in that different
1056 processes may still attempt to ingest the same raw and conflict,
1057 causing a failure that prevents other raws from the same exposure
1058 from being ingested.
1059 track_file_attrs : `bool`, optional
1060 Control whether file attributes such as the size or checksum should
1061 be tracked by the datastore. Whether this parameter is honored
1062 depends on the specific datastore implementation.
1064 Returns
1065 -------
1066 datasets : `list` of `lsst.daf.butler.FileDataset`
1067 Per-file structures identifying the files ingested and their
1068 dataset representation in the data repository.
1069 """
1070 if skip_existing_exposures:
1071 existing = {
1072 ref.dataId
1073 for ref in self.butler.registry.queryDatasets(
1074 datasetType,
1075 collections=[run],
1076 dataId=exposure.dataId,
1077 )
1078 }
1079 else:
1080 existing = set()
1082 # Raw files are preferentially ingested using a UUID derived from
1083 # the collection name and dataId.
1084 if self.butler.registry.supportsIdGenerationMode(DatasetIdGenEnum.DATAID_TYPE_RUN):
1085 mode = DatasetIdGenEnum.DATAID_TYPE_RUN
1086 else:
1087 mode = DatasetIdGenEnum.UNIQUE
1089 datasets = []
1090 for file in exposure.files:
1091 refs = [
1092 DatasetRef(datasetType, d.dataId, run=run, id_generation_mode=mode)
1093 for d in file.datasets
1094 if d.dataId not in existing
1095 ]
1096 if refs:
1097 datasets.append(
1098 FileDataset(path=file.filename.abspath(), refs=refs, formatter=file.FormatterClass)
1099 )
1101 self.butler.ingest(
1102 *datasets,
1103 transfer=self.config.transfer,
1104 record_validation_info=track_file_attrs,
1105 )
1106 return datasets
1108 def ingestFiles(
1109 self,
1110 files: Iterable[ResourcePath],
1111 *,
1112 pool: PoolType | None = None,
1113 processes: int = 1,
1114 run: str | None = None,
1115 skip_existing_exposures: bool = False,
1116 update_exposure_records: bool = False,
1117 track_file_attrs: bool = True,
1118 ) -> tuple[list[DatasetRef], list[ResourcePath], int, int, int]:
1119 """Ingest files into a Butler data repository.
1121 This creates any new exposure or visit Dimension entries needed to
1122 identify the ingested files, creates new Dataset entries in the
1123 Registry and finally ingests the files themselves into the Datastore.
1124 Any needed instrument, detector, and physical_filter Dimension entries
1125 must exist in the Registry before `run` is called.
1127 Parameters
1128 ----------
1129 files : iterable over `lsst.resources.ResourcePath`
1130 URIs to the files to be ingested.
1131 pool : `multiprocessing.Pool`, optional
1132 If not `None`, a process pool with which to parallelize some
1133 operations.
1134 processes : `int`, optional
1135 The number of processes to use. Ignored if ``pool`` is not `None`.
1136 run : `str`, optional
1137 Name of a RUN-type collection to write to, overriding
1138 the default derived from the instrument name.
1139 skip_existing_exposures : `bool`, optional
1140 If `True` (`False` is default), skip raws that have already been
1141 ingested (i.e. raws for which we already have a dataset with the
1142 same data ID in the target collection, even if from another file).
1143 Note that this is much slower than just not passing
1144 already-ingested files as inputs, because we still need to read and
1145 process metadata to identify which exposures to search for. It
1146 also will not work reliably if multiple processes are attempting to
1147 ingest raws from the same exposure concurrently, in that different
1148 processes may still attempt to ingest the same raw and conflict,
1149 causing a failure that prevents other raws from the same exposure
1150 from being ingested.
1151 update_exposure_records : `bool`, optional
1152 If `True` (`False` is default), update existing exposure records
1153 that conflict with the new ones instead of rejecting them. THIS IS
1154 AN ADVANCED OPTION THAT SHOULD ONLY BE USED TO FIX METADATA THAT IS
1155 KNOWN TO BE BAD. This should usually be combined with
1156 ``skip_existing_exposures=True``.
1157 track_file_attrs : `bool`, optional
1158 Control whether file attributes such as the size or checksum should
1159 be tracked by the datastore. Whether this parameter is honored
1160 depends on the specific datastore implementation.
1162 Returns
1163 -------
1164 refs : `list` of `lsst.daf.butler.DatasetRef`
1165 Dataset references for ingested raws.
1166 bad_files : `list` of `ResourcePath`
1167 Given paths that could not be ingested.
1168 n_exposures : `int`
1169 Number of exposures successfully ingested.
1170 n_exposures_failed : `int`
1171 Number of exposures that failed when inserting dimension data.
1172 n_ingests_failed : `int`
1173 Number of exposures that failed when ingesting raw datasets.
1174 """
1175 created_pool = False
1176 if pool is None and processes > 1:
1177 pool = Pool(processes)
1178 created_pool = True
1180 try:
1181 exposureData, bad_files = self.prep(files, pool=pool)
1182 finally:
1183 if created_pool and pool:
1184 # The pool is not needed any more so close it if we created
1185 # it to ensure we clean up resources.
1186 pool.close()
1187 pool.join()
1189 # Up to this point, we haven't modified the data repository at all.
1190 # Now we finally do that, with one transaction per exposure. This is
1191 # not parallelized at present because the performance of this step is
1192 # limited by the database server. That may or may not change in the
1193 # future once we increase our usage of bulk inserts and reduce our
1194 # usage of savepoints; we've tried to get everything but the database
1195 # operations done in advance to reduce the time spent inside
1196 # transactions.
1197 refs = []
1198 runs = set()
1199 datasetTypes: dict[str, DatasetType] = {}
1200 n_exposures = 0
1201 n_exposures_failed = 0
1202 n_ingests_failed = 0
1203 for exposure in self.progress.wrap(exposureData, desc="Ingesting raw exposures"):
1204 assert exposure.record is not None, "Should be guaranteed by prep()"
1205 self.log.debug(
1206 "Attempting to ingest %d file%s from exposure %s:%s",
1207 *_log_msg_counter(exposure.files),
1208 exposure.record.instrument,
1209 exposure.record.obs_id,
1210 )
1212 try:
1213 for name, record in exposure.dependencyRecords.items():
1214 self.butler.registry.syncDimensionData(name, record, update=update_exposure_records)
1215 inserted_or_updated = self.butler.registry.syncDimensionData(
1216 "exposure",
1217 exposure.record,
1218 update=update_exposure_records,
1219 )
1220 except Exception as e:
1221 self._on_ingest_failure(exposure, e)
1222 n_exposures_failed += 1
1223 self.log.warning(
1224 "Exposure %s:%s could not be registered: %s",
1225 exposure.record.instrument,
1226 exposure.record.obs_id,
1227 e,
1228 )
1229 if self.config.failFast:
1230 raise e
1231 continue
1233 if isinstance(inserted_or_updated, dict):
1234 # Exposure is in the registry and we updated it, so
1235 # syncDimensionData returned a dict.
1236 self.log.info(
1237 "Exposure %s:%s was already present, but columns %s were updated.",
1238 exposure.record.instrument,
1239 exposure.record.obs_id,
1240 str(list(inserted_or_updated.keys())),
1241 )
1243 # Determine the instrument so we can work out the dataset type.
1244 instrument = exposure.files[0].instrument
1245 assert (
1246 instrument is not None
1247 ), "file should have been removed from this list by prep if instrument could not be found"
1249 if raw_definition := getattr(instrument, "raw_definition", None):
1250 datasetTypeName, dimensions, storageClass = raw_definition
1251 if not (datasetType := datasetTypes.get(datasetTypeName)):
1252 datasetType = DatasetType(
1253 datasetTypeName, dimensions, storageClass, universe=self.butler.dimensions
1254 )
1255 else:
1256 datasetType = self.datasetType
1257 if datasetType.name not in datasetTypes:
1258 self.butler.registry.registerDatasetType(datasetType)
1259 datasetTypes[datasetType.name] = datasetType
1261 # Use the instrument's default run if none was specified explicitly.
1262 if run is None:
1263 this_run = instrument.makeDefaultRawIngestRunName()
1264 else:
1265 this_run = run
1266 if this_run not in runs:
1267 self.butler.registry.registerCollection(this_run, type=CollectionType.RUN)
1268 runs.add(this_run)
1269 try:
1270 datasets_for_exposure = self.ingestExposureDatasets(
1271 exposure,
1272 datasetType=datasetType,
1273 run=this_run,
1274 skip_existing_exposures=skip_existing_exposures,
1275 track_file_attrs=track_file_attrs,
1276 )
1277 except Exception as e:
1278 self._on_ingest_failure(exposure, e)
1279 n_ingests_failed += 1
1280 self.log.warning("Failed to ingest the following for reason: %s", e)
1281 for f in exposure.files:
1282 self.log.warning("- %s", f.filename)
1283 if self.config.failFast:
1284 raise e
1285 continue
1286 else:
1287 self._on_success(datasets_for_exposure)
1288 for dataset in datasets_for_exposure:
1289 refs.extend(dataset.refs)
1291 # Success for this exposure.
1292 n_exposures += 1
1293 self.log.info(
1294 "Exposure %s:%s ingested successfully", exposure.record.instrument, exposure.record.obs_id
1295 )
1297 return refs, bad_files, n_exposures, n_exposures_failed, n_ingests_failed
1299 @timeMethod
1300 def run(
1301 self,
1302 files: Iterable[ResourcePathExpression],
1303 *,
1304 pool: PoolType | None = None,
1305 processes: int = 1,
1306 run: str | None = None,
1307 file_filter: str | re.Pattern = r"\.fit[s]?\b",
1308 group_files: bool = True,
1309 skip_existing_exposures: bool = False,
1310 update_exposure_records: bool = False,
1311 track_file_attrs: bool = True,
1312 ) -> list[DatasetRef]:
1313 """Ingest files into a Butler data repository.
1315 This creates any new exposure or visit Dimension entries needed to
1316 identify the ingested files, creates new Dataset entries in the
1317 Registry and finally ingests the files themselves into the Datastore.
1318 Any needed instrument, detector, and physical_filter Dimension entries
1319 must exist in the Registry before `run` is called.
1321 Parameters
1322 ----------
1323 files : iterable of `lsst.resources.ResourcePath`, `str`, or path-like
1324 Paths to the files to be ingested. Can refer to directories.
1325 Will be made absolute if they are not already.
1326 pool : `multiprocessing.Pool`, optional
1327 If not `None`, a process pool with which to parallelize some
1328 operations.
1329 processes : `int`, optional
1330 The number of processes to use. Ignored if ``pool`` is not `None`.
1331 run : `str`, optional
1332 Name of a RUN-type collection to write to, overriding
1333 the default derived from the instrument name.
1334 file_filter : `str` or `re.Pattern`, optional
1335 Pattern to use to discover files to ingest within directories.
1336 The default is to search for FITS files. The regex applies to
1337 files within the directory.
1338 group_files : `bool`, optional
1339 Group files by directory if they have been discovered in
1340 directories. Will not affect files explicitly provided.
1341 skip_existing_exposures : `bool`, optional
1342 If `True` (`False` is default), skip raws that have already been
1343 ingested (i.e. raws for which we already have a dataset with the
1344 same data ID in the target collection, even if from another file).
1345 Note that this is much slower than just not passing
1346 already-ingested files as inputs, because we still need to read and
1347 process metadata to identify which exposures to search for. It
1348 also will not work reliably if multiple processes are attempting to
1349 ingest raws from the same exposure concurrently, in that different
1350 processes may still attempt to ingest the same raw and conflict,
1351 causing a failure that prevents other raws from the same exposure
1352 from being ingested.
1353 update_exposure_records : `bool`, optional
1354 If `True` (`False` is default), update existing exposure records
1355 that conflict with the new ones instead of rejecting them. THIS IS
1356 AN ADVANCED OPTION THAT SHOULD ONLY BE USED TO FIX METADATA THAT IS
1357 KNOWN TO BE BAD. This should usually be combined with
1358 ``skip_existing_exposures=True``.
1359 track_file_attrs : `bool`, optional
1360 Control whether file attributes such as the size or checksum should
1361 be tracked by the datastore. Whether this parameter is honored
1362 depends on the specific datastore implementation.
1364 Returns
1365 -------
1366 refs : `list` of `lsst.daf.butler.DatasetRef`
1367 Dataset references for ingested raws.
1369 Notes
1370 -----
1371 This method inserts all datasets for an exposure within a transaction,
1372 guaranteeing that partial exposures are never ingested. The exposure
1373 dimension record is inserted with `Registry.syncDimensionData` first
1374 (in its own transaction), which inserts only if a record with the same
1375 primary key does not already exist. This allows different files within
1376 the same exposure to be ingested in different runs.
1377 """
1378 refs = []
1379 bad_files = []
1380 n_exposures = 0
1381 n_exposures_failed = 0
1382 n_ingests_failed = 0
1383 if group_files:
1384 for group in ResourcePath.findFileResources(files, file_filter, group_files):
1385 new_refs, bad, n_exp, n_exp_fail, n_ingest_fail = self.ingestFiles(
1386 group,
1387 pool=pool,
1388 processes=processes,
1389 run=run,
1390 skip_existing_exposures=skip_existing_exposures,
1391 update_exposure_records=update_exposure_records,
1392 track_file_attrs=track_file_attrs,
1393 )
1394 refs.extend(new_refs)
1395 bad_files.extend(bad)
1396 n_exposures += n_exp
1397 n_exposures_failed += n_exp_fail
1398 n_ingests_failed += n_ingest_fail
1399 else:
1400 refs, bad_files, n_exposures, n_exposures_failed, n_ingests_failed = self.ingestFiles(
1401 ResourcePath.findFileResources(files, file_filter, group_files),
1402 pool=pool,
1403 processes=processes,
1404 run=run,
1405 skip_existing_exposures=skip_existing_exposures,
1406 update_exposure_records=update_exposure_records,
1407 )
1409 had_failure = False
1411 if bad_files:
1412 had_failure = True
1413 self.log.warning("Could not extract observation metadata from the following:")
1414 for f in bad_files:
1415 self.log.warning("- %s", f)
1417 self.log.info(
1418 "Successfully processed data from %d exposure%s with %d failure%s from exposure"
1419 " registration and %d failure%s from file ingest.",
1420 *_log_msg_counter(n_exposures),
1421 *_log_msg_counter(n_exposures_failed),
1422 *_log_msg_counter(n_ingests_failed),
1423 )
1424 if n_exposures_failed > 0 or n_ingests_failed > 0:
1425 had_failure = True
1426 self.log.info("Ingested %d distinct Butler dataset%s", *_log_msg_counter(refs))
1428 if had_failure:
1429 raise RuntimeError("Some failures encountered during ingestion")
1431 return refs
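# Illustrative driver sketch (not part of the original module): construct the
# task against a writeable butler and ingest raw files. The repository path,
# input location, run collection name, and callback are placeholders.
def _example_driver() -> None:
    def _report_success(datasets: list[FileDataset]) -> None:
        # Invoked once per fully ingested exposure.
        print(f"Ingested {len(datasets)} file(s) for one exposure")

    butler = Butler("/path/to/repo", writeable=True)
    config = RawIngestConfig()
    config.transfer = "direct"
    task = RawIngestTask(config=config, butler=butler, on_success=_report_success)
    task.run(["/path/to/raw/files"], run="HSC/raw/example")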