Coverage for python/lsst/obs/base/ingest.py: 16%
373 statements
1# This file is part of obs_base.
2#
3# Developed for the LSST Data Management System.
4# This product includes software developed by the LSST Project
5# (https://www.lsst.org).
6# See the COPYRIGHT file at the top-level directory of this distribution
7# for details of code ownership.
8#
9# This program is free software: you can redistribute it and/or modify
10# it under the terms of the GNU General Public License as published by
11# the Free Software Foundation, either version 3 of the License, or
12# (at your option) any later version.
13#
14# This program is distributed in the hope that it will be useful,
15# but WITHOUT ANY WARRANTY; without even the implied warranty of
16# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
17# GNU General Public License for more details.
18#
19# You should have received a copy of the GNU General Public License
20# along with this program. If not, see <http://www.gnu.org/licenses/>.
23__all__ = ("RawIngestTask", "RawIngestConfig", "makeTransferChoiceField")
25import json
26import re
27from collections import defaultdict
28from collections.abc import Callable, Iterable, Iterator, MutableMapping, Sized
29from dataclasses import InitVar, dataclass
30from multiprocessing import Pool
31from typing import Any, ClassVar
33from astro_metadata_translator import MetadataTranslator, ObservationInfo, merge_headers
34from astro_metadata_translator.indexing import process_index_data, process_sidecar_data
35from lsst.afw.fits import readMetadata
36from lsst.daf.butler import (
37 Butler,
38 CollectionType,
39 DataCoordinate,
40 DatasetIdGenEnum,
41 DatasetRef,
42 DatasetType,
43 DimensionRecord,
44 DimensionUniverse,
45 FileDataset,
46 Formatter,
47 Progress,
48 Timespan,
49)
50from lsst.pex.config import ChoiceField, Config, Field
51from lsst.pipe.base import Instrument, Task
52from lsst.resources import ResourcePath, ResourcePathExpression
53from lsst.utils.timer import timeMethod
55from ._instrument import makeExposureRecordFromObsInfo
57# multiprocessing.Pool is actually a function, not a type, and the real type
58# isn't exposed, so we can't use it in annotations; we'll just punt on it
59# via this alias instead.
60PoolType = Any
63def _do_nothing(*args: Any, **kwargs: Any) -> None:
64 """Do nothing.
66 This is a function that accepts anything and does nothing.
67 For use as a default in callback arguments.
68 """
69 pass
72def _log_msg_counter(noun: int | Sized) -> tuple[int, str]:
73 """Count the iterable and return the count and plural modifier.
75 Parameters
76 ----------
77 noun : `Sized` or `int`
78 Thing to count. If given an integer it is assumed to be the count
79 to use to calculate modifier.
81 Returns
82 -------
83 num : `int`
84 Number of items found in ``noun``.
85 modifier : `str`
86 Character to add to the end of a string referring to these items
87 to indicate whether it was a single item or not. Returns empty
88 string if there is one item or "s" otherwise.
90 Examples
91 --------
92 .. code-block:: python
94 log.warning("Found %d file%s", *_log_msg_counter(nfiles))
95 """
96 if isinstance(noun, int):
97 num = noun
98 else:
99 num = len(noun)
100 return num, "" if num == 1 else "s"
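# Illustrative use of the helper above (the ``log`` object is hypothetical):
# the returned pair feeds %-style formatting so singular/plural messages come
# out right.
#
#     log.info("Found %d file%s", *_log_msg_counter(["a.fits", "b.fits"]))
#     # -> "Found 2 files"
#     log.info("Found %d file%s", *_log_msg_counter(1))
#     # -> "Found 1 file"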
103@dataclass
104class RawFileDatasetInfo:
105 """Information about a single dataset within a raw file."""
107 dataId: DataCoordinate
108 """Data ID for this file (`lsst.daf.butler.DataCoordinate`)."""
110 obsInfo: ObservationInfo
111 """Standardized observation metadata extracted directly from the file
112 headers (`astro_metadata_translator.ObservationInfo`).
113 """
116@dataclass
117class RawFileData:
118 """Information about a single raw file, used during ingest."""
120 datasets: list[RawFileDatasetInfo]
121 """The information describing each dataset within this raw file.
122 (`list` of `RawFileDatasetInfo`)
123 """
125 filename: ResourcePath
126 """URI of the file this information was extracted from (`str`).
128 This is the path prior to ingest, not the path after ingest.
129 """
131 FormatterClass: type[Formatter]
132 """Formatter class that should be used to ingest this file (`type`; as
133 subclass of `~lsst.daf.butler.Formatter`).
134 """
136 instrument: Instrument | None
137 """The `Instrument` instance associated with this file. Can be `None`
138 if ``datasets`` is an empty list."""
141@dataclass
142class RawExposureData:
143 """Information about a complete raw exposure, used during ingest."""
145 dataId: DataCoordinate
146 """Data ID for this exposure (`lsst.daf.butler.DataCoordinate`).
147 """
149 files: list[RawFileData]
150 """List of structures containing file-level information.
151 """
153 universe: InitVar[DimensionUniverse]
154 """Set of all known dimensions.
155 """
157 record: DimensionRecord
158 """The exposure `DimensionRecord` that must be inserted into the
159 `~lsst.daf.butler.Registry` prior to file-level ingest
160 (`~lsst.daf.butler.DimensionRecord`).
161 """
163 dependencyRecords: dict[str, DimensionRecord]
164 """Additional records that must be inserted into the
165 `~lsst.daf.butler.Registry` prior to ingesting the exposure ``record``
166 (e.g., to satisfy foreign key constraints), indexed by the dimension name.
167 """
170def makeTransferChoiceField(
171 doc: str = "How to transfer files (None for no transfer).", default: str = "auto"
172) -> ChoiceField:
173 """Create a Config field with options for transferring data between repos.
175 The allowed options for the field are exactly those supported by
176 `lsst.daf.butler.Datastore.ingest`.
178 Parameters
179 ----------
180 doc : `str`
181 Documentation for the configuration field.
182 default : `str`, optional
183 Default transfer mode for the field.
185 Returns
186 -------
187 field : `lsst.pex.config.ChoiceField`
188 Configuration field.
189 """
190 return ChoiceField(
191 doc=doc,
192 dtype=str,
193 allowed={
194 "move": "move",
195 "copy": "copy",
196 "auto": "choice will depend on datastore",
197 "direct": "use URI to ingested file directly in datastore",
198 "link": "hard link falling back to symbolic link",
199 "hardlink": "hard link",
200 "symlink": "symbolic (soft) link",
201 "relsymlink": "relative symbolic link",
202 },
203 optional=True,
204 default=default,
205 )
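# Sketch of using this factory in a pex_config Config of your own; the class
# name is hypothetical (RawIngestConfig below is the in-module use).
#
#     class MyIngestConfig(Config):
#         transfer = makeTransferChoiceField(default="direct")
#
#     config = MyIngestConfig()
#     config.transfer = "symlink"  # must be one of the allowed choices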
208class RawIngestConfig(Config):
209 """Configuration class for RawIngestTask."""
211 transfer = makeTransferChoiceField()
212 failFast: Field[bool] = Field(
213 dtype=bool,
214 default=False,
215 doc="If True, stop ingest as soon as any problem is encountered with any file. "
216 "Otherwise problem files will be skipped and logged and a report issued at completion.",
217 )
220class RawIngestTask(Task):
221 """Driver Task for ingesting raw data into Gen3 Butler repositories.
223 Parameters
224 ----------
225 config : `RawIngestConfig`
226 Configuration for the task.
227 butler : `~lsst.daf.butler.Butler`
228 Writeable butler instance, with ``butler.run`` set to the appropriate
229 `~lsst.daf.butler.CollectionType.RUN` collection for these raw
230 datasets.
231 on_success : `Callable`, optional
232 A callback invoked when all of the raws associated with an exposure
233 are ingested. Will be passed a list of `FileDataset` objects, each
234 containing one or more resolved `DatasetRef` objects. If this callback
235 raises it will interrupt the entire ingest process, even if
236 `RawIngestConfig.failFast` is `False`.
237 on_metadata_failure : `Callable`, optional
238 A callback invoked when a failure occurs trying to translate the
239 metadata for a file. Will be passed the URI and the exception, in
240 that order, as positional arguments. Guaranteed to be called in an
241 ``except`` block, allowing the callback to re-raise or replace (with
242 ``raise ... from``) to override the task's usual error handling (before
243 `RawIngestConfig.failFast` logic occurs).
244 on_ingest_failure : `Callable`, optional
245 A callback invoked when dimension record or dataset insertion into the
246 database fails for an exposure. Will be passed a `RawExposureData`
247 instance and the exception, in that order, as positional arguments.
248 Guaranteed to be called in an ``except`` block, allowing the callback
249 to re-raise or replace (with ``raise ... from``) to override the task's
250 usual error handling (before `RawIngestConfig.failFast` logic occurs).
251 **kwargs
252 Additional keyword arguments are forwarded to the `lsst.pipe.base.Task`
253 constructor.
255 Notes
256 -----
257 Each instance of `RawIngestTask` writes to the same Butler. Each
258 invocation of `RawIngestTask.run` ingests a list of files.
259 """
261 ConfigClass: ClassVar[type[Config]] = RawIngestConfig
263 _DefaultName: ClassVar[str] = "ingest"
265 def getDatasetType(self) -> DatasetType:
266 """Return the default DatasetType of the datasets ingested by this
267 Task.
269 Returns
270 -------
271 datasetType : `DatasetType`
272 The default dataset type to use for the data being ingested. This
273 is only used if the relevant `~lsst.pipe.base.Instrument` does not
274 define an override.
275 """
276 return DatasetType(
277 "raw",
278 ("instrument", "detector", "exposure"),
279 "Exposure",
280 universe=self.butler.dimensions,
281 )
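# Instruments usually override the raw dataset type via their
# ``raw_definition`` attribute (see ``ingestFiles`` below), but a subclass can
# also override this method directly. A minimal sketch, assuming a
# hypothetical dataset type name:
#
#     class MyCameraRawIngestTask(RawIngestTask):
#         def getDatasetType(self) -> DatasetType:
#             return DatasetType(
#                 "raw_special",  # hypothetical name
#                 ("instrument", "detector", "exposure"),
#                 "Exposure",
#                 universe=self.butler.dimensions,
#             )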
283 # Mypy cannot determine that the config passed to super() is this type.
284 config: RawIngestConfig
286 def __init__(
287 self,
288 config: RawIngestConfig,
289 *,
290 butler: Butler,
291 on_success: Callable[[list[FileDataset]], Any] = _do_nothing,
292 on_metadata_failure: Callable[[ResourcePath, Exception], Any] = _do_nothing,
293 on_ingest_failure: Callable[[RawExposureData, Exception], Any] = _do_nothing,
294 **kwargs: Any,
295 ):
296 config.validate() # Not a CmdlineTask nor PipelineTask, so have to validate the config here.
297 super().__init__(config, **kwargs)
298 self.butler = butler
299 self.universe = self.butler.dimensions
300 self.datasetType = self.getDatasetType()
301 self._on_success = on_success
302 self._on_metadata_failure = on_metadata_failure
303 self._on_ingest_failure = on_ingest_failure
304 self.progress = Progress("obs.base.RawIngestTask")
306 # Import all the instrument classes so that we ensure that we
307 # have all the relevant metadata translators loaded.
308 Instrument.importAll(self.butler.registry)
310 # Read all the instrument records into a cache since they will be
311 # needed later to calculate day_obs timespans, if appropriate.
312 self._instrument_records = {
313 rec.name: rec for rec in butler.registry.queryDimensionRecords("instrument")
314 }
316 def _reduce_kwargs(self) -> dict[str, Any]:
317 # Add extra parameters to pickle.
318 return dict(
319 **super()._reduce_kwargs(),
320 butler=self.butler,
321 on_success=self._on_success,
322 on_metadata_failure=self._on_metadata_failure,
323 on_ingest_failure=self._on_ingest_failure,
324 )
326 def _determine_instrument_formatter(
327 self, dataId: DataCoordinate, filename: ResourcePath
328 ) -> tuple[Instrument | None, type[Formatter]]:
329 """Determine the instrument and formatter class.
331 Parameters
332 ----------
333 dataId : `lsst.daf.butler.DataCoordinate`
334 The dataId associated with this dataset.
335 filename : `lsst.resources.ResourcePath`
336 URI of file used for error reporting.
338 Returns
339 -------
340 instrument : `Instrument` or `None`
341 Instance of the `Instrument` associated with this dataset. `None`
342 indicates that the instrument could not be determined.
343 formatterClass : `type`
344 Class to be used as the formatter for this dataset.
345 """
346 # The data model currently assumes that whilst multiple datasets
347 # can be associated with a single file, they must all share the
348 # same formatter.
349 try:
350 instrument = Instrument.fromName(dataId["instrument"], self.butler.registry) # type: ignore
351 except LookupError as e:
352 self._on_metadata_failure(filename, e)
353 self.log.warning(
354 "Instrument %s for file %s not known to registry", dataId["instrument"], filename
355 )
356 if self.config.failFast:
357 raise RuntimeError(
358 f"Instrument {dataId['instrument']} for file {filename} not known to registry"
359 ) from e
360 FormatterClass = Formatter
361 # Indicate that we could not work out the instrument.
362 instrument = None
363 else:
364 assert instrument is not None, "Should be guaranteed by fromName succeeding."
365 FormatterClass = instrument.getRawFormatter(dataId)
366 return instrument, FormatterClass
368 def extractMetadata(self, filename: ResourcePath) -> RawFileData:
369 """Extract and process metadata from a single raw file.
371 Parameters
372 ----------
373 filename : `lsst.resources.ResourcePath`
374 URI to the file.
376 Returns
377 -------
378 data : `RawFileData`
379 A structure containing the metadata extracted from the file,
380 as well as the original filename. All fields will be populated,
381 but the `RawFileData.dataId` attribute will be a minimal
382 (unexpanded) `~lsst.daf.butler.DataCoordinate` instance. The
383 ``instrument`` field will be `None` if there is a problem
384 with metadata extraction.
386 Notes
387 -----
388 Assumes that there is a single dataset associated with the given
389 file. Instruments using a single file to store multiple datasets
390 must implement their own version of this method.
392 By default the method will catch all exceptions unless the ``failFast``
393 configuration item is `True`. If an error is encountered the
394 `_on_metadata_failure()` method will be called. If that callback
395 does not raise, the returned object will have a `None` instrument
396 and no datasets.
398 This method supports sidecar JSON files which can be used to
399 extract metadata without having to read the data file itself.
400 The sidecar file is always used if found.
401 """
402 sidecar_fail_msg = "" # Requires prepended space when set.
403 try:
404 sidecar_file = filename.updatedExtension(".json")
405 if sidecar_file.exists():
406 content = json.loads(sidecar_file.read())
407 headers = [process_sidecar_data(content)]
408 sidecar_fail_msg = " (via sidecar)"
409 else:
410 # Read the metadata from the data file itself.
412 # For remote files download the entire file to get the
413 # header. This is very inefficient and it would be better
414 # to have some way of knowing where in the file the headers
415 # are and to only download those parts of the file.
416 with filename.as_local() as local_file:
417 # Read the primary. This might be sufficient.
418 header = readMetadata(local_file.ospath, 0)
419 translator_class = None
421 try:
422 # Try to work out a translator class early.
423 translator_class = MetadataTranslator.determine_translator(
424 header, filename=str(filename)
425 )
426 except ValueError:
427 # Primary header was not sufficient (maybe this file
428 # has been compressed or is a MEF with minimal
429 # primary). Read second header and merge with primary.
430 header = merge_headers([header, readMetadata(local_file.ospath, 1)], mode="overwrite")
432 # Try again to work out a translator class, letting this
433 # fail.
434 if translator_class is None:
435 translator_class = MetadataTranslator.determine_translator(
436 header, filename=str(filename)
437 )
439 # Request the headers to use for ingest
440 headers = list(translator_class.determine_translatable_headers(local_file.ospath, header))
442 # Add each header to the dataset list
443 datasets = [self._calculate_dataset_info(h, filename) for h in headers]
445 except Exception as e:
446 self.log.debug("Problem extracting metadata from %s%s: %s", filename, sidecar_fail_msg, e)
447 # Indicate to the caller that we failed to read.
448 datasets = []
449 formatterClass = Formatter
450 instrument = None
451 self._on_metadata_failure(filename, e)
452 if self.config.failFast:
453 raise RuntimeError(
454 f"Problem extracting metadata for file {filename}{sidecar_fail_msg}"
455 ) from e
456 else:
457 self.log.debug("Extracted metadata for file %s%s", filename, sidecar_fail_msg)
458 # The data model currently assumes that whilst multiple datasets
459 # can be associated with a single file, they must all share the
460 # same formatter.
461 instrument, formatterClass = self._determine_instrument_formatter(datasets[0].dataId, filename)
462 if instrument is None:
463 datasets = []
465 return RawFileData(
466 datasets=datasets,
467 filename=filename,
468 # MyPy wants this to be a non-abstract class, which is not true
469 # for the error case where instrument is None and datasets=[].
470 FormatterClass=formatterClass, # type: ignore
471 instrument=instrument,
472 )
474 @classmethod
475 def getObservationInfoSubsets(cls) -> tuple[set, set]:
476 """Return subsets of fields in the `ObservationInfo` that we care
477 about.
479 These fields will be used in constructing an exposure record.
481 Returns
482 -------
483 required : `set`
484 Set of `ObservationInfo` field names that are required.
485 optional : `set`
486 Set of `ObservationInfo` field names we will use if they are
487 available.
488 """
489 # Marking the new properties "group_counter_*" and
490 # "has_simulated_content" as required, assumes that we either
491 # recreate any existing index/sidecar files that include translated
492 # values, or else allow astro_metadata_translator to fill in
493 # defaults.
494 required = {
495 "datetime_begin",
496 "datetime_end",
497 "detector_num",
498 "exposure_group",
499 "exposure_id",
500 "exposure_time",
501 "group_counter_end",
502 "group_counter_start",
503 "has_simulated_content",
504 "instrument",
505 "observation_id",
506 "observation_type",
507 "observing_day",
508 "physical_filter",
509 }
510 optional = {
511 "altaz_begin",
512 "boresight_rotation_coord",
513 "boresight_rotation_angle",
514 "dark_time",
515 "tracking_radec",
516 "object",
517 "observation_counter",
518 "observation_reason",
519 "observing_day_offset",
520 "science_program",
521 "visit_id",
522 }
523 return required, optional
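# A subclass whose instrument needs extra ObservationInfo properties can
# extend these sets by delegating to the base implementation. A minimal
# sketch; the extra property name is hypothetical:
#
#     class MyCameraRawIngestTask(RawIngestTask):
#         @classmethod
#         def getObservationInfoSubsets(cls) -> tuple[set, set]:
#             required, optional = super().getObservationInfoSubsets()
#             optional |= {"focus_z"}  # hypothetical extra property
#             return required, optional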
525 def _calculate_dataset_info(
526 self, header: MutableMapping[str, Any] | ObservationInfo, filename: ResourcePath
527 ) -> RawFileDatasetInfo:
528 """Calculate a RawFileDatasetInfo from the supplied information.
530 Parameters
531 ----------
532 header : Mapping or `astro_metadata_translator.ObservationInfo`
533 Header from the dataset or previously-translated content.
534 filename : `lsst.resources.ResourcePath`
535 Filename to use for error messages.
537 Returns
538 -------
539 dataset : `RawFileDatasetInfo`
540 The dataId, and observation information associated with this
541 dataset.
542 """
543 required, optional = self.getObservationInfoSubsets()
544 if isinstance(header, ObservationInfo):
545 obsInfo = header
546 missing = []
547 # Need to check the required properties are present.
548 for property in required:
549 # getattr does not need to be protected because it is using
550 # the defined list above containing properties that must exist.
551 value = getattr(obsInfo, property)
552 if value is None:
553 missing.append(property)
554 if missing:
555 raise ValueError(
556 f"Requested required properties are missing from file {filename}: {missing} (via JSON)"
557 )
559 else:
560 obsInfo = ObservationInfo(
561 header,
562 pedantic=False,
563 filename=str(filename),
564 required=required,
565 subset=required | optional,
566 )
568 dataId = DataCoordinate.standardize(
569 instrument=obsInfo.instrument,
570 exposure=obsInfo.exposure_id,
571 detector=obsInfo.detector_num,
572 universe=self.universe,
573 )
574 return RawFileDatasetInfo(obsInfo=obsInfo, dataId=dataId)
576 def locateAndReadIndexFiles(
577 self, files: Iterable[ResourcePath]
578 ) -> tuple[dict[ResourcePath, Any], list[ResourcePath], set[ResourcePath], set[ResourcePath]]:
579 """Given a list of files, look for index files and read them.
581 Index files can either be explicitly in the list of files to
582 ingest, or else located in the same directory as a file to ingest.
583 Index entries are always used if present.
585 Parameters
586 ----------
587 files : iterable over `lsst.resources.ResourcePath`
588 URIs to the files to be ingested.
590 Returns
591 -------
592 index : `dict` [`ResourcePath`, Any]
593 Merged contents of all relevant index files found. These can
594 be explicitly specified index files or ones found in the
595 directory alongside a data file to be ingested.
596 updated_files : `list` of `ResourcePath`
597 Updated list of the input files with entries removed that were
598 found listed in an index file. Order is not guaranteed to
599 match the order of the files given to this routine.
600 good_index_files : `set` [ `ResourcePath` ]
601 Index files that were successfully read.
602 bad_index_files : `set` [ `ResourcePath` ]
603 Files that looked like index files but failed to read properly.
604 """
605 # Convert the paths to absolute for easy comparison with index content.
606 # Do not convert to real paths since we have to assume that index
607 # files are in this location and not in the location a link points to.
608 files = tuple(f.abspath() for f in files)
610 # Index files must be named this.
611 index_root_file = "_index.json"
613 # Group the files by directory.
614 files_by_directory = defaultdict(set)
616 for path in files:
617 directory, file_in_dir = path.split()
618 files_by_directory[directory].add(file_in_dir)
620 # All the metadata read from index files with keys of full path.
621 index_entries: dict[ResourcePath, Any] = {}
623 # Index files we failed to read.
624 bad_index_files = set()
626 # Any good index files that were found and used.
627 good_index_files = set()
629 # Look for index files in those directories.
630 for directory, files_in_directory in files_by_directory.items():
631 possible_index_file = directory.join(index_root_file)
632 if possible_index_file.exists():
633 # If we are explicitly requesting an index file the
634 # messages should be different.
635 index_msg = "inferred"
636 is_implied = True
637 if index_root_file in files_in_directory:
638 index_msg = "explicit"
639 is_implied = False
641 # Try to read the index file and catch and report any
642 # problems.
643 try:
644 content = json.loads(possible_index_file.read())
645 index = process_index_data(content, force_dict=True)
646 # mypy should in theory know that this is a mapping
647 # from the overload type annotation of process_index_data.
648 assert isinstance(index, MutableMapping)
649 except Exception as e:
650 # Only trigger the callback if the index file
651 # was asked for explicitly. Triggering on implied file
652 # might be surprising.
653 if not is_implied:
654 self._on_metadata_failure(possible_index_file, e)
655 if self.config.failFast:
656 raise RuntimeError(
657 f"Problem reading index file from {index_msg} location {possible_index_file}"
658 ) from e
659 bad_index_files.add(possible_index_file)
660 continue
662 self.log.debug("Extracted index metadata from %s file %s", index_msg, possible_index_file)
663 good_index_files.add(possible_index_file)
665 # Go through the index adding entries for files.
666 # If we have non-index files in this directory marked for
667 # ingest we should only get index information for those.
668 # If the index file was explicit we use all entries.
669 if is_implied:
670 files_to_ingest = files_in_directory
671 else:
672 files_to_ingest = set(index)
674 # Copy relevant metadata into a single dict for all index
675 # entries.
676 for file_in_dir in files_to_ingest:
677 # Skip an explicitly specified index file.
678 # This should never happen because an explicit index
679 # file will force ingest of all files in the index
680 # and not use the explicit file list. If somehow
681 # this is not true we continue. Raising an exception
682 # seems like the wrong thing to do since this is harmless.
683 if file_in_dir == index_root_file:
684 self.log.info(
685 "Logic error found scanning directory %s. Please file ticket.", directory
686 )
687 continue
688 if file_in_dir in index:
689 file = directory.join(file_in_dir)
690 if file in index_entries:
691 # ObservationInfo overrides raw metadata
692 if isinstance(index[file_in_dir], ObservationInfo) and not isinstance(
693 index_entries[file], ObservationInfo
694 ):
695 self.log.warning(
696 "File %s already specified in an index file but overriding"
697 " with ObservationInfo content from %s",
698 file,
699 possible_index_file,
700 )
701 else:
702 self.log.warning(
703 "File %s already specified in an index file, ignoring content from %s",
704 file,
705 possible_index_file,
706 )
707 # Do nothing in this case
708 continue
710 index_entries[file] = index[file_in_dir]
712 # Remove files from list that have index entries and also
713 # any files that we determined to be explicit index files
714 # or any index files that we failed to read.
715 filtered = set(files) - set(index_entries) - good_index_files - bad_index_files
717 # The filtered list loses the initial order. Retaining the order
718 # is good for testing but does have a cost if there are many
719 # files when copying the good values out. A dict would have faster
720 # lookups (using the files as keys) but use more memory.
721 ordered = [f for f in filtered if f in files]
723 return index_entries, ordered, good_index_files, bad_index_files
725 def processIndexEntries(self, index_entries: dict[ResourcePath, Any]) -> list[RawFileData]:
726 """Convert index entries to RawFileData.
728 Parameters
729 ----------
730 index_entries : `dict` [`ResourcePath`, Any]
731 Dict indexed by name of file to ingest, with values that are
732 either raw metadata or translated
733 `~astro_metadata_translator.ObservationInfo`.
735 Returns
736 -------
737 data : `list` [ `RawFileData` ]
738 Structures containing the metadata extracted from the file,
739 as well as the original filename. All fields will be populated,
740 but the `RawFileData.dataId` attributes will be minimal
741 (unexpanded) `~lsst.daf.butler.DataCoordinate` instances.
742 """
743 fileData = []
744 for filename, metadata in index_entries.items():
745 try:
746 datasets = [self._calculate_dataset_info(metadata, filename)]
747 except Exception as e:
748 self.log.debug("Problem extracting metadata for file %s found in index file: %s", filename, e)
749 datasets = []
750 formatterClass = Formatter
751 instrument = None
752 self._on_metadata_failure(filename, e)
753 if self.config.failFast:
754 raise RuntimeError(
755 f"Problem extracting metadata for file {filename} found in index file"
756 ) from e
757 else:
758 instrument, formatterClass = self._determine_instrument_formatter(
759 datasets[0].dataId, filename
760 )
761 if instrument is None:
762 datasets = []
763 fileData.append(
764 RawFileData(
765 datasets=datasets,
766 filename=filename,
767 # MyPy wants this to be a non-abstract class, which is not
768 # true for the error case where instrument is None and
769 # datasets=[].
770 FormatterClass=formatterClass, # type: ignore
771 instrument=instrument,
772 )
773 )
774 return fileData
776 def groupByExposure(self, files: Iterable[RawFileData]) -> list[RawExposureData]:
777 """Group an iterable of `RawFileData` by exposure.
779 Parameters
780 ----------
781 files : iterable of `RawFileData`
782 File-level information to group.
784 Returns
785 -------
786 exposures : `list` of `RawExposureData`
787 A list of structures that group the file-level information by
788 exposure. All fields will be populated. The
789 `RawExposureData.dataId` attributes will be minimal (unexpanded)
790 `~lsst.daf.butler.DataCoordinate` instances.
791 """
792 exposureDimensions = self.universe["exposure"].graph
793 byExposure = defaultdict(list)
794 for f in files:
795 # Assume that the first dataset is representative for the file.
796 byExposure[f.datasets[0].dataId.subset(exposureDimensions)].append(f)
798 return [
799 RawExposureData(
800 dataId=dataId,
801 files=exposureFiles,
802 universe=self.universe,
803 record=self.makeExposureRecord(exposureFiles[0].datasets[0].obsInfo, self.universe),
804 dependencyRecords=self.makeDependencyRecords(
805 exposureFiles[0].datasets[0].obsInfo, self.universe
806 ),
807 )
808 for dataId, exposureFiles in byExposure.items()
809 ]
811 def makeExposureRecord(
812 self, obsInfo: ObservationInfo, universe: DimensionUniverse, **kwargs: Any
813 ) -> DimensionRecord:
814 """Construct a registry record for an exposure.
816 This is a method that subclasses will often want to customize. This can
817 often be done by calling this base class implementation with additional
818 ``kwargs``.
820 Parameters
821 ----------
822 obsInfo : `ObservationInfo`
823 Observation details for (one of the components of) the exposure.
824 universe : `DimensionUniverse`
825 Set of all known dimensions.
826 **kwargs
827 Additional field values for this record.
829 Returns
830 -------
831 record : `DimensionRecord`
832 The exposure record that must be inserted into the
833 `~lsst.daf.butler.Registry` prior to file-level ingest.
834 """
835 return makeExposureRecordFromObsInfo(obsInfo, universe, **kwargs)
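# As noted in the docstring, subclasses usually customize this by forwarding
# extra field values to the base implementation. A minimal sketch; the extra
# exposure field and the way it is derived are assumptions:
#
#     class MyCameraRawIngestTask(RawIngestTask):
#         def makeExposureRecord(
#             self, obsInfo: ObservationInfo, universe: DimensionUniverse, **kwargs: Any
#         ) -> DimensionRecord:
#             # Hypothetical field; it must exist in the exposure dimension schema.
#             kwargs["can_see_sky"] = obsInfo.observation_type == "science"
#             return super().makeExposureRecord(obsInfo, universe, **kwargs)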
837 def makeDependencyRecords(
838 self, obsInfo: ObservationInfo, universe: DimensionUniverse
839 ) -> dict[str, DimensionRecord]:
840 """Construct dependency records.
842 These dependency records will be inserted into the
843 `~lsst.daf.butler.Registry` before the exposure records, because they
844 are dependencies of the exposure. This allows an opportunity to satisfy
845 foreign key constraints that exist because of dimensions related to the
846 exposure.
848 This is a method that subclasses may want to customize, if they've
849 added dimensions that relate to an exposure.
851 Parameters
852 ----------
853 obsInfo : `ObservationInfo`
854 Observation details for (one of the components of) the exposure.
855 universe : `DimensionUniverse`
856 Set of all known dimensions.
858 Returns
859 -------
860 records : `dict` [`str`, `DimensionRecord`]
861 The records to insert, indexed by dimension name.
862 """
863 records: dict[str, DimensionRecord] = {}
864 if "exposure" not in universe:
865 return records
866 exposure = universe["exposure"]
867 if "group" in exposure.implied:
868 records["group"] = universe["group"].RecordClass(
869 name=obsInfo.exposure_group,
870 instrument=obsInfo.instrument,
871 )
872 if "day_obs" in exposure.implied:
873 if (offset := getattr(obsInfo, "observing_day_offset")) is not None:
874 offset_int = round(offset.to_value("s"))
875 timespan = Timespan.from_day_obs(obsInfo.observing_day, offset_int)
876 else:
877 timespan = None
878 records["day_obs"] = universe["day_obs"].RecordClass(
879 instrument=obsInfo.instrument,
880 id=obsInfo.observing_day,
881 timespan=timespan,
882 )
883 return records
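# A subclass that has added an exposure-related dimension can extend this
# method in the same way. A minimal sketch; the "weather_station" dimension
# and its fields are hypothetical and must exist in the dimension universe:
#
#     class MyCameraRawIngestTask(RawIngestTask):
#         def makeDependencyRecords(
#             self, obsInfo: ObservationInfo, universe: DimensionUniverse
#         ) -> dict[str, DimensionRecord]:
#             records = super().makeDependencyRecords(obsInfo, universe)
#             if "weather_station" in universe:
#                 records["weather_station"] = universe["weather_station"].RecordClass(
#                     instrument=obsInfo.instrument,
#                     name="primary",  # hypothetical record field
#                 )
#             return records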
885 def expandDataIds(self, data: RawExposureData) -> RawExposureData:
886 """Expand the data IDs associated with a raw exposure.
888 This adds the metadata records.
890 Parameters
891 ----------
892 data : `RawExposureData`
893 A structure containing information about the exposure to be
894 ingested. Must have `RawExposureData.record` populated. Should
895 be considered consumed upon return.
897 Returns
898 -------
899 exposure : `RawExposureData`
900 An updated version of the input structure, with
901 `RawExposureData.dataId` and nested `RawFileData.dataId` attributes
902 updated to data IDs for which
903 `~lsst.daf.butler.DataCoordinate.hasRecords` returns `True`.
904 """
905 # We start by expanding the exposure-level data ID; we won't use that
906 # directly in file ingest, but this lets us do some database lookups
907 # once per exposure instead of once per file later.
908 data.dataId = self.butler.registry.expandDataId(
909 data.dataId,
910 # We pass in the records we'll be inserting shortly so they aren't
911 # looked up from the database. We do expect instrument and filter
912 # records to be retrieved from the database here (though the
913 # Registry may cache them so there isn't a lookup every time).
914 records={"exposure": data.record},
915 )
916 # Now we expand the per-file (exposure+detector) data IDs. This time
917 # we pass in the records we just retrieved from the exposure data ID
918 # expansion.
919 for file in data.files:
920 for dataset in file.datasets:
921 dataset.dataId = self.butler.registry.expandDataId(
922 dataset.dataId,
923 records={k: data.dataId.records[k] for k in data.dataId.dimensions.elements},
924 )
925 return data
927 def prep(
928 self, files: Iterable[ResourcePath], *, pool: PoolType | None = None
929 ) -> tuple[Iterator[RawExposureData], list[ResourcePath]]:
930 """Perform all non-database-updating ingest preprocessing steps.
932 Parameters
933 ----------
934 files : iterable over `lsst.resources.ResourcePath`
935 Paths to the files to be ingested. Will be made absolute
936 if they are not already.
937 pool : `multiprocessing.Pool`, optional
938 If not `None`, a process pool with which to parallelize some
939 operations.
941 Returns
942 -------
943 exposures : `Iterator` [ `RawExposureData` ]
944 Data structures containing dimension records, filenames, and data
945 IDs to be ingested (one structure for each exposure).
946 bad_files : `list` of `ResourcePath`
947 List of all the files that could not have metadata extracted.
948 """
949 mapFunc = map if pool is None else pool.imap_unordered
951 def _partition_good_bad(
952 file_data: Iterable[RawFileData],
953 ) -> tuple[list[RawFileData], list[ResourcePath]]:
954 """Filter out bad files and return good with list of bad."""
955 good_files = []
956 bad_files = []
957 for fileDatum in self.progress.wrap(file_data, desc="Reading image metadata"):
958 if not fileDatum.datasets:
959 bad_files.append(fileDatum.filename)
960 else:
961 good_files.append(fileDatum)
962 return good_files, bad_files
964 # Look for index files and read them.
965 # There should be far fewer index files than data files.
966 index_entries, files, good_index_files, bad_index_files = self.locateAndReadIndexFiles(files)
967 if bad_index_files:
968 self.log.info("Failed to read the following explicitly requested index files:")
969 for bad in sorted(bad_index_files):
970 self.log.info("- %s", bad)
972 # Now convert all the index file entries to standard form for ingest.
973 processed_bad_index_files: list[ResourcePath] = []
974 indexFileData = self.processIndexEntries(index_entries)
975 if indexFileData:
976 indexFileData, processed_bad_index_files = _partition_good_bad(indexFileData)
977 self.log.info(
978 "Successfully extracted metadata for %d file%s found in %d index file%s with %d failure%s",
979 *_log_msg_counter(indexFileData),
980 *_log_msg_counter(good_index_files),
981 *_log_msg_counter(processed_bad_index_files),
982 )
984 # Extract metadata and build per-detector regions.
985 # This could run in a subprocess so collect all output
986 # before looking at failures.
987 fileData: Iterator[RawFileData] = mapFunc(self.extractMetadata, files)
989 # Filter out all the failed reads and store them for later
990 # reporting.
991 good_file_data, bad_files = _partition_good_bad(fileData)
992 self.log.info(
993 "Successfully extracted metadata from %d file%s with %d failure%s",
994 *_log_msg_counter(good_file_data),
995 *_log_msg_counter(bad_files),
996 )
998 # Combine with data from index files.
999 good_file_data.extend(indexFileData)
1000 bad_files.extend(processed_bad_index_files)
1001 bad_files.extend(bad_index_files)
1003 # Use that metadata to group files (and extracted metadata) by
1004 # exposure. Never parallelized because it's intrinsically a gather
1005 # step.
1006 exposureData: list[RawExposureData] = self.groupByExposure(good_file_data)
1008 # The next operation operates on RawExposureData instances (one at
1009 # a time) in-place and then returns the modified instance. We call it
1010 # as a pass-through instead of relying on the arguments we pass in to
1011 # have been modified because in the parallel case those arguments are
1012 # going to be pickled and unpickled, and I'm not certain
1013 # multiprocessing is careful enough with that for output arguments to
1014 # work.
1016 # Expand the data IDs to include all dimension metadata; we need this
1017 # because we may need to generate path templates that rely on that
1018 # metadata.
1019 # This is the first step that involves actual database calls (but just
1020 # SELECTs), so if there's going to be a problem with connections vs.
1021 # multiple processes, or lock contention (in SQLite) slowing things
1022 # down, it'll happen here.
1023 return mapFunc(self.expandDataIds, exposureData), bad_files
1025 def ingestExposureDatasets(
1026 self,
1027 exposure: RawExposureData,
1028 datasetType: DatasetType,
1029 *,
1030 run: str,
1031 skip_existing_exposures: bool = False,
1032 track_file_attrs: bool = True,
1033 ) -> list[FileDataset]:
1034 """Ingest all raw files in one exposure.
1036 Parameters
1037 ----------
1038 exposure : `RawExposureData`
1039 A structure containing information about the exposure to be
1040 ingested. Must have `RawExposureData.record` populated and all
1041 data ID attributes expanded.
1042 datasetType : `DatasetType`
1043 The dataset type associated with this exposure.
1044 run : `str`
1045 Name of a RUN-type collection to write to.
1046 skip_existing_exposures : `bool`, optional
1047 If `True` (`False` is default), skip raws that have already been
1048 ingested (i.e. raws for which we already have a dataset with the
1049 same data ID in the target collection, even if from another file).
1050 Note that this is much slower than just not passing
1051 already-ingested files as inputs, because we still need to read and
1052 process metadata to identify which exposures to search for. It
1053 also will not work reliably if multiple processes are attempting to
1054 ingest raws from the same exposure concurrently, in that different
1055 processes may still attempt to ingest the same raw and conflict,
1056 causing a failure that prevents other raws from the same exposure
1057 from being ingested.
1058 track_file_attrs : `bool`, optional
1059 Control whether file attributes such as the size or checksum should
1060 be tracked by the datastore. Whether this parameter is honored
1061 depends on the specific datastore implementation.
1063 Returns
1064 -------
1065 datasets : `list` of `lsst.daf.butler.FileDataset`
1066 Per-file structures identifying the files ingested and their
1067 dataset representation in the data repository.
1068 """
1069 if skip_existing_exposures:
1070 existing = {
1071 ref.dataId
1072 for ref in self.butler.registry.queryDatasets(
1073 datasetType,
1074 collections=[run],
1075 dataId=exposure.dataId,
1076 )
1077 }
1078 else:
1079 existing = set()
1081 # Raw files are preferentially ingested using a UUID derived from
1082 # the collection name and dataId.
1083 if self.butler.registry.supportsIdGenerationMode(DatasetIdGenEnum.DATAID_TYPE_RUN):
1084 mode = DatasetIdGenEnum.DATAID_TYPE_RUN
1085 else:
1086 mode = DatasetIdGenEnum.UNIQUE
1088 datasets = []
1089 for file in exposure.files:
1090 refs = [
1091 DatasetRef(datasetType, d.dataId, run=run, id_generation_mode=mode)
1092 for d in file.datasets
1093 if d.dataId not in existing
1094 ]
1095 if refs:
1096 datasets.append(
1097 FileDataset(path=file.filename.abspath(), refs=refs, formatter=file.FormatterClass)
1098 )
1100 self.butler.ingest(
1101 *datasets,
1102 transfer=self.config.transfer,
1103 record_validation_info=track_file_attrs,
1104 )
1105 return datasets
1107 def ingestFiles(
1108 self,
1109 files: Iterable[ResourcePath],
1110 *,
1111 pool: PoolType | None = None,
1112 processes: int = 1,
1113 run: str | None = None,
1114 skip_existing_exposures: bool = False,
1115 update_exposure_records: bool = False,
1116 track_file_attrs: bool = True,
1117 ) -> tuple[list[DatasetRef], list[ResourcePath], int, int, int]:
1118 """Ingest files into a Butler data repository.
1120 This creates any new exposure or visit Dimension entries needed to
1121 identify the ingested files, creates new Dataset entries in the
1122 Registry and finally ingests the files themselves into the Datastore.
1123 Any needed instrument, detector, and physical_filter Dimension entries
1124 must exist in the Registry before this method is called.
1126 Parameters
1127 ----------
1128 files : iterable over `lsst.resources.ResourcePath`
1129 URIs to the files to be ingested.
1130 pool : `multiprocessing.Pool`, optional
1131 If not `None`, a process pool with which to parallelize some
1132 operations.
1133 processes : `int`, optional
1134 The number of processes to use. Ignored if ``pool`` is not `None`.
1135 run : `str`, optional
1136 Name of a RUN-type collection to write to, overriding
1137 the default derived from the instrument name.
1138 skip_existing_exposures : `bool`, optional
1139 If `True` (`False` is default), skip raws that have already been
1140 ingested (i.e. raws for which we already have a dataset with the
1141 same data ID in the target collection, even if from another file).
1142 Note that this is much slower than just not passing
1143 already-ingested files as inputs, because we still need to read and
1144 process metadata to identify which exposures to search for. It
1145 also will not work reliably if multiple processes are attempting to
1146 ingest raws from the same exposure concurrently, in that different
1147 processes may still attempt to ingest the same raw and conflict,
1148 causing a failure that prevents other raws from the same exposure
1149 from being ingested.
1150 update_exposure_records : `bool`, optional
1151 If `True` (`False` is default), update existing exposure records
1152 that conflict with the new ones instead of rejecting them. THIS IS
1153 AN ADVANCED OPTION THAT SHOULD ONLY BE USED TO FIX METADATA THAT IS
1154 KNOWN TO BE BAD. This should usually be combined with
1155 ``skip_existing_exposures=True``.
1156 track_file_attrs : `bool`, optional
1157 Control whether file attributes such as the size or checksum should
1158 be tracked by the datastore. Whether this parameter is honored
1159 depends on the specific datastore implementation.
1161 Returns
1162 -------
1163 refs : `list` of `lsst.daf.butler.DatasetRef`
1164 Dataset references for ingested raws.
1165 bad_files : `list` of `ResourcePath`
1166 Given paths that could not be ingested.
1167 n_exposures : `int`
1168 Number of exposures successfully ingested.
1169 n_exposures_failed : `int`
1170 Number of exposures that failed when inserting dimension data.
1171 n_ingests_failed : `int`
1172 Number of exposures that failed when ingesting raw datasets.
1173 """
1174 created_pool = False
1175 if pool is None and processes > 1:
1176 pool = Pool(processes)
1177 created_pool = True
1179 try:
1180 exposureData, bad_files = self.prep(files, pool=pool)
1181 finally:
1182 if created_pool and pool:
1183 # The pool is not needed any more so close it if we created
1184 # it to ensure we clean up resources.
1185 pool.close()
1186 pool.join()
1188 # Up to this point, we haven't modified the data repository at all.
1189 # Now we finally do that, with one transaction per exposure. This is
1190 # not parallelized at present because the performance of this step is
1191 # limited by the database server. That may or may not change in the
1192 # future once we increase our usage of bulk inserts and reduce our
1193 # usage of savepoints; we've tried to get everything but the database
1194 # operations done in advance to reduce the time spent inside
1195 # transactions.
1196 refs = []
1197 runs = set()
1198 datasetTypes: dict[str, DatasetType] = {}
1199 n_exposures = 0
1200 n_exposures_failed = 0
1201 n_ingests_failed = 0
1202 for exposure in self.progress.wrap(exposureData, desc="Ingesting raw exposures"):
1203 assert exposure.record is not None, "Should be guaranteed by prep()"
1204 self.log.debug(
1205 "Attempting to ingest %d file%s from exposure %s:%s",
1206 *_log_msg_counter(exposure.files),
1207 exposure.record.instrument,
1208 exposure.record.obs_id,
1209 )
1211 try:
1212 for name, record in exposure.dependencyRecords.items():
1213 self.butler.registry.syncDimensionData(name, record, update=update_exposure_records)
1214 inserted_or_updated = self.butler.registry.syncDimensionData(
1215 "exposure",
1216 exposure.record,
1217 update=update_exposure_records,
1218 )
1219 except Exception as e:
1220 self._on_ingest_failure(exposure, e)
1221 n_exposures_failed += 1
1222 self.log.warning(
1223 "Exposure %s:%s could not be registered: %s",
1224 exposure.record.instrument,
1225 exposure.record.obs_id,
1226 e,
1227 )
1228 if self.config.failFast:
1229 raise e
1230 continue
1232 if isinstance(inserted_or_updated, dict):
1233 # Exposure is in the registry and we updated it, so
1234 # syncDimensionData returned a dict.
1235 self.log.info(
1236 "Exposure %s:%s was already present, but columns %s were updated.",
1237 exposure.record.instrument,
1238 exposure.record.obs_id,
1239 str(list(inserted_or_updated.keys())),
1240 )
1242 # Determine the instrument so we can work out the dataset type.
1243 instrument = exposure.files[0].instrument
1244 assert (
1245 instrument is not None
1246 ), "file should have been removed from this list by prep if instrument could not be found"
1248 if raw_definition := getattr(instrument, "raw_definition", None):
1249 datasetTypeName, dimensions, storageClass = raw_definition
1250 if not (datasetType := datasetTypes.get(datasetTypeName)):
1251 datasetType = DatasetType(
1252 datasetTypeName, dimensions, storageClass, universe=self.butler.dimensions
1253 )
1254 else:
1255 datasetType = self.datasetType
1256 if datasetType.name not in datasetTypes:
1257 self.butler.registry.registerDatasetType(datasetType)
1258 datasetTypes[datasetType.name] = datasetType
1260 # Override default run if nothing specified explicitly.
1261 if run is None:
1262 this_run = instrument.makeDefaultRawIngestRunName()
1263 else:
1264 this_run = run
1265 if this_run not in runs:
1266 self.butler.registry.registerCollection(this_run, type=CollectionType.RUN)
1267 runs.add(this_run)
1268 try:
1269 datasets_for_exposure = self.ingestExposureDatasets(
1270 exposure,
1271 datasetType=datasetType,
1272 run=this_run,
1273 skip_existing_exposures=skip_existing_exposures,
1274 track_file_attrs=track_file_attrs,
1275 )
1276 except Exception as e:
1277 self._on_ingest_failure(exposure, e)
1278 n_ingests_failed += 1
1279 self.log.warning("Failed to ingest the following for reason: %s", e)
1280 for f in exposure.files:
1281 self.log.warning("- %s", f.filename)
1282 if self.config.failFast:
1283 raise e
1284 continue
1285 else:
1286 self._on_success(datasets_for_exposure)
1287 for dataset in datasets_for_exposure:
1288 refs.extend(dataset.refs)
1290 # Success for this exposure.
1291 n_exposures += 1
1292 self.log.info(
1293 "Exposure %s:%s ingested successfully", exposure.record.instrument, exposure.record.obs_id
1294 )
1296 return refs, bad_files, n_exposures, n_exposures_failed, n_ingests_failed
1298 @timeMethod
1299 def run(
1300 self,
1301 files: Iterable[ResourcePathExpression],
1302 *,
1303 pool: PoolType | None = None,
1304 processes: int = 1,
1305 run: str | None = None,
1306 file_filter: str | re.Pattern = r"\.fit[s]?\b",
1307 group_files: bool = True,
1308 skip_existing_exposures: bool = False,
1309 update_exposure_records: bool = False,
1310 track_file_attrs: bool = True,
1311 ) -> list[DatasetRef]:
1312 """Ingest files into a Butler data repository.
1314 This creates any new exposure or visit Dimension entries needed to
1315 identify the ingested files, creates new Dataset entries in the
1316 Registry and finally ingests the files themselves into the Datastore.
1317 Any needed instrument, detector, and physical_filter Dimension entries
1318 must exist in the Registry before `run` is called.
1320 Parameters
1321 ----------
1322 files : iterable of `lsst.resources.ResourcePath`, `str`, or path-like
1323 Paths to the files to be ingested. Can refer to directories.
1324 Will be made absolute if they are not already.
1325 pool : `multiprocessing.Pool`, optional
1326 If not `None`, a process pool with which to parallelize some
1327 operations.
1328 processes : `int`, optional
1329 The number of processes to use. Ignored if ``pool`` is not `None`.
1330 run : `str`, optional
1331 Name of a RUN-type collection to write to, overriding
1332 the default derived from the instrument name.
1333 file_filter : `str` or `re.Pattern`, optional
1334 Pattern to use to discover files to ingest within directories.
1335 The default is to search for FITS files. The regex applies to
1336 files within the directory.
1337 group_files : `bool`, optional
1338 Group files by directory if they have been discovered in
1339 directories. Will not affect files explicitly provided.
1340 skip_existing_exposures : `bool`, optional
1341 If `True` (`False` is default), skip raws that have already been
1342 ingested (i.e. raws for which we already have a dataset with the
1343 same data ID in the target collection, even if from another file).
1344 Note that this is much slower than just not passing
1345 already-ingested files as inputs, because we still need to read and
1346 process metadata to identify which exposures to search for. It
1347 also will not work reliably if multiple processes are attempting to
1348 ingest raws from the same exposure concurrently, in that different
1349 processes may still attempt to ingest the same raw and conflict,
1350 causing a failure that prevents other raws from the same exposure
1351 from being ingested.
1352 update_exposure_records : `bool`, optional
1353 If `True` (`False` is default), update existing exposure records
1354 that conflict with the new ones instead of rejecting them. THIS IS
1355 AN ADVANCED OPTION THAT SHOULD ONLY BE USED TO FIX METADATA THAT IS
1356 KNOWN TO BE BAD. This should usually be combined with
1357 ``skip_existing_exposures=True``.
1358 track_file_attrs : `bool`, optional
1359 Control whether file attributes such as the size or checksum should
1360 be tracked by the datastore. Whether this parameter is honored
1361 depends on the specific datastore implementation.
1363 Returns
1364 -------
1365 refs : `list` of `lsst.daf.butler.DatasetRef`
1366 Dataset references for ingested raws.
1368 Notes
1369 -----
1370 This method inserts all datasets for an exposure within a transaction,
1371 guaranteeing that partial exposures are never ingested. The exposure
1372 dimension record is inserted with `Registry.syncDimensionData` first
1373 (in its own transaction), which inserts only if a record with the same
1374 primary key does not already exist. This allows different files within
1375 the same exposure to be ingested in different runs.
1376 """
1377 refs = []
1378 bad_files = []
1379 n_exposures = 0
1380 n_exposures_failed = 0
1381 n_ingests_failed = 0
1382 if group_files:
1383 for group in ResourcePath.findFileResources(files, file_filter, group_files):
1384 new_refs, bad, n_exp, n_exp_fail, n_ingest_fail = self.ingestFiles(
1385 group,
1386 pool=pool,
1387 processes=processes,
1388 run=run,
1389 skip_existing_exposures=skip_existing_exposures,
1390 update_exposure_records=update_exposure_records,
1391 track_file_attrs=track_file_attrs,
1392 )
1393 refs.extend(new_refs)
1394 bad_files.extend(bad)
1395 n_exposures += n_exp
1396 n_exposures_failed += n_exp_fail
1397 n_ingests_failed += n_ingest_fail
1398 else:
1399 refs, bad_files, n_exposures, n_exposures_failed, n_ingests_failed = self.ingestFiles(
1400 ResourcePath.findFileResources(files, file_filter, group_files),
1401 pool=pool,
1402 processes=processes,
1403 run=run,
1404 skip_existing_exposures=skip_existing_exposures,
1405 update_exposure_records=update_exposure_records,
track_file_attrs=track_file_attrs,
1406 )
1408 had_failure = False
1410 if bad_files:
1411 had_failure = True
1412 self.log.warning("Could not extract observation metadata from the following:")
1413 for f in bad_files:
1414 self.log.warning("- %s", f)
1416 self.log.info(
1417 "Successfully processed data from %d exposure%s with %d failure%s from exposure"
1418 " registration and %d failure%s from file ingest.",
1419 *_log_msg_counter(n_exposures),
1420 *_log_msg_counter(n_exposures_failed),
1421 *_log_msg_counter(n_ingests_failed),
1422 )
1423 if n_exposures_failed > 0 or n_ingests_failed > 0:
1424 had_failure = True
1425 self.log.info("Ingested %d distinct Butler dataset%s", *_log_msg_counter(refs))
1427 if had_failure:
1428 raise RuntimeError("Some failures encountered during ingestion")
1430 return refs
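# End-to-end usage sketch for this task. The repository path, data directory,
# and process count are hypothetical, and the instrument's obs package must
# already be registered in the repository:
#
#     from lsst.daf.butler import Butler
#     from lsst.obs.base import RawIngestConfig, RawIngestTask
#
#     butler = Butler("/repo/example", writeable=True)
#     config = RawIngestConfig()
#     config.transfer = "direct"
#     task = RawIngestTask(config=config, butler=butler)
#     refs = task.run(["/data/raw/20240322/"], processes=4)
#     print(f"Ingested {len(refs)} dataset(s)")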