Coverage for python/lsst/obs/base/ingest.py: 17%
354 statements
1# This file is part of obs_base.
2#
3# Developed for the LSST Data Management System.
4# This product includes software developed by the LSST Project
5# (https://www.lsst.org).
6# See the COPYRIGHT file at the top-level directory of this distribution
7# for details of code ownership.
8#
9# This program is free software: you can redistribute it and/or modify
10# it under the terms of the GNU General Public License as published by
11# the Free Software Foundation, either version 3 of the License, or
12# (at your option) any later version.
13#
14# This program is distributed in the hope that it will be useful,
15# but WITHOUT ANY WARRANTY; without even the implied warranty of
16# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
17# GNU General Public License for more details.
18#
19# You should have received a copy of the GNU General Public License
20# along with this program. If not, see <http://www.gnu.org/licenses/>.
23__all__ = ("RawIngestTask", "RawIngestConfig", "makeTransferChoiceField")
25import json
26import re
27import warnings
28from collections import defaultdict
29from dataclasses import InitVar, dataclass
30from multiprocessing import Pool
31from typing import (
32 Any,
33 Callable,
34 ClassVar,
35 Dict,
36 Iterable,
37 Iterator,
38 List,
39 MutableMapping,
40 Optional,
41 Set,
42 Sized,
43 Tuple,
44 Type,
45 Union,
46)
48from astro_metadata_translator import MetadataTranslator, ObservationInfo, merge_headers
49from astro_metadata_translator.indexing import process_index_data, process_sidecar_data
50from lsst.afw.fits import readMetadata
51from lsst.daf.butler import (
52 Butler,
53 CollectionType,
54 DataCoordinate,
55 DatasetIdGenEnum,
56 DatasetRef,
57 DatasetType,
58 DimensionRecord,
59 DimensionUniverse,
60 FileDataset,
61 Formatter,
62 Progress,
63 UnresolvedRefWarning,
64)
65from lsst.pex.config import ChoiceField, Config, Field
66from lsst.pipe.base import Instrument, Task
67from lsst.resources import ResourcePath, ResourcePathExpression
68from lsst.utils.timer import timeMethod
70from ._instrument import makeExposureRecordFromObsInfo
72# multiprocessing.Pool is actually a function, not a type, and the real type
73# isn't exposed, so we can't use it in annotations, so we'll just punt on it via
74# this alias instead.
75PoolType = Any
78def _do_nothing(*args: Any, **kwargs: Any) -> None:
79 """Do nothing.
81 This is a function that accepts anything and does nothing.
82 For use as a default in callback arguments.
83 """
84 pass
87def _log_msg_counter(noun: Union[int, Sized]) -> Tuple[int, str]:
88 """Count the iterable and return the count and plural modifier.
90 Parameters
91 ----------
92 noun : `Sized` or `int`
93 Thing to count. If given an integer it is assumed to be the count
94 to use to calculate modifier.
96 Returns
97 -------
98 num : `int`
99 Number of items found in ``noun``.
100 modifier : `str`
101 Character to add to the end of a string referring to these items
102 to indicate whether it was a single item or not. Returns empty
103 string if there is one item or "s" otherwise.
105 Examples
106 --------
108 .. code-block:: python
110 log.warning("Found %d file%s", *_log_msg_counter(nfiles))
111 """
112 if isinstance(noun, int):
113 num = noun
114 else:
115 num = len(noun)
116 return num, "" if num == 1 else "s"
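# Illustrative use of the helper above, mirroring the docstring example; the
# file list and count are placeholders and ``log`` is assumed to be a logger:
#
#     log.warning("Found %d file%s", *_log_msg_counter(["a.fits"]))  # "Found 1 file"
#     log.warning("Found %d file%s", *_log_msg_counter(3))           # "Found 3 files"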
119@dataclass
120class RawFileDatasetInfo:
121 """Information about a single dataset within a raw file."""
123 dataId: DataCoordinate
124 """Data ID for this file (`lsst.daf.butler.DataCoordinate`)."""
126 obsInfo: ObservationInfo
127 """Standardized observation metadata extracted directly from the file
128 headers (`astro_metadata_translator.ObservationInfo`).
129 """
132@dataclass
133class RawFileData:
134 """Information about a single raw file, used during ingest."""
136 datasets: List[RawFileDatasetInfo]
137 """The information describing each dataset within this raw file.
138 (`list` of `RawFileDatasetInfo`)
139 """
141 filename: ResourcePath
142 """URI of the file this information was extracted from (`str`).
144 This is the path prior to ingest, not the path after ingest.
145 """
147 FormatterClass: Type[Formatter]
148 """Formatter class that should be used to ingest this file (`type`; as
149 subclass of `Formatter`).
150 """
152 instrument: Optional[Instrument]
153 """The `Instrument` instance associated with this file. Can be `None`
154 if ``datasets`` is an empty list."""
157@dataclass
158class RawExposureData:
159 """Information about a complete raw exposure, used during ingest."""
161 dataId: DataCoordinate
162 """Data ID for this exposure (`lsst.daf.butler.DataCoordinate`).
163 """
165 files: List[RawFileData]
166 """List of structures containing file-level information.
167 """
169 universe: InitVar[DimensionUniverse]
170 """Set of all known dimensions.
171 """
173 record: DimensionRecord
174 """The exposure `DimensionRecord` that must be inserted into the
175 `~lsst.daf.butler.Registry` prior to file-level ingest (`DimensionRecord`).
176 """
178 dependencyRecords: Dict[str, DimensionRecord]
179 """Additional records that must be inserted into the
180 `~lsst.daf.butler.Registry` prior to ingesting the exposure ``record``
181 (e.g., to satisfy foreign key constraints), indexed by the dimension name.
182 """
185def makeTransferChoiceField(
186 doc: str = "How to transfer files (None for no transfer).", default: str = "auto"
187) -> ChoiceField:
188 """Create a Config field with options for transferring data between repos.
190 The allowed options for the field are exactly those supported by
191 `lsst.daf.butler.Datastore.ingest`.
193 Parameters
194 ----------
195 doc : `str`
196 Documentation for the configuration field.
197 default : `str`, optional
198 Default transfer mode for the field.
200 Returns
201 -------
202 field : `lsst.pex.config.ChoiceField`
203 Configuration field.
204 """
205 return ChoiceField(
206 doc=doc,
207 dtype=str,
208 allowed={
209 "move": "move",
210 "copy": "copy",
211 "auto": "choice will depend on datastore",
212 "direct": "use URI to ingested file directly in datastore",
213 "link": "hard link falling back to symbolic link",
214 "hardlink": "hard link",
215 "symlink": "symbolic (soft) link",
216 "relsymlink": "relative symbolic link",
217 },
218 optional=True,
219 default=default,
220 )
223class RawIngestConfig(Config):
224 """Configuration class for RawIngestTask."""
226 transfer = makeTransferChoiceField()
227 failFast: Field[bool] = Field(
228 dtype=bool,
229 default=False,
230 doc="If True, stop ingest as soon as any problem is encountered with any file. "
231 "Otherwise problem files will be skipped and logged and a report issued at completion.",
232 )
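# A minimal sketch of configuring the task before construction; the values
# below are illustrative choices, not defaults:
#
#     config = RawIngestConfig()
#     config.transfer = "direct"   # one of the choices defined by makeTransferChoiceField
#     config.failFast = True       # abort on the first problem file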
235class RawIngestTask(Task):
236 """Driver Task for ingesting raw data into Gen3 Butler repositories.
238 Parameters
239 ----------
240 config : `RawIngestConfig`
241 Configuration for the task.
242 butler : `~lsst.daf.butler.Butler`
243 Writeable butler instance, with ``butler.run`` set to the appropriate
244 `~lsst.daf.butler.CollectionType.RUN` collection for these raw
245 datasets.
246 on_success : `Callable`, optional
247 A callback invoked when all of the raws associated with an exposure
248 are ingested. Will be passed a list of `FileDataset` objects, each
249 containing one or more resolved `DatasetRef` objects. If this callback
250 raises it will interrupt the entire ingest process, even if
251 `RawIngestConfig.failFast` is `False`.
252 on_metadata_failure : `Callable`, optional
253 A callback invoked when a failure occurs trying to translate the
254 metadata for a file. Will be passed the URI and the exception, in
255 that order, as positional arguments. Guaranteed to be called in an
256 ``except`` block, allowing the callback to re-raise or replace (with
257 ``raise ... from``) to override the task's usual error handling (before
258 `RawIngestConfig.failFast` logic occurs).
259 on_ingest_failure : `Callable`, optional
260 A callback invoked when dimension record or dataset insertion into the
261 database fails for an exposure. Will be passed a `RawExposureData`
262 instance and the exception, in that order, as positional arguments.
263 Guaranteed to be called in an ``except`` block, allowing the callback
264 to re-raise or replace (with ``raise ... from``) to override the task's
265 usual error handling (before `RawIngestConfig.failFast` logic occurs).
266 **kwargs
267 Additional keyword arguments are forwarded to the `lsst.pipe.base.Task`
268 constructor.
270 Notes
271 -----
272 Each instance of `RawIngestTask` writes to the same Butler. Each
273 invocation of `RawIngestTask.run` ingests a list of files.
274 """
276 ConfigClass: ClassVar[Type[Config]] = RawIngestConfig
278 _DefaultName: ClassVar[str] = "ingest"
280 def getDatasetType(self) -> DatasetType:
281 """Return the default DatasetType of the datasets ingested by this
282 Task.
284 Returns
285 -------
286 datasetType : `DatasetType`
287 The default dataset type to use for the data being ingested. This
288 is only used if the relevant `~lsst.pipe.base.Instrument` does not
289 define an override.
290 """
291 return DatasetType(
292 "raw",
293 ("instrument", "detector", "exposure"),
294 "Exposure",
295 universe=self.butler.registry.dimensions,
296 )
298 # Mypy can not determine that the config passed to super() is this type.
299 config: RawIngestConfig
301 def __init__(
302 self,
303 config: RawIngestConfig,
304 *,
305 butler: Butler,
306 on_success: Callable[[List[FileDataset]], Any] = _do_nothing,
307 on_metadata_failure: Callable[[ResourcePath, Exception], Any] = _do_nothing,
308 on_ingest_failure: Callable[[RawExposureData, Exception], Any] = _do_nothing,
309 **kwargs: Any,
310 ):
311 config.validate() # Not a CmdlineTask nor PipelineTask, so have to validate the config here.
312 super().__init__(config, **kwargs)
313 self.butler = butler
314 self.universe = self.butler.registry.dimensions
315 self.datasetType = self.getDatasetType()
316 self._on_success = on_success
317 self._on_metadata_failure = on_metadata_failure
318 self._on_ingest_failure = on_ingest_failure
319 self.progress = Progress("obs.base.RawIngestTask")
321 # Import all the instrument classes so that we ensure that we
322 # have all the relevant metadata translators loaded.
323 Instrument.importAll(self.butler.registry)
325 def _reduce_kwargs(self) -> Dict[str, Any]:
326 # Add extra parameters to pickle.
327 return dict(
328 **super()._reduce_kwargs(),
329 butler=self.butler,
330 on_success=self._on_success,
331 on_metadata_failure=self._on_metadata_failure,
332 on_ingest_failure=self._on_ingest_failure,
333 )
335 def _determine_instrument_formatter(
336 self, dataId: DataCoordinate, filename: ResourcePath
337 ) -> Tuple[Optional[Instrument], Type[Formatter]]:
338 """Determine the instrument and formatter class.
340 Parameters
341 ----------
342 dataId : `lsst.daf.butler.DataCoordinate`
343 The dataId associated with this dataset.
344 filename : `lsst.resources.ResourcePath`
345 URI of file used for error reporting.
347 Returns
348 -------
349 instrument : `Instrument` or `None`
350 Instance of the `Instrument` associated with this dataset. `None`
351 indicates that the instrument could not be determined.
352 formatterClass : `type`
353 Class to be used as the formatter for this dataset.
354 """
355 # The data model currently assumes that whilst multiple datasets
356 # can be associated with a single file, they must all share the
357 # same formatter.
358 try:
359 instrument = Instrument.fromName(dataId["instrument"], self.butler.registry) # type: ignore
360 except LookupError as e:
361 self._on_metadata_failure(filename, e)
362 self.log.warning(
363 "Instrument %s for file %s not known to registry", dataId["instrument"], filename
364 )
365 if self.config.failFast:
366 raise RuntimeError(
367 f"Instrument {dataId['instrument']} for file {filename} not known to registry"
368 ) from e
369 FormatterClass = Formatter
370 # Indicate that we could not work out the instrument.
371 instrument = None
372 else:
373 assert instrument is not None, "Should be guaranteed by fromName succeeding."
374 FormatterClass = instrument.getRawFormatter(dataId)
375 return instrument, FormatterClass
377 def extractMetadata(self, filename: ResourcePath) -> RawFileData:
378 """Extract and process metadata from a single raw file.
380 Parameters
381 ----------
382 filename : `lsst.resources.ResourcePath`
383 URI to the file.
385 Returns
386 -------
387 data : `RawFileData`
388 A structure containing the metadata extracted from the file,
389 as well as the original filename. All fields will be populated,
390 but the `RawFileData.dataId` attribute will be a minimal
391 (unexpanded) `~lsst.daf.butler.DataCoordinate` instance. The
392 ``instrument`` field will be `None` if there is a problem
393 with metadata extraction.
395 Notes
396 -----
397 Assumes that there is a single dataset associated with the given
398 file. Instruments using a single file to store multiple datasets
399 must implement their own version of this method.
401 By default the method will catch all exceptions unless the ``failFast``
402 configuration item is `True`. If an error is encountered the
403 `_on_metadata_failure()` method will be called. If an error was
404 encountered but no exception was raised, the returned object will
405 have no instrument and no datasets.
407 This method supports sidecar JSON files which can be used to
408 extract metadata without having to read the data file itself.
409 The sidecar file is always used if found.
410 """
411 sidecar_fail_msg = "" # Requires prepended space when set.
412 try:
413 sidecar_file = filename.updatedExtension(".json")
414 if sidecar_file.exists():
415 content = json.loads(sidecar_file.read())
416 headers = [process_sidecar_data(content)]
417 sidecar_fail_msg = " (via sidecar)"
418 else:
419 # Read the metadata from the data file itself.
421 # For remote files download the entire file to get the
422 # header. This is very inefficient and it would be better
423 # to have some way of knowing where in the file the headers
424 # are and to only download those parts of the file.
425 with filename.as_local() as local_file:
426 # Read the primary. This might be sufficient.
427 header = readMetadata(local_file.ospath, 0)
429 try:
430 # Try to work out a translator class early.
431 translator_class = MetadataTranslator.determine_translator(
432 header, filename=str(filename)
433 )
434 except ValueError:
435 # Primary header was not sufficient (maybe this file
436 # has been compressed or is a MEF with minimal
437 # primary). Read second header and merge with primary.
438 header = merge_headers([header, readMetadata(local_file.ospath, 1)], mode="overwrite")
440 # Try again to work out a translator class, letting this
441 # fail.
442 translator_class = MetadataTranslator.determine_translator(header, filename=str(filename))
444 # Request the headers to use for ingest
445 headers = list(translator_class.determine_translatable_headers(local_file.ospath, header))
447 # Add each header to the dataset list
448 datasets = [self._calculate_dataset_info(h, filename) for h in headers]
450 except Exception as e:
451 self.log.debug("Problem extracting metadata from %s%s: %s", filename, sidecar_fail_msg, e)
452 # Indicate to the caller that we failed to read.
453 datasets = []
454 formatterClass = Formatter
455 instrument = None
456 self._on_metadata_failure(filename, e)
457 if self.config.failFast:
458 raise RuntimeError(
459 f"Problem extracting metadata for file {filename}{sidecar_fail_msg}"
460 ) from e
461 else:
462 self.log.debug("Extracted metadata for file %s%s", filename, sidecar_fail_msg)
463 # The data model currently assumes that whilst multiple datasets
464 # can be associated with a single file, they must all share the
465 # same formatter.
466 instrument, formatterClass = self._determine_instrument_formatter(datasets[0].dataId, filename)
467 if instrument is None:
468 datasets = []
470 return RawFileData(
471 datasets=datasets,
472 filename=filename,
473 # MyPy wants this to be a non-abstract class, which is not true
474 # for the error case where instrument is None and datasets=[].
475 FormatterClass=formatterClass, # type: ignore
476 instrument=instrument,
477 )
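# Sketch of how the sidecar lookup above behaves (the paths are placeholders):
#
#     from lsst.resources import ResourcePath
#
#     raw = ResourcePath("/data/raw/exp_001.fits")
#     sidecar = raw.updatedExtension(".json")  # -> /data/raw/exp_001.json
#     # If that JSON file exists its translated headers are used and the FITS
#     # file itself is never opened for metadata extraction.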
479 @classmethod
480 def getObservationInfoSubsets(cls) -> Tuple[Set, Set]:
481 """Return subsets of fields in the `ObservationInfo` that we care about
483 These fields will be used in constructing an exposure record.
485 Returns
486 -------
487 required : `set`
488 Set of `ObservationInfo` field names that are required.
489 optional : `set`
490 Set of `ObservationInfo` field names we will use if they are
491 available.
492 """
493 # Marking the new properties "group_counter_*" and
494 # "has_simulated_content" as required, assumes that we either
495 # recreate any existing index/sidecar files that include translated
496 # values, or else allow astro_metadata_translator to fill in
497 # defaults.
498 required = {
499 "datetime_begin",
500 "datetime_end",
501 "detector_num",
502 "exposure_id",
503 "exposure_time",
504 "group_counter_end",
505 "group_counter_start",
506 "has_simulated_content",
507 "instrument",
508 "observation_id",
509 "observation_type",
510 "physical_filter",
511 }
512 optional = {
513 "altaz_begin",
514 "boresight_rotation_coord",
515 "boresight_rotation_angle",
516 "dark_time",
517 "exposure_group",
518 "tracking_radec",
519 "object",
520 "observation_counter",
521 "observation_reason",
522 "observing_day",
523 "science_program",
524 "visit_id",
525 }
526 return required, optional
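# A subclass needing additional ObservationInfo properties could extend the
# sets returned above; a hedged sketch ("focus_z" is only an example property
# name, not something this base class requires):
#
#     class MyCamRawIngestTask(RawIngestTask):
#         @classmethod
#         def getObservationInfoSubsets(cls) -> Tuple[Set, Set]:
#             required, optional = super().getObservationInfoSubsets()
#             optional |= {"focus_z"}
#             return required, optional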
528 def _calculate_dataset_info(
529 self, header: Union[MutableMapping[str, Any], ObservationInfo], filename: ResourcePath
530 ) -> RawFileDatasetInfo:
531 """Calculate a RawFileDatasetInfo from the supplied information.
533 Parameters
534 ----------
535 header : Mapping or `astro_metadata_translator.ObservationInfo`
536 Header from the dataset or previously-translated content.
537 filename : `lsst.resources.ResourcePath`
538 Filename to use for error messages.
540 Returns
541 -------
542 dataset : `RawFileDatasetInfo`
543 The dataId, and observation information associated with this
544 dataset.
545 """
546 required, optional = self.getObservationInfoSubsets()
547 if isinstance(header, ObservationInfo):
548 obsInfo = header
549 missing = []
550 # Need to check the required properties are present.
551 for property in required:
552 # getattr does not need to be protected because it is using
553 # the defined list above containing properties that must exist.
554 value = getattr(obsInfo, property)
555 if value is None:
556 missing.append(property)
557 if missing:
558 raise ValueError(
559 f"Requested required properties are missing from file {filename}: {missing} (via JSON)"
560 )
562 else:
563 obsInfo = ObservationInfo(
564 header,
565 pedantic=False,
566 filename=str(filename),
567 required=required,
568 subset=required | optional,
569 )
571 dataId = DataCoordinate.standardize(
572 instrument=obsInfo.instrument,
573 exposure=obsInfo.exposure_id,
574 detector=obsInfo.detector_num,
575 universe=self.universe,
576 )
577 return RawFileDatasetInfo(obsInfo=obsInfo, dataId=dataId)
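# The data ID construction above can be reproduced interactively; a sketch,
# assuming ``butler`` is an existing Butler and the values are placeholders:
#
#     from lsst.daf.butler import DataCoordinate
#
#     dataId = DataCoordinate.standardize(
#         instrument="MyCam",
#         exposure=2023042900001,
#         detector=42,
#         universe=butler.registry.dimensions,
#     )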
579 def locateAndReadIndexFiles(
580 self, files: Iterable[ResourcePath]
581 ) -> Tuple[Dict[ResourcePath, Any], List[ResourcePath], Set[ResourcePath], Set[ResourcePath]]:
582 """Given a list of files, look for index files and read them.
584 Index files can either be explicitly in the list of files to
585 ingest, or else located in the same directory as a file to ingest.
586 Index entries are always used if present.
588 Parameters
589 ----------
590 files : iterable over `lsst.resources.ResourcePath`
591 URIs to the files to be ingested.
593 Returns
594 -------
595 index : `dict` [`ResourcePath`, Any]
596 Merged contents of all relevant index files found. These can
597 be explicitly specified index files or ones found in the
598 directory alongside a data file to be ingested.
599 updated_files : `list` of `ResourcePath`
600 Updated list of the input files with entries removed that were
601 found listed in an index file. Order is not guaranteed to
602 match the order of the files given to this routine.
603 good_index_files : `set` [ `ResourcePath` ]
604 Index files that were successfully read.
605 bad_index_files : `set` [ `ResourcePath` ]
606 Files that looked like index files but failed to read properly.
607 """
608 # Convert the paths to absolute for easy comparison with index content.
609 # Do not convert to real paths since we have to assume that index
610 # files are in this location and not the location which it links to.
611 files = tuple(f.abspath() for f in files)
613 # Index files must be named this.
614 index_root_file = "_index.json"
616 # Group the files by directory.
617 files_by_directory = defaultdict(set)
619 for path in files:
620 directory, file_in_dir = path.split()
621 files_by_directory[directory].add(file_in_dir)
623 # All the metadata read from index files with keys of full path.
624 index_entries: Dict[ResourcePath, Any] = {}
626 # Index files we failed to read.
627 bad_index_files = set()
629 # Any good index files that were found and used.
630 good_index_files = set()
632 # Look for index files in those directories.
633 for directory, files_in_directory in files_by_directory.items():
634 possible_index_file = directory.join(index_root_file)
635 if possible_index_file.exists():
636 # If we are explicitly requesting an index file the
637 # messages should be different.
638 index_msg = "inferred"
639 is_implied = True
640 if index_root_file in files_in_directory:
641 index_msg = "explicit"
642 is_implied = False
644 # Try to read the index file and catch and report any
645 # problems.
646 try:
647 content = json.loads(possible_index_file.read())
648 index = process_index_data(content, force_dict=True)
649 # mypy should in theory know that this is a mapping
650 # from the overload type annotation of process_index_data.
651 assert isinstance(index, MutableMapping)
652 except Exception as e:
653 # Only trigger the callback if the index file
654 # was asked for explicitly. Triggering on implied file
655 # might be surprising.
656 if not is_implied:
657 self._on_metadata_failure(possible_index_file, e)
658 if self.config.failFast:
659 raise RuntimeError(
660 f"Problem reading index file from {index_msg} location {possible_index_file}"
661 ) from e
662 bad_index_files.add(possible_index_file)
663 continue
665 self.log.debug("Extracted index metadata from %s file %s", index_msg, possible_index_file)
666 good_index_files.add(possible_index_file)
668 # Go through the index adding entries for files.
669 # If we have non-index files in this directory marked for
670 # ingest we should only get index information for those.
671 # If the index file was explicit we use all entries.
672 if is_implied:
673 files_to_ingest = files_in_directory
674 else:
675 files_to_ingest = set(index)
677 # Copy relevant metadata into a single dict for all index
678 # entries.
679 for file_in_dir in files_to_ingest:
680 # Skip an explicitly specified index file.
681 # This should never happen because an explicit index
682 # file will force ingest of all files in the index
683 # and not use the explicit file list. If somehow
684 # this is not true we continue. Raising an exception
685 # seems like the wrong thing to do since this is harmless.
686 if file_in_dir == index_root_file:
687 self.log.info(
688 "Logic error found scanning directory %s. Please file ticket.", directory
689 )
690 continue
691 if file_in_dir in index:
692 file = directory.join(file_in_dir)
693 if file in index_entries:
694 # ObservationInfo overrides raw metadata
695 if isinstance(index[file_in_dir], ObservationInfo) and not isinstance(
696 index_entries[file], ObservationInfo
697 ):
698 self.log.warning(
699 "File %s already specified in an index file but overriding"
700 " with ObservationInfo content from %s",
701 file,
702 possible_index_file,
703 )
704 else:
705 self.log.warning(
706 "File %s already specified in an index file, ignoring content from %s",
707 file,
708 possible_index_file,
709 )
710 # Do nothing in this case
711 continue
713 index_entries[file] = index[file_in_dir]
715 # Remove files from list that have index entries and also
716 # any files that we determined to be explicit index files
717 # or any index files that we failed to read.
718 filtered = set(files) - set(index_entries) - good_index_files - bad_index_files
720 # The filtered list loses the initial order. Retaining the order
721 # is good for testing but does have a cost if there are many
722 # files when copying the good values out. A dict would have faster
723 # lookups (using the files as keys) but use more memory.
724 ordered = [f for f in files if f in filtered]
726 return index_entries, ordered, good_index_files, bad_index_files
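# Sketch of the index-file behaviour described above, assuming ``task`` is a
# constructed RawIngestTask and the directory layout is a placeholder: if
# /data/raw/night1/ contains both exp_001.fits and _index.json, the metadata
# for exp_001.fits is taken from the index and the file is dropped from the
# returned list of remaining files.
#
#     index, remaining, good, bad = task.locateAndReadIndexFiles(
#         [ResourcePath("/data/raw/night1/exp_001.fits")]
#     )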
728 def processIndexEntries(self, index_entries: Dict[ResourcePath, Any]) -> List[RawFileData]:
729 """Convert index entries to RawFileData.
731 Parameters
732 ----------
733 index_entries : `dict` [`ResourcePath`, Any]
734 Dict indexed by name of file to ingest and with keys either
735 raw metadata or translated
736 `~astro_metadata_translator.ObservationInfo`.
738 Returns
739 -------
740 data : `list` [ `RawFileData` ]
741 Structures containing the metadata extracted from the file,
742 as well as the original filename. All fields will be populated,
743 but the `RawFileData.dataId` attributes will be minimal
744 (unexpanded) `~lsst.daf.butler.DataCoordinate` instances.
745 """
746 fileData = []
747 for filename, metadata in index_entries.items():
748 try:
749 datasets = [self._calculate_dataset_info(metadata, filename)]
750 except Exception as e:
751 self.log.debug("Problem extracting metadata for file %s found in index file: %s", filename, e)
752 datasets = []
753 formatterClass = Formatter
754 instrument = None
755 self._on_metadata_failure(filename, e)
756 if self.config.failFast:
757 raise RuntimeError(
758 f"Problem extracting metadata for file {filename} found in index file"
759 ) from e
760 else:
761 instrument, formatterClass = self._determine_instrument_formatter(
762 datasets[0].dataId, filename
763 )
764 if instrument is None:
765 datasets = []
766 fileData.append(
767 RawFileData(
768 datasets=datasets,
769 filename=filename,
770 # MyPy wants this to be a non-abstract class, which is not
771 # true for the error case where instrument is None and
772 # datasets=[].
773 FormatterClass=formatterClass, # type: ignore
774 instrument=instrument,
775 )
776 )
777 return fileData
779 def groupByExposure(self, files: Iterable[RawFileData]) -> List[RawExposureData]:
780 """Group an iterable of `RawFileData` by exposure.
782 Parameters
783 ----------
784 files : iterable of `RawFileData`
785 File-level information to group.
787 Returns
788 -------
789 exposures : `list` of `RawExposureData`
790 A list of structures that group the file-level information by
791 exposure. All fields will be populated. The
792 `RawExposureData.dataId` attributes will be minimal (unexpanded)
793 `~lsst.daf.butler.DataCoordinate` instances.
794 """
795 exposureDimensions = self.universe["exposure"].graph
796 byExposure = defaultdict(list)
797 for f in files:
798 # Assume that the first dataset is representative for the file.
799 byExposure[f.datasets[0].dataId.subset(exposureDimensions)].append(f)
801 return [
802 RawExposureData(
803 dataId=dataId,
804 files=exposureFiles,
805 universe=self.universe,
806 record=self.makeExposureRecord(exposureFiles[0].datasets[0].obsInfo, self.universe),
807 dependencyRecords=self.makeDependencyRecords(
808 exposureFiles[0].datasets[0].obsInfo, self.universe
809 ),
810 )
811 for dataId, exposureFiles in byExposure.items()
812 ]
814 def makeExposureRecord(
815 self, obsInfo: ObservationInfo, universe: DimensionUniverse, **kwargs: Any
816 ) -> DimensionRecord:
817 """Construct a registry record for an exposure
819 This is a method that subclasses will often want to customize. This can
820 often be done by calling this base class implementation with additional
821 ``kwargs``.
823 Parameters
824 ----------
825 obsInfo : `ObservationInfo`
826 Observation details for (one of the components of) the exposure.
827 universe : `DimensionUniverse`
828 Set of all known dimensions.
829 **kwargs
830 Additional field values for this record.
832 Returns
833 -------
834 record : `DimensionRecord`
835 The exposure record that must be inserted into the
836 `~lsst.daf.butler.Registry` prior to file-level ingest.
837 """
838 return makeExposureRecordFromObsInfo(obsInfo, universe, **kwargs)
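# A hedged sketch of the customization described above; ``my_extra_column`` is
# a hypothetical exposure-record field supplied by a subclass's data model,
# not part of the base class:
#
#     class MyCamRawIngestTask(RawIngestTask):
#         def makeExposureRecord(
#             self, obsInfo: ObservationInfo, universe: DimensionUniverse, **kwargs: Any
#         ) -> DimensionRecord:
#             return super().makeExposureRecord(obsInfo, universe, my_extra_column=1, **kwargs)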
840 def makeDependencyRecords(
841 self, obsInfo: ObservationInfo, universe: DimensionUniverse
842 ) -> Dict[str, DimensionRecord]:
843 """Construct dependency records
845 These dependency records will be inserted into the
846 `~lsst.daf.butler.Registry` before the exposure records, because they
847 are dependencies of the exposure. This allows an opportunity to satisfy
848 foreign key constraints that exist because of dimensions related to the
849 exposure.
851 This is a method that subclasses may want to customize, if they've
852 added dimensions that relate to an exposure.
854 Parameters
855 ----------
856 obsInfo : `ObservationInfo`
857 Observation details for (one of the components of) the exposure.
858 universe : `DimensionUniverse`
859 Set of all known dimensions.
861 Returns
862 -------
863 records : `dict` [`str`, `DimensionRecord`]
864 The records to insert, indexed by dimension name.
865 """
866 return {}
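# A hedged sketch of a subclass override; ``my_dimension`` stands in for a
# real dimension added by the subclass's data model and is not defined here:
#
#     class MyCamRawIngestTask(RawIngestTask):
#         def makeDependencyRecords(
#             self, obsInfo: ObservationInfo, universe: DimensionUniverse
#         ) -> Dict[str, DimensionRecord]:
#             element = universe["my_dimension"]
#             record = element.RecordClass(
#                 instrument=obsInfo.instrument, name=obsInfo.exposure_group
#             )
#             return {"my_dimension": record}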
868 def expandDataIds(self, data: RawExposureData) -> RawExposureData:
869 """Expand the data IDs associated with a raw exposure.
871 This adds the metadata records.
873 Parameters
874 ----------
875 data : `RawExposureData`
876 A structure containing information about the exposure to be
877 ingested. Must have `RawExposureData.record` populated. Should
878 be considered consumed upon return.
880 Returns
881 -------
882 exposure : `RawExposureData`
883 An updated version of the input structure, with
884 `RawExposureData.dataId` and nested `RawFileData.dataId` attributes
885 updated to data IDs for which
886 `~lsst.daf.butler.DataCoordinate.hasRecords` returns `True`.
887 """
888 # We start by expanding the exposure-level data ID; we won't use that
889 # directly in file ingest, but this lets us do some database lookups
890 # once per exposure instead of once per file later.
891 data.dataId = self.butler.registry.expandDataId(
892 data.dataId,
893 # We pass in the records we'll be inserting shortly so they aren't
894 # looked up from the database. We do expect instrument and filter
895 # records to be retrieved from the database here (though the
896 # Registry may cache them so there isn't a lookup every time).
897 records={"exposure": data.record},
898 )
899 # Now we expand the per-file (exposure+detector) data IDs. This time
900 # we pass in the records we just retrieved from the exposure data ID
901 # expansion.
902 for file in data.files:
903 for dataset in file.datasets:
904 dataset.dataId = self.butler.registry.expandDataId(
905 dataset.dataId, records=data.dataId.records
906 )
907 return data
909 def prep(
910 self, files: Iterable[ResourcePath], *, pool: Optional[PoolType] = None, processes: int = 1
911 ) -> Tuple[Iterator[RawExposureData], List[ResourcePath]]:
912 """Perform all non-database-updating ingest preprocessing steps.
914 Parameters
915 ----------
916 files : iterable over `lsst.resources.ResourcePath`
917 Paths to the files to be ingested. Will be made absolute
918 if they are not already.
919 pool : `multiprocessing.Pool`, optional
920 If not `None`, a process pool with which to parallelize some
921 operations.
922 processes : `int`, optional
923 The number of processes to use. Ignored if ``pool`` is not `None`.
925 Returns
926 -------
927 exposures : `Iterator` [ `RawExposureData` ]
928 Data structures containing dimension records, filenames, and data
929 IDs to be ingested (one structure for each exposure).
930 bad_files : `list` of `lsst.resources.ResourcePath`
931 List of all the files that could not have metadata extracted.
932 """
933 if pool is None and processes > 1:
934 pool = Pool(processes)
935 mapFunc = map if pool is None else pool.imap_unordered
937 def _partition_good_bad(
938 file_data: Iterable[RawFileData],
939 ) -> Tuple[List[RawFileData], List[ResourcePath]]:
940 """Filter out bad files and return good with list of bad."""
941 good_files = []
942 bad_files = []
943 for fileDatum in self.progress.wrap(file_data, desc="Reading image metadata"):
944 if not fileDatum.datasets:
945 bad_files.append(fileDatum.filename)
946 else:
947 good_files.append(fileDatum)
948 return good_files, bad_files
950 # Look for index files and read them.
951 # There should be far fewer index files than data files.
952 index_entries, files, good_index_files, bad_index_files = self.locateAndReadIndexFiles(files)
953 if bad_index_files:
954 self.log.info("Failed to read the following explicitly requested index files:")
955 for bad in sorted(bad_index_files):
956 self.log.info("- %s", bad)
958 # Now convert all the index file entries to standard form for ingest.
959 processed_bad_index_files: List[ResourcePath] = []
960 indexFileData = self.processIndexEntries(index_entries)
961 if indexFileData:
962 indexFileData, processed_bad_index_files = _partition_good_bad(indexFileData)
963 self.log.info(
964 "Successfully extracted metadata for %d file%s found in %d index file%s with %d failure%s",
965 *_log_msg_counter(indexFileData),
966 *_log_msg_counter(good_index_files),
967 *_log_msg_counter(processed_bad_index_files),
968 )
970 # Extract metadata and build per-detector regions.
971 # This could run in a subprocess so collect all output
972 # before looking at failures.
973 fileData: Iterator[RawFileData] = mapFunc(self.extractMetadata, files)
975 # Filter out all the failed reads and store them for later
976 # reporting.
977 good_file_data, bad_files = _partition_good_bad(fileData)
978 self.log.info(
979 "Successfully extracted metadata from %d file%s with %d failure%s",
980 *_log_msg_counter(good_file_data),
981 *_log_msg_counter(bad_files),
982 )
984 # Combine with data from index files.
985 good_file_data.extend(indexFileData)
986 bad_files.extend(processed_bad_index_files)
987 bad_files.extend(bad_index_files)
989 # Use that metadata to group files (and extracted metadata) by
990 # exposure. Never parallelized because it's intrinsically a gather
991 # step.
992 exposureData: List[RawExposureData] = self.groupByExposure(good_file_data)
994 # The next operation operates on RawExposureData instances (one at
995 # a time) in-place and then returns the modified instance. We call it
996 # as a pass-through instead of relying on the arguments we pass in to
997 # have been modified because in the parallel case those arguments are
998 # going to be pickled and unpickled, and I'm not certain
999 # multiprocessing is careful enough with that for output arguments to
1000 # work.
1002 # Expand the data IDs to include all dimension metadata; we need this
1003 # because we may need to generate path templates that rely on that
1004 # metadata.
1005 # This is the first step that involves actual database calls (but just
1006 # SELECTs), so if there's going to be a problem with connections vs.
1007 # multiple processes, or lock contention (in SQLite) slowing things
1008 # down, it'll happen here.
1009 return mapFunc(self.expandDataIds, exposureData), bad_files
1011 def ingestExposureDatasets(
1012 self,
1013 exposure: RawExposureData,
1014 datasetType: DatasetType,
1015 *,
1016 run: Optional[str] = None,
1017 skip_existing_exposures: bool = False,
1018 track_file_attrs: bool = True,
1019 ) -> List[FileDataset]:
1020 """Ingest all raw files in one exposure.
1022 Parameters
1023 ----------
1024 exposure : `RawExposureData`
1025 A structure containing information about the exposure to be
1026 ingested. Must have `RawExposureData.record` populated and all
1027 data ID attributes expanded.
1028 datasetType : `DatasetType`
1029 The dataset type associated with this exposure.
1030 run : `str`, optional
1031 Name of a RUN-type collection to write to, overriding
1032 ``self.butler.run``.
1033 skip_existing_exposures : `bool`, optional
1034 If `True` (`False` is default), skip raws that have already been
1035 ingested (i.e. raws for which we already have a dataset with the
1036 same data ID in the target collection, even if from another file).
1037 Note that this is much slower than just not passing
1038 already-ingested files as inputs, because we still need to read and
1039 process metadata to identify which exposures to search for. It
1040 also will not work reliably if multiple processes are attempting to
1041 ingest raws from the same exposure concurrently, in that different
1042 processes may still attempt to ingest the same raw and conflict,
1043 causing a failure that prevents other raws from the same exposure
1044 from being ingested.
1045 track_file_attrs : `bool`, optional
1046 Control whether file attributes such as the size or checksum should
1047 be tracked by the datastore. Whether this parameter is honored
1048 depends on the specific datastore implementation.
1050 Returns
1051 -------
1052 datasets : `list` of `lsst.daf.butler.FileDataset`
1053 Per-file structures identifying the files ingested and their
1054 dataset representation in the data repository.
1055 """
1056 if skip_existing_exposures:
1057 existing = {
1058 ref.dataId
1059 for ref in self.butler.registry.queryDatasets(
1060 datasetType,
1061 collections=[run],
1062 dataId=exposure.dataId,
1063 )
1064 }
1065 else:
1066 existing = set()
1067 datasets = []
1068 for file in exposure.files:
1069 with warnings.catch_warnings():
1070 warnings.simplefilter("ignore", category=UnresolvedRefWarning)
1071 refs = [DatasetRef(datasetType, d.dataId) for d in file.datasets if d.dataId not in existing]
1072 if refs:
1073 datasets.append(
1074 FileDataset(path=file.filename.abspath(), refs=refs, formatter=file.FormatterClass)
1075 )
1077 # Raw files are preferentially ingested using a UUID derived from
1078 # the collection name and dataId.
1079 if self.butler.registry.supportsIdGenerationMode(DatasetIdGenEnum.DATAID_TYPE_RUN):
1080 mode = DatasetIdGenEnum.DATAID_TYPE_RUN
1081 else:
1082 mode = DatasetIdGenEnum.UNIQUE
1083 self.butler.ingest(
1084 *datasets,
1085 transfer=self.config.transfer,
1086 run=run,
1087 idGenerationMode=mode,
1088 record_validation_info=track_file_attrs,
1089 )
1090 return datasets
1092 def ingestFiles(
1093 self,
1094 files: Iterable[ResourcePath],
1095 *,
1096 pool: Optional[PoolType] = None,
1097 processes: int = 1,
1098 run: Optional[str] = None,
1099 skip_existing_exposures: bool = False,
1100 update_exposure_records: bool = False,
1101 track_file_attrs: bool = True,
1102 ) -> Tuple[List[DatasetRef], List[ResourcePath], int, int, int]:
1103 """Ingest files into a Butler data repository.
1105 This creates any new exposure or visit Dimension entries needed to
1106 identify the ingested files, creates new Dataset entries in the
1107 Registry and finally ingests the files themselves into the Datastore.
1108 Any needed instrument, detector, and physical_filter Dimension entries
1109 must exist in the Registry before `run` is called.
1111 Parameters
1112 ----------
1113 files : iterable over `lsst.resources.ResourcePath`
1114 URIs to the files to be ingested.
1115 pool : `multiprocessing.Pool`, optional
1116 If not `None`, a process pool with which to parallelize some
1117 operations.
1118 processes : `int`, optional
1119 The number of processes to use. Ignored if ``pool`` is not `None`.
1120 run : `str`, optional
1121 Name of a RUN-type collection to write to, overriding
1122 the default derived from the instrument name.
1123 skip_existing_exposures : `bool`, optional
1124 If `True` (`False` is default), skip raws that have already been
1125 ingested (i.e. raws for which we already have a dataset with the
1126 same data ID in the target collection, even if from another file).
1127 Note that this is much slower than just not passing
1128 already-ingested files as inputs, because we still need to read and
1129 process metadata to identify which exposures to search for. It
1130 also will not work reliably if multiple processes are attempting to
1131 ingest raws from the same exposure concurrently, in that different
1132 processes may still attempt to ingest the same raw and conflict,
1133 causing a failure that prevents other raws from the same exposure
1134 from being ingested.
1135 update_exposure_records : `bool`, optional
1136 If `True` (`False` is default), update existing exposure records
1137 that conflict with the new ones instead of rejecting them. THIS IS
1138 AN ADVANCED OPTION THAT SHOULD ONLY BE USED TO FIX METADATA THAT IS
1139 KNOWN TO BE BAD. This should usually be combined with
1140 ``skip_existing_exposures=True``.
1141 track_file_attrs : `bool`, optional
1142 Control whether file attributes such as the size or checksum should
1143 be tracked by the datastore. Whether this parameter is honored
1144 depends on the specific datastore implementation.
1146 Returns
1147 -------
1148 refs : `list` of `lsst.daf.butler.DatasetRef`
1149 Dataset references for ingested raws.
1150 bad_files : `list` of `ResourcePath`
1151 Given paths that could not be ingested.
1152 n_exposures : `int`
1153 Number of exposures successfully ingested.
1154 n_exposures_failed : `int`
1155 Number of exposures that failed when inserting dimension data.
1156 n_ingests_failed : `int`
1157 Number of exposures that failed when ingesting raw datasets.
1158 """
1160 exposureData, bad_files = self.prep(files, pool=pool, processes=processes)
1162 # Up to this point, we haven't modified the data repository at all.
1163 # Now we finally do that, with one transaction per exposure. This is
1164 # not parallelized at present because the performance of this step is
1165 # limited by the database server. That may or may not change in the
1166 # future once we increase our usage of bulk inserts and reduce our
1167 # usage of savepoints; we've tried to get everything but the database
1168 # operations done in advance to reduce the time spent inside
1169 # transactions.
1170 refs = []
1171 runs = set()
1172 datasetTypes: dict[str, DatasetType] = {}
1173 n_exposures = 0
1174 n_exposures_failed = 0
1175 n_ingests_failed = 0
1176 for exposure in self.progress.wrap(exposureData, desc="Ingesting raw exposures"):
1177 assert exposure.record is not None, "Should be guaranteed by prep()"
1178 self.log.debug(
1179 "Attempting to ingest %d file%s from exposure %s:%s",
1180 *_log_msg_counter(exposure.files),
1181 exposure.record.instrument,
1182 exposure.record.obs_id,
1183 )
1185 try:
1186 for name, record in exposure.dependencyRecords.items():
1187 self.butler.registry.syncDimensionData(name, record, update=update_exposure_records)
1188 inserted_or_updated = self.butler.registry.syncDimensionData(
1189 "exposure",
1190 exposure.record,
1191 update=update_exposure_records,
1192 )
1193 except Exception as e:
1194 self._on_ingest_failure(exposure, e)
1195 n_exposures_failed += 1
1196 self.log.warning(
1197 "Exposure %s:%s could not be registered: %s",
1198 exposure.record.instrument,
1199 exposure.record.obs_id,
1200 e,
1201 )
1202 if self.config.failFast:
1203 raise e
1204 continue
1206 if isinstance(inserted_or_updated, dict):
1207 # Exposure is in the registry and we updated it, so
1208 # syncDimensionData returned a dict.
1209 self.log.info(
1210 "Exposure %s:%s was already present, but columns %s were updated.",
1211 exposure.record.instrument,
1212 exposure.record.obs_id,
1213 str(list(inserted_or_updated.keys())),
1214 )
1216 # Determine the instrument so we can work out the dataset type.
1217 instrument = exposure.files[0].instrument
1218 assert (
1219 instrument is not None
1220 ), "file should have been removed from this list by prep if instrument could not be found"
1222 if raw_definition := getattr(instrument, "raw_definition", None):
1223 datasetTypeName, dimensions, storageClass = raw_definition
1224 if not (datasetType := datasetTypes.get(datasetTypeName)):
1225 datasetType = DatasetType(
1226 datasetTypeName, dimensions, storageClass, universe=self.butler.registry.dimensions
1227 )
1228 else:
1229 datasetType = self.datasetType
1230 if datasetType.name not in datasetTypes:
1231 self.butler.registry.registerDatasetType(datasetType)
1232 datasetTypes[datasetType.name] = datasetType
1234 # Override default run if nothing specified explicitly.
1235 if run is None:
1236 this_run = instrument.makeDefaultRawIngestRunName()
1237 else:
1238 this_run = run
1239 if this_run not in runs:
1240 self.butler.registry.registerCollection(this_run, type=CollectionType.RUN)
1241 runs.add(this_run)
1242 try:
1243 datasets_for_exposure = self.ingestExposureDatasets(
1244 exposure,
1245 datasetType=datasetType,
1246 run=this_run,
1247 skip_existing_exposures=skip_existing_exposures,
1248 track_file_attrs=track_file_attrs,
1249 )
1250 except Exception as e:
1251 self._on_ingest_failure(exposure, e)
1252 n_ingests_failed += 1
1253 self.log.warning("Failed to ingest the following for reason: %s", e)
1254 for f in exposure.files:
1255 self.log.warning("- %s", f.filename)
1256 if self.config.failFast:
1257 raise e
1258 continue
1259 else:
1260 self._on_success(datasets_for_exposure)
1261 for dataset in datasets_for_exposure:
1262 refs.extend(dataset.refs)
1264 # Success for this exposure.
1265 n_exposures += 1
1266 self.log.info(
1267 "Exposure %s:%s ingested successfully", exposure.record.instrument, exposure.record.obs_id
1268 )
1270 return refs, bad_files, n_exposures, n_exposures_failed, n_ingests_failed
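# Sketch of calling this method directly, assuming ``task`` is a constructed
# RawIngestTask; the run collection and path are placeholders. Most callers
# should prefer ``run()`` below, which also handles directory discovery and
# grouping:
#
#     refs, bad, n_ok, n_reg_fail, n_ingest_fail = task.ingestFiles(
#         [ResourcePath("/data/raw/exp_001.fits")],
#         run="MyCam/raw/all",
#     )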
1272 @timeMethod
1273 def run(
1274 self,
1275 files: Iterable[ResourcePathExpression],
1276 *,
1277 pool: Optional[PoolType] = None,
1278 processes: int = 1,
1279 run: Optional[str] = None,
1280 file_filter: Union[str, re.Pattern] = r"\.fit[s]?\b",
1281 group_files: bool = True,
1282 skip_existing_exposures: bool = False,
1283 update_exposure_records: bool = False,
1284 track_file_attrs: bool = True,
1285 ) -> List[DatasetRef]:
1286 """Ingest files into a Butler data repository.
1288 This creates any new exposure or visit Dimension entries needed to
1289 identify the ingested files, creates new Dataset entries in the
1290 Registry and finally ingests the files themselves into the Datastore.
1291 Any needed instrument, detector, and physical_filter Dimension entries
1292 must exist in the Registry before `run` is called.
1294 Parameters
1295 ----------
1296 files : iterable of `lsst.resources.ResourcePath`, `str` or path-like
1297 Paths to the files to be ingested. Can refer to directories.
1298 Will be made absolute if they are not already.
1299 pool : `multiprocessing.Pool`, optional
1300 If not `None`, a process pool with which to parallelize some
1301 operations.
1302 processes : `int`, optional
1303 The number of processes to use. Ignored if ``pool`` is not `None`.
1304 run : `str`, optional
1305 Name of a RUN-type collection to write to, overriding
1306 the default derived from the instrument name.
1307 file_filter : `str` or `re.Pattern`, optional
1308 Pattern to use to discover files to ingest within directories.
1309 The default is to search for FITS files. The regex applies to
1310 files within the directory.
1311 group_files : `bool`, optional
1312 Group files by directory if they have been discovered in
1313 directories. Will not affect files explicitly provided.
1314 skip_existing_exposures : `bool`, optional
1315 If `True` (`False` is default), skip raws that have already been
1316 ingested (i.e. raws for which we already have a dataset with the
1317 same data ID in the target collection, even if from another file).
1318 Note that this is much slower than just not passing
1319 already-ingested files as inputs, because we still need to read and
1320 process metadata to identify which exposures to search for. It
1321 also will not work reliably if multiple processes are attempting to
1322 ingest raws from the same exposure concurrently, in that different
1323 processes may still attempt to ingest the same raw and conflict,
1324 causing a failure that prevents other raws from the same exposure
1325 from being ingested.
1326 update_exposure_records : `bool`, optional
1327 If `True` (`False` is default), update existing exposure records
1328 that conflict with the new ones instead of rejecting them. THIS IS
1329 AN ADVANCED OPTION THAT SHOULD ONLY BE USED TO FIX METADATA THAT IS
1330 KNOWN TO BE BAD. This should usually be combined with
1331 ``skip_existing_exposures=True``.
1332 track_file_attrs : `bool`, optional
1333 Control whether file attributes such as the size or checksum should
1334 be tracked by the datastore. Whether this parameter is honored
1335 depends on the specific datastore implementation.
1337 Returns
1338 -------
1339 refs : `list` of `lsst.daf.butler.DatasetRef`
1340 Dataset references for ingested raws.
1342 Notes
1343 -----
1344 This method inserts all datasets for an exposure within a transaction,
1345 guaranteeing that partial exposures are never ingested. The exposure
1346 dimension record is inserted with `Registry.syncDimensionData` first
1347 (in its own transaction), which inserts only if a record with the same
1348 primary key does not already exist. This allows different files within
1349 the same exposure to be ingested in different runs.
1350 """
1352 refs = []
1353 bad_files = []
1354 n_exposures = 0
1355 n_exposures_failed = 0
1356 n_ingests_failed = 0
1357 if group_files:
1358 for group in ResourcePath.findFileResources(files, file_filter, group_files):
1359 new_refs, bad, n_exp, n_exp_fail, n_ingest_fail = self.ingestFiles(
1360 group,
1361 pool=pool,
1362 processes=processes,
1363 run=run,
1364 skip_existing_exposures=skip_existing_exposures,
1365 update_exposure_records=update_exposure_records,
1366 track_file_attrs=track_file_attrs,
1367 )
1368 refs.extend(new_refs)
1369 bad_files.extend(bad)
1370 n_exposures += n_exp
1371 n_exposures_failed += n_exp_fail
1372 n_ingests_failed += n_ingest_fail
1373 else:
1374 refs, bad_files, n_exposures, n_exposures_failed, n_ingests_failed = self.ingestFiles(
1375 ResourcePath.findFileResources(files, file_filter, group_files),
1376 pool=pool,
1377 processes=processes,
1378 run=run,
1379 skip_existing_exposures=skip_existing_exposures,
1380 update_exposure_records=update_exposure_records,
1381 track_file_attrs=track_file_attrs,
)
1383 had_failure = False
1385 if bad_files:
1386 had_failure = True
1387 self.log.warning("Could not extract observation metadata from the following:")
1388 for f in bad_files:
1389 self.log.warning("- %s", f)
1391 self.log.info(
1392 "Successfully processed data from %d exposure%s with %d failure%s from exposure"
1393 " registration and %d failure%s from file ingest.",
1394 *_log_msg_counter(n_exposures),
1395 *_log_msg_counter(n_exposures_failed),
1396 *_log_msg_counter(n_ingests_failed),
1397 )
1398 if n_exposures_failed > 0 or n_ingests_failed > 0:
1399 had_failure = True
1400 self.log.info("Ingested %d distinct Butler dataset%s", *_log_msg_counter(refs))
1402 if had_failure:
1403 raise RuntimeError("Some failures encountered during ingestion")
1405 return refs
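# End-to-end usage sketch; the repository path, collection name, and data
# directory are placeholders, and RawIngestTask/RawIngestConfig are assumed to
# be importable from lsst.obs.base:
#
#     from lsst.daf.butler import Butler
#     from lsst.obs.base import RawIngestConfig, RawIngestTask
#
#     butler = Butler("/repo/main", writeable=True)
#     config = RawIngestConfig()
#     config.transfer = "direct"  # ingest by URI, leaving the files in place
#     task = RawIngestTask(config=config, butler=butler)
#     refs = task.run(["/data/raw/night1/"], run="MyCam/raw/all")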