Coverage for python/lsst/obs/base/ingest.py: 17%
358 statements
coverage.py v7.3.2, created at 2023-11-02 10:32 +0000
1# This file is part of obs_base.
2#
3# Developed for the LSST Data Management System.
4# This product includes software developed by the LSST Project
5# (https://www.lsst.org).
6# See the COPYRIGHT file at the top-level directory of this distribution
7# for details of code ownership.
8#
9# This program is free software: you can redistribute it and/or modify
10# it under the terms of the GNU General Public License as published by
11# the Free Software Foundation, either version 3 of the License, or
12# (at your option) any later version.
13#
14# This program is distributed in the hope that it will be useful,
15# but WITHOUT ANY WARRANTY; without even the implied warranty of
16# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
17# GNU General Public License for more details.
18#
19# You should have received a copy of the GNU General Public License
20# along with this program. If not, see <http://www.gnu.org/licenses/>.
23__all__ = ("RawIngestTask", "RawIngestConfig", "makeTransferChoiceField")
25import json
26import re
27from collections import defaultdict
28from collections.abc import Callable, Iterable, Iterator, MutableMapping, Sized
29from dataclasses import InitVar, dataclass
30from multiprocessing import Pool
31from typing import Any, ClassVar
33from astro_metadata_translator import MetadataTranslator, ObservationInfo, merge_headers
34from astro_metadata_translator.indexing import process_index_data, process_sidecar_data
35from lsst.afw.fits import readMetadata
36from lsst.daf.butler import (
37 Butler,
38 CollectionType,
39 DataCoordinate,
40 DatasetIdGenEnum,
41 DatasetRef,
42 DatasetType,
43 DimensionRecord,
44 DimensionUniverse,
45 FileDataset,
46 Formatter,
47 Progress,
48)
49from lsst.pex.config import ChoiceField, Config, Field
50from lsst.pipe.base import Instrument, Task
51from lsst.resources import ResourcePath, ResourcePathExpression
52from lsst.utils.timer import timeMethod
54from ._instrument import makeExposureRecordFromObsInfo
56# multiprocessing.Pool is actually a function, not a type, and the real type
57# isn't exposed, so we can't use it in annotations; we'll just punt on it via
58# this alias instead.
59PoolType = Any
62def _do_nothing(*args: Any, **kwargs: Any) -> None:
63 """Do nothing.
65 This is a function that accepts anything and does nothing.
66 For use as a default in callback arguments.
67 """
68 pass
71def _log_msg_counter(noun: int | Sized) -> tuple[int, str]:
72 """Count the iterable and return the count and plural modifier.
74 Parameters
75 ----------
76 noun : `Sized` or `int`
77 Thing to count. If given an integer it is assumed to be the count
78 to use to calculate the modifier.
80 Returns
81 -------
82 num : `int`
83 Number of items found in ``noun``.
84 modifier : `str`
85 Character to add to the end of a string referring to these items
86 to indicate whether it was a single item or not. Returns empty
87 string if there is one item or "s" otherwise.
89 Examples
90 --------
91 .. code-block:: python
93 log.warning("Found %d file%s", *_log_msg_counter(nfiles))
94 """
95 if isinstance(noun, int):
96 num = noun
97 else:
98 num = len(noun)
99 return num, "" if num == 1 else "s"
102@dataclass
103class RawFileDatasetInfo:
104 """Information about a single dataset within a raw file."""
106 dataId: DataCoordinate
107 """Data ID for this file (`lsst.daf.butler.DataCoordinate`)."""
109 obsInfo: ObservationInfo
110 """Standardized observation metadata extracted directly from the file
111 headers (`astro_metadata_translator.ObservationInfo`).
112 """
115@dataclass
116class RawFileData:
117 """Information about a single raw file, used during ingest."""
119 datasets: list[RawFileDatasetInfo]
120 """The information describing each dataset within this raw file.
121 (`list` of `RawFileDatasetInfo`)
122 """
124 filename: ResourcePath
125 """URI of the file this information was extracted from (`str`).
127 This is the path prior to ingest, not the path after ingest.
128 """
130 FormatterClass: type[Formatter]
131 """Formatter class that should be used to ingest this file (`type`; as
132 subclass of `~lsst.daf.butler.Formatter`).
133 """
135 instrument: Instrument | None
136 """The `Instrument` instance associated with this file. Can be `None`
137 if ``datasets`` is an empty list."""
140@dataclass
141class RawExposureData:
142 """Information about a complete raw exposure, used during ingest."""
144 dataId: DataCoordinate
145 """Data ID for this exposure (`lsst.daf.butler.DataCoordinate`).
146 """
148 files: list[RawFileData]
149 """List of structures containing file-level information.
150 """
152 universe: InitVar[DimensionUniverse]
153 """Set of all known dimensions.
154 """
156 record: DimensionRecord
157 """The exposure `DimensionRecord` that must be inserted into the
158 `~lsst.daf.butler.Registry` prior to file-level ingest
159 (`~lsst.daf.butler.DimensionRecord`).
160 """
162 dependencyRecords: dict[str, DimensionRecord]
163 """Additional records that must be inserted into the
164 `~lsst.daf.butler.Registry` prior to ingesting the exposure ``record``
165 (e.g., to satisfy foreign key constraints), indexed by the dimension name.
166 """
169def makeTransferChoiceField(
170 doc: str = "How to transfer files (None for no transfer).", default: str = "auto"
171) -> ChoiceField:
172 """Create a Config field with options for transferring data between repos.
174 The allowed options for the field are exactly those supported by
175 `lsst.daf.butler.Datastore.ingest`.
177 Parameters
178 ----------
179 doc : `str`
180 Documentation for the configuration field.
181 default : `str`, optional
182 Default transfer mode for the field.
184 Returns
185 -------
186 field : `lsst.pex.config.ChoiceField`
187 Configuration field.
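    Examples
    --------
    A minimal sketch mirroring how `RawIngestConfig` below uses this helper;
    the ``MyIngestConfig`` name is hypothetical:

    .. code-block:: python

        from lsst.pex.config import Config

        from lsst.obs.base.ingest import makeTransferChoiceField


        class MyIngestConfig(Config):
            # Hypothetical ingest configuration with a transfer option.
            transfer = makeTransferChoiceField(default="symlink")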
188 """
189 return ChoiceField(
190 doc=doc,
191 dtype=str,
192 allowed={
193 "move": "move",
194 "copy": "copy",
195 "auto": "choice will depend on datastore",
196 "direct": "use URI to ingested file directly in datastore",
197 "link": "hard link falling back to symbolic link",
198 "hardlink": "hard link",
199 "symlink": "symbolic (soft) link",
200 "relsymlink": "relative symbolic link",
201 },
202 optional=True,
203 default=default,
204 )
207class RawIngestConfig(Config):
208 """Configuration class for RawIngestTask."""
210 transfer = makeTransferChoiceField()
211 failFast: Field[bool] = Field(
212 dtype=bool,
213 default=False,
214 doc="If True, stop ingest as soon as any problem is encountered with any file. "
215 "Otherwise problem files will be skipped and logged and a report issued at completion.",
216 )
219class RawIngestTask(Task):
220 """Driver Task for ingesting raw data into Gen3 Butler repositories.
222 Parameters
223 ----------
224 config : `RawIngestConfig`
225 Configuration for the task.
226 butler : `~lsst.daf.butler.Butler`
227 Writeable butler instance, with ``butler.run`` set to the appropriate
228 `~lsst.daf.butler.CollectionType.RUN` collection for these raw
229 datasets.
230 on_success : `Callable`, optional
231 A callback invoked when all of the raws associated with an exposure
232 are ingested. Will be passed a list of `FileDataset` objects, each
233 containing one or more resolved `DatasetRef` objects. If this callback
234 raises it will interrupt the entire ingest process, even if
235 `RawIngestConfig.failFast` is `False`.
236 on_metadata_failure : `Callable`, optional
237 A callback invoked when a failure occurs trying to translate the
238 metadata for a file. Will be passed the URI and the exception, in
239 that order, as positional arguments. Guaranteed to be called in an
240 ``except`` block, allowing the callback to re-raise or replace (with
241 ``raise ... from``) to override the task's usual error handling (before
242 `RawIngestConfig.failFast` logic occurs).
243 on_ingest_failure : `Callable`, optional
244 A callback invoked when dimension record or dataset insertion into the
245 database fails for an exposure. Will be passed a `RawExposureData`
246 instance and the exception, in that order, as positional arguments.
247 Guaranteed to be called in an ``except`` block, allowing the callback
248 to re-raise or replace (with ``raise ... from``) to override the task's
249 usual error handling (before `RawIngestConfig.failFast` logic occurs).
250 **kwargs
251 Additional keyword arguments are forwarded to the `lsst.pipe.base.Task`
252 constructor.
254 Notes
255 -----
256 Each instance of `RawIngestTask` writes to the same Butler. Each
257 invocation of `RawIngestTask.run` ingests a list of files.
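    Examples
    --------
    A minimal usage sketch; the repository path and raw file URI are
    hypothetical and the butler must be writeable:

    .. code-block:: python

        from lsst.daf.butler import Butler
        from lsst.obs.base import RawIngestConfig, RawIngestTask

        butler = Butler("/path/to/repo", writeable=True)
        config = RawIngestConfig()
        config.transfer = "copy"
        task = RawIngestTask(config=config, butler=butler)
        refs = task.run(["/data/raws/exposure_000001.fits"])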
258 """
260 ConfigClass: ClassVar[type[Config]] = RawIngestConfig
262 _DefaultName: ClassVar[str] = "ingest"
264 def getDatasetType(self) -> DatasetType:
265 """Return the default DatasetType of the datasets ingested by this
266 Task.
268 Returns
269 -------
270 datasetType : `DatasetType`
271 The default dataset type to use for the data being ingested. This
272 is only used if the relevant `~lsst.pipe.base.Instrument` does not
273 define an override.
274 """
275 return DatasetType(
276 "raw",
277 ("instrument", "detector", "exposure"),
278 "Exposure",
279 universe=self.butler.dimensions,
280 )
282 # Mypy can not determine that the config passed to super() is this type.
283 config: RawIngestConfig
285 def __init__(
286 self,
287 config: RawIngestConfig,
288 *,
289 butler: Butler,
290 on_success: Callable[[list[FileDataset]], Any] = _do_nothing,
291 on_metadata_failure: Callable[[ResourcePath, Exception], Any] = _do_nothing,
292 on_ingest_failure: Callable[[RawExposureData, Exception], Any] = _do_nothing,
293 **kwargs: Any,
294 ):
295 config.validate() # Not a CmdlineTask nor PipelineTask, so have to validate the config here.
296 super().__init__(config, **kwargs)
297 self.butler = butler
298 self.universe = self.butler.dimensions
299 self.datasetType = self.getDatasetType()
300 self._on_success = on_success
301 self._on_metadata_failure = on_metadata_failure
302 self._on_ingest_failure = on_ingest_failure
303 self.progress = Progress("obs.base.RawIngestTask")
305 # Import all the instrument classes so that we ensure that we
306 # have all the relevant metadata translators loaded.
307 Instrument.importAll(self.butler.registry)
309 def _reduce_kwargs(self) -> dict[str, Any]:
310 # Add extra parameters to pickle.
311 return dict(
312 **super()._reduce_kwargs(),
313 butler=self.butler,
314 on_success=self._on_success,
315 on_metadata_failure=self._on_metadata_failure,
316 on_ingest_failure=self._on_ingest_failure,
317 )
319 def _determine_instrument_formatter(
320 self, dataId: DataCoordinate, filename: ResourcePath
321 ) -> tuple[Instrument | None, type[Formatter]]:
322 """Determine the instrument and formatter class.
324 Parameters
325 ----------
326 dataId : `lsst.daf.butler.DataCoordinate`
327 The dataId associated with this dataset.
328 filename : `lsst.resources.ResourcePath`
329 URI of file used for error reporting.
331 Returns
332 -------
333 instrument : `Instrument` or `None`
334 Instance of the `Instrument` associated with this dataset. `None`
335 indicates that the instrument could not be determined.
336 formatterClass : `type`
337 Class to be used as the formatter for this dataset.
338 """
339 # The data model currently assumes that whilst multiple datasets
340 # can be associated with a single file, they must all share the
341 # same formatter.
342 try:
343 instrument = Instrument.fromName(dataId["instrument"], self.butler.registry) # type: ignore
344 except LookupError as e:
345 self._on_metadata_failure(filename, e)
346 self.log.warning(
347 "Instrument %s for file %s not known to registry", dataId["instrument"], filename
348 )
349 if self.config.failFast:
350 raise RuntimeError(
351 f"Instrument {dataId['instrument']} for file {filename} not known to registry"
352 ) from e
353 FormatterClass = Formatter
354 # Indicate that we could not work out the instrument.
355 instrument = None
356 else:
357 assert instrument is not None, "Should be guaranteed by fromName succeeding."
358 FormatterClass = instrument.getRawFormatter(dataId)
359 return instrument, FormatterClass
361 def extractMetadata(self, filename: ResourcePath) -> RawFileData:
362 """Extract and process metadata from a single raw file.
364 Parameters
365 ----------
366 filename : `lsst.resources.ResourcePath`
367 URI to the file.
369 Returns
370 -------
371 data : `RawFileData`
372 A structure containing the metadata extracted from the file,
373 as well as the original filename. All fields will be populated,
374 but the data IDs in ``RawFileData.datasets`` will be minimal
375 (unexpanded) `~lsst.daf.butler.DataCoordinate` instances. The
376 ``instrument`` field will be `None` if there is a problem
377 with metadata extraction.
379 Notes
380 -----
381 Assumes that there is a single dataset associated with the given
382 file. Instruments using a single file to store multiple datasets
383 must implement their own version of this method.
385 By default the method will catch all exceptions unless the ``failFast``
386 configuration item is `True`. If an error is encountered the
387 `_on_metadata_failure()` method will be called. If no exception is
388 raised but an error was encountered, the returned object will have
389 a `None` instrument and an empty list of datasets.
391 This method supports sidecar JSON files which can be used to
392 extract metadata without having to read the data file itself.
393 The sidecar file is always used if found.
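        Examples
        --------
        The sidecar file is the data file's name with its extension
        replaced by ``.json``; the path below is purely illustrative:

        .. code-block:: python

            from lsst.resources import ResourcePath

            raw = ResourcePath("/data/raws/exposure_000001.fits")
            # extractMetadata() prefers this file if it exists, i.e.
            # /data/raws/exposure_000001.json.
            sidecar = raw.updatedExtension(".json")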
394 """
395 sidecar_fail_msg = "" # Requires prepended space when set.
396 try:
397 sidecar_file = filename.updatedExtension(".json")
398 if sidecar_file.exists():
399 content = json.loads(sidecar_file.read())
400 headers = [process_sidecar_data(content)]
401 sidecar_fail_msg = " (via sidecar)"
402 else:
403 # Read the metadata from the data file itself.
405 # For remote files download the entire file to get the
406 # header. This is very inefficient and it would be better
407 # to have some way of knowing where in the file the headers
408 # are and to only download those parts of the file.
409 with filename.as_local() as local_file:
410 # Read the primary. This might be sufficient.
411 header = readMetadata(local_file.ospath, 0)
413 try:
414 # Try to work out a translator class early.
415 translator_class = MetadataTranslator.determine_translator(
416 header, filename=str(filename)
417 )
418 except ValueError:
419 # Primary header was not sufficient (maybe this file
420 # has been compressed or is a MEF with minimal
421 # primary). Read second header and merge with primary.
422 header = merge_headers([header, readMetadata(local_file.ospath, 1)], mode="overwrite")
424 # Try again to work out a translator class, letting this
425 # fail.
426 translator_class = MetadataTranslator.determine_translator(header, filename=str(filename))
428 # Request the headers to use for ingest
429 headers = list(translator_class.determine_translatable_headers(local_file.ospath, header))
431 # Add each header to the dataset list
432 datasets = [self._calculate_dataset_info(h, filename) for h in headers]
434 except Exception as e:
435 self.log.debug("Problem extracting metadata from %s%s: %s", filename, sidecar_fail_msg, e)
436 # Indicate to the caller that we failed to read.
437 datasets = []
438 formatterClass = Formatter
439 instrument = None
440 self._on_metadata_failure(filename, e)
441 if self.config.failFast:
442 raise RuntimeError(
443 f"Problem extracting metadata for file {filename}{sidecar_fail_msg}"
444 ) from e
445 else:
446 self.log.debug("Extracted metadata for file %s%s", filename, sidecar_fail_msg)
447 # The data model currently assumes that whilst multiple datasets
448 # can be associated with a single file, they must all share the
449 # same formatter.
450 instrument, formatterClass = self._determine_instrument_formatter(datasets[0].dataId, filename)
451 if instrument is None:
452 datasets = []
454 return RawFileData(
455 datasets=datasets,
456 filename=filename,
457 # MyPy wants this to be a non-abstract class, which is not true
458 # for the error case where instrument is None and datasets=[].
459 FormatterClass=formatterClass, # type: ignore
460 instrument=instrument,
461 )
463 @classmethod
464 def getObservationInfoSubsets(cls) -> tuple[set, set]:
465 """Return subsets of fields in the `ObservationInfo` that we care
466 about.
468 These fields will be used in constructing an exposure record.
470 Returns
471 -------
472 required : `set`
473 Set of `ObservationInfo` field names that are required.
474 optional : `set`
475 Set of `ObservationInfo` field names we will use if they are
476 available.
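        Examples
        --------
        A sketch of how a subclass can extend these sets; the property added
        below must be a valid `ObservationInfo` property name (``focus_z``
        is just an illustration):

        .. code-block:: python

            class MyRawIngestTask(RawIngestTask):
                @classmethod
                def getObservationInfoSubsets(cls):
                    required, optional = super().getObservationInfoSubsets()
                    # Record an additional translated property if available.
                    optional.add("focus_z")
                    return required, optional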
477 """
478 # Marking the new properties "group_counter_*" and
479 # "has_simulated_content" as required, assumes that we either
480 # recreate any existing index/sidecar files that include translated
481 # values, or else allow astro_metadata_translator to fill in
482 # defaults.
483 required = {
484 "datetime_begin",
485 "datetime_end",
486 "detector_num",
487 "exposure_id",
488 "exposure_time",
489 "group_counter_end",
490 "group_counter_start",
491 "has_simulated_content",
492 "instrument",
493 "observation_id",
494 "observation_type",
495 "physical_filter",
496 }
497 optional = {
498 "altaz_begin",
499 "boresight_rotation_coord",
500 "boresight_rotation_angle",
501 "dark_time",
502 "exposure_group",
503 "tracking_radec",
504 "object",
505 "observation_counter",
506 "observation_reason",
507 "observing_day",
508 "science_program",
509 "visit_id",
510 }
511 return required, optional
513 def _calculate_dataset_info(
514 self, header: MutableMapping[str, Any] | ObservationInfo, filename: ResourcePath
515 ) -> RawFileDatasetInfo:
516 """Calculate a RawFileDatasetInfo from the supplied information.
518 Parameters
519 ----------
520 header : Mapping or `astro_metadata_translator.ObservationInfo`
521 Header from the dataset or previously-translated content.
522 filename : `lsst.resources.ResourcePath`
523 Filename to use for error messages.
525 Returns
526 -------
527 dataset : `RawFileDatasetInfo`
528 The dataId, and observation information associated with this
529 dataset.
530 """
531 required, optional = self.getObservationInfoSubsets()
532 if isinstance(header, ObservationInfo):
533 obsInfo = header
534 missing = []
535 # Need to check the required properties are present.
536 for property in required:
537 # getattr does not need to be protected because it is using
538 # the defined list above containing properties that must exist.
539 value = getattr(obsInfo, property)
540 if value is None:
541 missing.append(property)
542 if missing:
543 raise ValueError(
544 f"Requested required properties are missing from file {filename}: {missing} (via JSON)"
545 )
547 else:
548 obsInfo = ObservationInfo(
549 header,
550 pedantic=False,
551 filename=str(filename),
552 required=required,
553 subset=required | optional,
554 )
556 dataId = DataCoordinate.standardize(
557 instrument=obsInfo.instrument,
558 exposure=obsInfo.exposure_id,
559 detector=obsInfo.detector_num,
560 universe=self.universe,
561 )
562 return RawFileDatasetInfo(obsInfo=obsInfo, dataId=dataId)
564 def locateAndReadIndexFiles(
565 self, files: Iterable[ResourcePath]
566 ) -> tuple[dict[ResourcePath, Any], list[ResourcePath], set[ResourcePath], set[ResourcePath]]:
567 """Given a list of files, look for index files and read them.
569 Index files can either be explicitly in the list of files to
570 ingest, or else located in the same directory as a file to ingest.
571 Index entries are always used if present.
573 Parameters
574 ----------
575 files : iterable over `lsst.resources.ResourcePath`
576 URIs to the files to be ingested.
578 Returns
579 -------
580 index : `dict` [`ResourcePath`, Any]
581 Merged contents of all relevant index files found. These can
582 be explicitly specified index files or ones found in the
583 directory alongside a data file to be ingested.
584 updated_files : `list` of `ResourcePath`
585 Updated list of the input files with entries removed that were
586 found listed in an index file. Order is not guaranteed to
587 match the order of the files given to this routine.
588 good_index_files : `set` [ `ResourcePath` ]
589 Index files that were successfully read.
590 bad_index_files : `set` [ `ResourcePath` ]
591 Files that looked like index files but failed to read properly.
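        Examples
        --------
        Index files must be named ``_index.json`` and live in the same
        directory as the data files they describe; the paths below are
        purely illustrative:

        .. code-block:: python

            from lsst.resources import ResourcePath

            files = [
                # A data file; an index implied by /data/raws/_index.json
                # will be used for it if that index exists.
                ResourcePath("/data/raws/exposure_000001.fits"),
                # An explicitly requested index file.
                ResourcePath("/data/raws/_index.json"),
            ]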
592 """
593 # Convert the paths to absolute for easy comparison with index content.
594 # Do not convert to real paths since we have to assume that index
596 # files are in this location and not the location they link to.
596 files = tuple(f.abspath() for f in files)
598 # Index files must be named this.
599 index_root_file = "_index.json"
601 # Group the files by directory.
602 files_by_directory = defaultdict(set)
604 for path in files:
605 directory, file_in_dir = path.split()
606 files_by_directory[directory].add(file_in_dir)
608 # All the metadata read from index files with keys of full path.
609 index_entries: dict[ResourcePath, Any] = {}
611 # Index files we failed to read.
612 bad_index_files = set()
614 # Any good index files that were found and used.
615 good_index_files = set()
617 # Look for index files in those directories.
618 for directory, files_in_directory in files_by_directory.items():
619 possible_index_file = directory.join(index_root_file)
620 if possible_index_file.exists():
621 # If we are explicitly requesting an index file the
622 # messages should be different.
623 index_msg = "inferred"
624 is_implied = True
625 if index_root_file in files_in_directory:
626 index_msg = "explicit"
627 is_implied = False
629 # Try to read the index file and catch and report any
630 # problems.
631 try:
632 content = json.loads(possible_index_file.read())
633 index = process_index_data(content, force_dict=True)
634 # mypy should in theory know that this is a mapping
635 # from the overload type annotation of process_index_data.
636 assert isinstance(index, MutableMapping)
637 except Exception as e:
638 # Only trigger the callback if the index file
639 # was asked for explicitly. Triggering on implied file
640 # might be surprising.
641 if not is_implied:
642 self._on_metadata_failure(possible_index_file, e)
643 if self.config.failFast:
644 raise RuntimeError(
645 f"Problem reading index file from {index_msg} location {possible_index_file}"
646 ) from e
647 bad_index_files.add(possible_index_file)
648 continue
650 self.log.debug("Extracted index metadata from %s file %s", index_msg, possible_index_file)
651 good_index_files.add(possible_index_file)
653 # Go through the index adding entries for files.
654 # If we have non-index files in this directory marked for
655 # ingest we should only get index information for those.
656 # If the index file was explicit we use all entries.
657 if is_implied:
658 files_to_ingest = files_in_directory
659 else:
660 files_to_ingest = set(index)
662 # Copy relevant metadata into a single dict for all index
663 # entries.
664 for file_in_dir in files_to_ingest:
665 # Skip an explicitly specified index file.
666 # This should never happen because an explicit index
667 # file will force ingest of all files in the index
668 # and not use the explicit file list. If somehow
669 # this is not true we continue. Raising an exception
670 # seems like the wrong thing to do since this is harmless.
671 if file_in_dir == index_root_file:
672 self.log.info(
673 "Logic error found scanning directory %s. Please file ticket.", directory
674 )
675 continue
676 if file_in_dir in index:
677 file = directory.join(file_in_dir)
678 if file in index_entries:
679 # ObservationInfo overrides raw metadata
680 if isinstance(index[file_in_dir], ObservationInfo) and not isinstance(
681 index_entries[file], ObservationInfo
682 ):
683 self.log.warning(
684 "File %s already specified in an index file but overriding"
685 " with ObservationInfo content from %s",
686 file,
687 possible_index_file,
688 )
689 else:
690 self.log.warning(
691 "File %s already specified in an index file, ignoring content from %s",
692 file,
693 possible_index_file,
694 )
695 # Do nothing in this case
696 continue
698 index_entries[file] = index[file_in_dir]
700 # Remove files from list that have index entries and also
701 # any files that we determined to be explicit index files
702 # or any index files that we failed to read.
703 filtered = set(files) - set(index_entries) - good_index_files - bad_index_files
705 # The filtered list loses the initial order. Retaining the order
706 # is good for testing but does have a cost if there are many
707 # files when copying the good values out. A dict would have faster
708 # lookups (using the files as keys) but use more memory.
709 ordered = [f for f in files if f in filtered]
711 return index_entries, ordered, good_index_files, bad_index_files
713 def processIndexEntries(self, index_entries: dict[ResourcePath, Any]) -> list[RawFileData]:
714 """Convert index entries to RawFileData.
716 Parameters
717 ----------
718 index_entries : `dict` [`ResourcePath`, Any]
719 Dict indexed by name of file to ingest and with keys either
720 raw metadata or translated
721 `~astro_metadata_translator.ObservationInfo`.
723 Returns
724 -------
725 data : `list` [ `RawFileData` ]
726 Structures containing the metadata extracted from the file,
727 as well as the original filename. All fields will be populated,
728 but the data IDs in ``RawFileData.datasets`` will be minimal
729 (unexpanded) `~lsst.daf.butler.DataCoordinate` instances.
730 """
731 fileData = []
732 for filename, metadata in index_entries.items():
733 try:
734 datasets = [self._calculate_dataset_info(metadata, filename)]
735 except Exception as e:
736 self.log.debug("Problem extracting metadata for file %s found in index file: %s", filename, e)
737 datasets = []
738 formatterClass = Formatter
739 instrument = None
740 self._on_metadata_failure(filename, e)
741 if self.config.failFast:
742 raise RuntimeError(
743 f"Problem extracting metadata for file {filename} found in index file"
744 ) from e
745 else:
746 instrument, formatterClass = self._determine_instrument_formatter(
747 datasets[0].dataId, filename
748 )
749 if instrument is None:
750 datasets = []
751 fileData.append(
752 RawFileData(
753 datasets=datasets,
754 filename=filename,
755 # MyPy wants this to be a non-abstract class, which is not
756 # true for the error case where instrument is None and
757 # datasets=[].
758 FormatterClass=formatterClass, # type: ignore
759 instrument=instrument,
760 )
761 )
762 return fileData
764 def groupByExposure(self, files: Iterable[RawFileData]) -> list[RawExposureData]:
765 """Group an iterable of `RawFileData` by exposure.
767 Parameters
768 ----------
769 files : iterable of `RawFileData`
770 File-level information to group.
772 Returns
773 -------
774 exposures : `list` of `RawExposureData`
775 A list of structures that group the file-level information by
776 exposure. All fields will be populated. The
777 `RawExposureData.dataId` attributes will be minimal (unexpanded)
778 `~lsst.daf.butler.DataCoordinate` instances.
779 """
780 exposureDimensions = self.universe["exposure"].graph
781 byExposure = defaultdict(list)
782 for f in files:
783 # Assume that the first dataset is representative for the file.
784 byExposure[f.datasets[0].dataId.subset(exposureDimensions)].append(f)
786 return [
787 RawExposureData(
788 dataId=dataId,
789 files=exposureFiles,
790 universe=self.universe,
791 record=self.makeExposureRecord(exposureFiles[0].datasets[0].obsInfo, self.universe),
792 dependencyRecords=self.makeDependencyRecords(
793 exposureFiles[0].datasets[0].obsInfo, self.universe
794 ),
795 )
796 for dataId, exposureFiles in byExposure.items()
797 ]
799 def makeExposureRecord(
800 self, obsInfo: ObservationInfo, universe: DimensionUniverse, **kwargs: Any
801 ) -> DimensionRecord:
802 """Construct a registry record for an exposure.
804 This is a method that subclasses will often want to customize. This can
805 often be done by calling this base class implementation with additional
806 ``kwargs``.
808 Parameters
809 ----------
810 obsInfo : `ObservationInfo`
811 Observation details for (one of the components of) the exposure.
812 universe : `DimensionUniverse`
813 Set of all known dimensions.
814 **kwargs
815 Additional field values for this record.
817 Returns
818 -------
819 record : `DimensionRecord`
820 The exposure record that must be inserted into the
821 `~lsst.daf.butler.Registry` prior to file-level ingest.
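        Examples
        --------
        A sketch of the customization pattern described above; the subclass
        and the ``my_extra_field`` keyword are hypothetical, and any extra
        keyword must correspond to a field of the exposure dimension record:

        .. code-block:: python

            class MyRawIngestTask(RawIngestTask):
                def makeExposureRecord(self, obsInfo, universe, **kwargs):
                    # Forward additional exposure-record values to the base
                    # class implementation.
                    return super().makeExposureRecord(
                        obsInfo, universe, my_extra_field="value", **kwargs
                    )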
822 """
823 return makeExposureRecordFromObsInfo(obsInfo, universe, **kwargs)
825 def makeDependencyRecords(
826 self, obsInfo: ObservationInfo, universe: DimensionUniverse
827 ) -> dict[str, DimensionRecord]:
828 """Construct dependency records.
830 These dependency records will be inserted into the
831 `~lsst.daf.butler.Registry` before the exposure records, because they
832 are dependencies of the exposure. This allows an opportunity to satisfy
833 foreign key constraints that exist because of dimensions related to the
834 exposure.
836 This is a method that subclasses may want to customize, if they've
837 added dimensions that relate to an exposure.
839 Parameters
840 ----------
841 obsInfo : `ObservationInfo`
842 Observation details for (one of the components of) the exposure.
843 universe : `DimensionUniverse`
844 Set of all known dimensions.
846 Returns
847 -------
848 records : `dict` [`str`, `DimensionRecord`]
849 The records to insert, indexed by dimension name.
850 """
851 return {}
853 def expandDataIds(self, data: RawExposureData) -> RawExposureData:
854 """Expand the data IDs associated with a raw exposure.
856 This adds the metadata records.
858 Parameters
859 ----------
860 data : `RawExposureData`
861 A structure containing information about the exposure to be
862 ingested. Must have `RawExposureData.record` populated. Should
863 be considered consumed upon return.
865 Returns
866 -------
867 exposure : `RawExposureData`
868 An updated version of the input structure, with
869 `RawExposureData.dataId` and nested `RawFileData.dataId` attributes
870 updated to data IDs for which
871 `~lsst.daf.butler.DataCoordinate.hasRecords` returns `True`.
872 """
873 # We start by expanding the exposure-level data ID; we won't use that
874 # directly in file ingest, but this lets us do some database lookups
875 # once per exposure instead of once per file later.
876 data.dataId = self.butler.registry.expandDataId(
877 data.dataId,
878 # We pass in the records we'll be inserting shortly so they aren't
879 # looked up from the database. We do expect instrument and filter
880 # records to be retrieved from the database here (though the
881 # Registry may cache them so there isn't a lookup every time).
882 records={"exposure": data.record},
883 )
884 # Now we expand the per-file (exposure+detector) data IDs. This time
885 # we pass in the records we just retrieved from the exposure data ID
886 # expansion.
887 for file in data.files:
888 for dataset in file.datasets:
889 dataset.dataId = self.butler.registry.expandDataId(
890 dataset.dataId, records=data.dataId.records
891 )
892 return data
894 def prep(
895 self, files: Iterable[ResourcePath], *, pool: PoolType | None = None
896 ) -> tuple[Iterator[RawExposureData], list[ResourcePath]]:
897 """Perform all non-database-updating ingest preprocessing steps.
899 Parameters
900 ----------
901 files : iterable over `lsst.resources.ResourcePath`
902 URIs to the files to be ingested. Will be made absolute
903 if they are not already.
904 pool : `multiprocessing.Pool`, optional
905 If not `None`, a process pool with which to parallelize some
906 operations.
908 Returns
909 -------
910 exposures : `Iterator` [ `RawExposureData` ]
911 Data structures containing dimension records, filenames, and data
912 IDs to be ingested (one structure for each exposure).
913 bad_files : `list` of `lsst.resources.ResourcePath`
914 List of all the files that could not have metadata extracted.
915 """
916 mapFunc = map if pool is None else pool.imap_unordered
918 def _partition_good_bad(
919 file_data: Iterable[RawFileData],
920 ) -> tuple[list[RawFileData], list[ResourcePath]]:
921 """Filter out bad files and return good with list of bad."""
922 good_files = []
923 bad_files = []
924 for fileDatum in self.progress.wrap(file_data, desc="Reading image metadata"):
925 if not fileDatum.datasets:
926 bad_files.append(fileDatum.filename)
927 else:
928 good_files.append(fileDatum)
929 return good_files, bad_files
931 # Look for index files and read them.
932 # There should be far fewer index files than data files.
933 index_entries, files, good_index_files, bad_index_files = self.locateAndReadIndexFiles(files)
934 if bad_index_files:
935 self.log.info("Failed to read the following explicitly requested index files:")
936 for bad in sorted(bad_index_files):
937 self.log.info("- %s", bad)
939 # Now convert all the index file entries to standard form for ingest.
940 processed_bad_index_files: list[ResourcePath] = []
941 indexFileData = self.processIndexEntries(index_entries)
942 if indexFileData:
943 indexFileData, processed_bad_index_files = _partition_good_bad(indexFileData)
944 self.log.info(
945 "Successfully extracted metadata for %d file%s found in %d index file%s with %d failure%s",
946 *_log_msg_counter(indexFileData),
947 *_log_msg_counter(good_index_files),
948 *_log_msg_counter(processed_bad_index_files),
949 )
951 # Extract metadata and build per-detector regions.
952 # This could run in a subprocess so collect all output
953 # before looking at failures.
954 fileData: Iterator[RawFileData] = mapFunc(self.extractMetadata, files)
956 # Filter out all the failed reads and store them for later
957 # reporting.
958 good_file_data, bad_files = _partition_good_bad(fileData)
959 self.log.info(
960 "Successfully extracted metadata from %d file%s with %d failure%s",
961 *_log_msg_counter(good_file_data),
962 *_log_msg_counter(bad_files),
963 )
965 # Combine with data from index files.
966 good_file_data.extend(indexFileData)
967 bad_files.extend(processed_bad_index_files)
968 bad_files.extend(bad_index_files)
970 # Use that metadata to group files (and extracted metadata) by
971 # exposure. Never parallelized because it's intrinsically a gather
972 # step.
973 exposureData: list[RawExposureData] = self.groupByExposure(good_file_data)
975 # The next operation operates on RawExposureData instances (one at
976 # a time) in-place and then returns the modified instance. We call it
977 # as a pass-through instead of relying on the arguments we pass in to
978 # have been modified because in the parallel case those arguments are
979 # going to be pickled and unpickled, and I'm not certain
980 # multiprocessing is careful enough with that for output arguments to
981 # work.
983 # Expand the data IDs to include all dimension metadata; we need this
984 # because we may need to generate path templates that rely on that
985 # metadata.
986 # This is the first step that involves actual database calls (but just
987 # SELECTs), so if there's going to be a problem with connections vs.
988 # multiple processes, or lock contention (in SQLite) slowing things
989 # down, it'll happen here.
990 return mapFunc(self.expandDataIds, exposureData), bad_files
992 def ingestExposureDatasets(
993 self,
994 exposure: RawExposureData,
995 datasetType: DatasetType,
996 *,
997 run: str,
998 skip_existing_exposures: bool = False,
999 track_file_attrs: bool = True,
1000 ) -> list[FileDataset]:
1001 """Ingest all raw files in one exposure.
1003 Parameters
1004 ----------
1005 exposure : `RawExposureData`
1006 A structure containing information about the exposure to be
1007 ingested. Must have `RawExposureData.record` populated and all
1008 data ID attributes expanded.
1009 datasetType : `DatasetType`
1010 The dataset type associated with this exposure.
1011 run : `str`
1012 Name of a RUN-type collection to write to.
1013 skip_existing_exposures : `bool`, optional
1014 If `True` (`False` is default), skip raws that have already been
1015 ingested (i.e. raws for which we already have a dataset with the
1016 same data ID in the target collection, even if from another file).
1017 Note that this is much slower than just not passing
1018 already-ingested files as inputs, because we still need to read and
1019 process metadata to identify which exposures to search for. It
1020 also will not work reliably if multiple processes are attempting to
1021 ingest raws from the same exposure concurrently, in that different
1022 processes may still attempt to ingest the same raw and conflict,
1023 causing a failure that prevents other raws from the same exposure
1024 from being ingested.
1025 track_file_attrs : `bool`, optional
1026 Control whether file attributes such as the size or checksum should
1027 be tracked by the datastore. Whether this parameter is honored
1028 depends on the specific datastore implementation.
1030 Returns
1031 -------
1032 datasets : `list` of `lsst.daf.butler.FileDataset`
1033 Per-file structures identifying the files ingested and their
1034 dataset representation in the data repository.
1035 """
1036 if skip_existing_exposures:
1037 existing = {
1038 ref.dataId
1039 for ref in self.butler.registry.queryDatasets(
1040 datasetType,
1041 collections=[run],
1042 dataId=exposure.dataId,
1043 )
1044 }
1045 else:
1046 existing = set()
1048 # Raw files are preferentially ingested using a UUID derived from
1049 # the collection name and dataId.
1050 if self.butler.registry.supportsIdGenerationMode(DatasetIdGenEnum.DATAID_TYPE_RUN):
1051 mode = DatasetIdGenEnum.DATAID_TYPE_RUN
1052 else:
1053 mode = DatasetIdGenEnum.UNIQUE
1055 datasets = []
1056 for file in exposure.files:
1057 refs = [
1058 DatasetRef(datasetType, d.dataId, run=run, id_generation_mode=mode)
1059 for d in file.datasets
1060 if d.dataId not in existing
1061 ]
1062 if refs:
1063 datasets.append(
1064 FileDataset(path=file.filename.abspath(), refs=refs, formatter=file.FormatterClass)
1065 )
1067 self.butler.ingest(
1068 *datasets,
1069 transfer=self.config.transfer,
1070 record_validation_info=track_file_attrs,
1071 )
1072 return datasets
1074 def ingestFiles(
1075 self,
1076 files: Iterable[ResourcePath],
1077 *,
1078 pool: PoolType | None = None,
1079 processes: int = 1,
1080 run: str | None = None,
1081 skip_existing_exposures: bool = False,
1082 update_exposure_records: bool = False,
1083 track_file_attrs: bool = True,
1084 ) -> tuple[list[DatasetRef], list[ResourcePath], int, int, int]:
1085 """Ingest files into a Butler data repository.
1087 This creates any new exposure or visit Dimension entries needed to
1088 identify the ingested files, creates new Dataset entries in the
1089 Registry and finally ingests the files themselves into the Datastore.
1090 Any needed instrument, detector, and physical_filter Dimension entries
1091 must exist in the Registry before `run` is called.
1093 Parameters
1094 ----------
1095 files : iterable over `lsst.resources.ResourcePath`
1096 URIs to the files to be ingested.
1097 pool : `multiprocessing.Pool`, optional
1098 If not `None`, a process pool with which to parallelize some
1099 operations.
1100 processes : `int`, optional
1101 The number of processes to use. Ignored if ``pool`` is not `None`.
1102 run : `str`, optional
1103 Name of a RUN-type collection to write to, overriding
1104 the default derived from the instrument name.
1105 skip_existing_exposures : `bool`, optional
1106 If `True` (`False` is default), skip raws that have already been
1107 ingested (i.e. raws for which we already have a dataset with the
1108 same data ID in the target collection, even if from another file).
1109 Note that this is much slower than just not passing
1110 already-ingested files as inputs, because we still need to read and
1111 process metadata to identify which exposures to search for. It
1112 also will not work reliably if multiple processes are attempting to
1113 ingest raws from the same exposure concurrently, in that different
1114 processes may still attempt to ingest the same raw and conflict,
1115 causing a failure that prevents other raws from the same exposure
1116 from being ingested.
1117 update_exposure_records : `bool`, optional
1118 If `True` (`False` is default), update existing exposure records
1119 that conflict with the new ones instead of rejecting them. THIS IS
1120 AN ADVANCED OPTION THAT SHOULD ONLY BE USED TO FIX METADATA THAT IS
1121 KNOWN TO BE BAD. This should usually be combined with
1122 ``skip_existing_exposures=True``.
1123 track_file_attrs : `bool`, optional
1124 Control whether file attributes such as the size or checksum should
1125 be tracked by the datastore. Whether this parameter is honored
1126 depends on the specific datastore implementation.
1128 Returns
1129 -------
1130 refs : `list` of `lsst.daf.butler.DatasetRef`
1131 Dataset references for ingested raws.
1132 bad_files : `list` of `ResourcePath`
1133 Given paths that could not be ingested.
1134 n_exposures : `int`
1135 Number of exposures successfully ingested.
1136 n_exposures_failed : `int`
1137 Number of exposures that failed when inserting dimension data.
1138 n_ingests_failed : `int`
1139 Number of exposures that failed when ingesting raw datasets.
1140 """
1141 created_pool = False
1142 if pool is None and processes > 1:
1143 pool = Pool(processes)
1144 created_pool = True
1146 try:
1147 exposureData, bad_files = self.prep(files, pool=pool)
1148 finally:
1149 if created_pool and pool:
1150 # The pool is not needed any more so close it if we created
1151 # it to ensure we clean up resources.
1152 pool.close()
1153 pool.join()
1155 # Up to this point, we haven't modified the data repository at all.
1156 # Now we finally do that, with one transaction per exposure. This is
1157 # not parallelized at present because the performance of this step is
1158 # limited by the database server. That may or may not change in the
1159 # future once we increase our usage of bulk inserts and reduce our
1160 # usage of savepoints; we've tried to get everything but the database
1161 # operations done in advance to reduce the time spent inside
1162 # transactions.
1163 refs = []
1164 runs = set()
1165 datasetTypes: dict[str, DatasetType] = {}
1166 n_exposures = 0
1167 n_exposures_failed = 0
1168 n_ingests_failed = 0
1169 for exposure in self.progress.wrap(exposureData, desc="Ingesting raw exposures"):
1170 assert exposure.record is not None, "Should be guaranteed by prep()"
1171 self.log.debug(
1172 "Attempting to ingest %d file%s from exposure %s:%s",
1173 *_log_msg_counter(exposure.files),
1174 exposure.record.instrument,
1175 exposure.record.obs_id,
1176 )
1178 try:
1179 for name, record in exposure.dependencyRecords.items():
1180 self.butler.registry.syncDimensionData(name, record, update=update_exposure_records)
1181 inserted_or_updated = self.butler.registry.syncDimensionData(
1182 "exposure",
1183 exposure.record,
1184 update=update_exposure_records,
1185 )
1186 except Exception as e:
1187 self._on_ingest_failure(exposure, e)
1188 n_exposures_failed += 1
1189 self.log.warning(
1190 "Exposure %s:%s could not be registered: %s",
1191 exposure.record.instrument,
1192 exposure.record.obs_id,
1193 e,
1194 )
1195 if self.config.failFast:
1196 raise e
1197 continue
1199 if isinstance(inserted_or_updated, dict):
1200 # Exposure is in the registry and we updated it, so
1201 # syncDimensionData returned a dict.
1202 self.log.info(
1203 "Exposure %s:%s was already present, but columns %s were updated.",
1204 exposure.record.instrument,
1205 exposure.record.obs_id,
1206 str(list(inserted_or_updated.keys())),
1207 )
1209 # Determine the instrument so we can work out the dataset type.
1210 instrument = exposure.files[0].instrument
1211 assert (
1212 instrument is not None
1213 ), "file should have been removed from this list by prep if instrument could not be found"
1215 if raw_definition := getattr(instrument, "raw_definition", None):
1216 datasetTypeName, dimensions, storageClass = raw_definition
1217 if not (datasetType := datasetTypes.get(datasetTypeName)):
1218 datasetType = DatasetType(
1219 datasetTypeName, dimensions, storageClass, universe=self.butler.dimensions
1220 )
1221 else:
1222 datasetType = self.datasetType
1223 if datasetType.name not in datasetTypes:
1224 self.butler.registry.registerDatasetType(datasetType)
1225 datasetTypes[datasetType.name] = datasetType
1227 # Override default run if nothing specified explicitly.
1228 if run is None:
1229 this_run = instrument.makeDefaultRawIngestRunName()
1230 else:
1231 this_run = run
1232 if this_run not in runs:
1233 self.butler.registry.registerCollection(this_run, type=CollectionType.RUN)
1234 runs.add(this_run)
1235 try:
1236 datasets_for_exposure = self.ingestExposureDatasets(
1237 exposure,
1238 datasetType=datasetType,
1239 run=this_run,
1240 skip_existing_exposures=skip_existing_exposures,
1241 track_file_attrs=track_file_attrs,
1242 )
1243 except Exception as e:
1244 self._on_ingest_failure(exposure, e)
1245 n_ingests_failed += 1
1246 self.log.warning("Failed to ingest the following for reason: %s", e)
1247 for f in exposure.files:
1248 self.log.warning("- %s", f.filename)
1249 if self.config.failFast:
1250 raise e
1251 continue
1252 else:
1253 self._on_success(datasets_for_exposure)
1254 for dataset in datasets_for_exposure:
1255 refs.extend(dataset.refs)
1257 # Success for this exposure.
1258 n_exposures += 1
1259 self.log.info(
1260 "Exposure %s:%s ingested successfully", exposure.record.instrument, exposure.record.obs_id
1261 )
1263 return refs, bad_files, n_exposures, n_exposures_failed, n_ingests_failed
1265 @timeMethod
1266 def run(
1267 self,
1268 files: Iterable[ResourcePathExpression],
1269 *,
1270 pool: PoolType | None = None,
1271 processes: int = 1,
1272 run: str | None = None,
1273 file_filter: str | re.Pattern = r"\.fit[s]?\b",
1274 group_files: bool = True,
1275 skip_existing_exposures: bool = False,
1276 update_exposure_records: bool = False,
1277 track_file_attrs: bool = True,
1278 ) -> list[DatasetRef]:
1279 """Ingest files into a Butler data repository.
1281 This creates any new exposure or visit Dimension entries needed to
1282 identify the ingested files, creates new Dataset entries in the
1283 Registry and finally ingests the files themselves into the Datastore.
1284 Any needed instrument, detector, and physical_filter Dimension entries
1285 must exist in the Registry before `run` is called.
1287 Parameters
1288 ----------
1289 files : iterable of `lsst.resources.ResourcePath`, `str` or path-like
1290 Paths to the files to be ingested. Can refer to directories.
1291 Will be made absolute if they are not already.
1292 pool : `multiprocessing.Pool`, optional
1293 If not `None`, a process pool with which to parallelize some
1294 operations.
1295 processes : `int`, optional
1296 The number of processes to use. Ignored if ``pool`` is not `None`.
1297 run : `str`, optional
1298 Name of a RUN-type collection to write to, overriding
1299 the default derived from the instrument name.
1300 file_filter : `str` or `re.Pattern`, optional
1301 Pattern to use to discover files to ingest within directories.
1302 The default is to search for FITS files. The regex applies to
1303 files within the directory.
1304 group_files : `bool`, optional
1305 Group files by directory if they have been discovered in
1306 directories. Will not affect files explicitly provided.
1307 skip_existing_exposures : `bool`, optional
1308 If `True` (`False` is default), skip raws that have already been
1309 ingested (i.e. raws for which we already have a dataset with the
1310 same data ID in the target collection, even if from another file).
1311 Note that this is much slower than just not passing
1312 already-ingested files as inputs, because we still need to read and
1313 process metadata to identify which exposures to search for. It
1314 also will not work reliably if multiple processes are attempting to
1315 ingest raws from the same exposure concurrently, in that different
1316 processes may still attempt to ingest the same raw and conflict,
1317 causing a failure that prevents other raws from the same exposure
1318 from being ingested.
1319 update_exposure_records : `bool`, optional
1320 If `True` (`False` is default), update existing exposure records
1321 that conflict with the new ones instead of rejecting them. THIS IS
1322 AN ADVANCED OPTION THAT SHOULD ONLY BE USED TO FIX METADATA THAT IS
1323 KNOWN TO BE BAD. This should usually be combined with
1324 ``skip_existing_exposures=True``.
1325 track_file_attrs : `bool`, optional
1326 Control whether file attributes such as the size or checksum should
1327 be tracked by the datastore. Whether this parameter is honored
1328 depends on the specific datastore implementation.
1330 Returns
1331 -------
1332 refs : `list` of `lsst.daf.butler.DatasetRef`
1333 Dataset references for ingested raws.
1335 Notes
1336 -----
1337 This method inserts all datasets for an exposure within a transaction,
1338 guaranteeing that partial exposures are never ingested. The exposure
1339 dimension record is inserted with `Registry.syncDimensionData` first
1340 (in its own transaction), which inserts only if a record with the same
1341 primary key does not already exist. This allows different files within
1342 the same exposure to be ingested in different runs.
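        Examples
        --------
        A sketch of ingesting every FITS file found under a directory;
        ``task`` is a constructed `RawIngestTask` (see the class docstring)
        and the path is hypothetical:

        .. code-block:: python

            refs = task.run(
                ["/data/raws/2023-11-01/"],
                processes=4,
                file_filter=r"\.fits$",
            )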
1343 """
1344 refs = []
1345 bad_files = []
1346 n_exposures = 0
1347 n_exposures_failed = 0
1348 n_ingests_failed = 0
1349 if group_files:
1350 for group in ResourcePath.findFileResources(files, file_filter, group_files):
1351 new_refs, bad, n_exp, n_exp_fail, n_ingest_fail = self.ingestFiles(
1352 group,
1353 pool=pool,
1354 processes=processes,
1355 run=run,
1356 skip_existing_exposures=skip_existing_exposures,
1357 update_exposure_records=update_exposure_records,
1358 track_file_attrs=track_file_attrs,
1359 )
1360 refs.extend(new_refs)
1361 bad_files.extend(bad)
1362 n_exposures += n_exp
1363 n_exposures_failed += n_exp_fail
1364 n_ingests_failed += n_ingest_fail
1365 else:
1366 refs, bad_files, n_exposures, n_exposures_failed, n_ingests_failed = self.ingestFiles(
1367 ResourcePath.findFileResources(files, file_filter, group_files),
1368 pool=pool,
1369 processes=processes,
1370 run=run,
1371 skip_existing_exposures=skip_existing_exposures,
1372 update_exposure_records=update_exposure_records,
track_file_attrs=track_file_attrs,
1373 )
1375 had_failure = False
1377 if bad_files:
1378 had_failure = True
1379 self.log.warning("Could not extract observation metadata from the following:")
1380 for f in bad_files:
1381 self.log.warning("- %s", f)
1383 self.log.info(
1384 "Successfully processed data from %d exposure%s with %d failure%s from exposure"
1385 " registration and %d failure%s from file ingest.",
1386 *_log_msg_counter(n_exposures),
1387 *_log_msg_counter(n_exposures_failed),
1388 *_log_msg_counter(n_ingests_failed),
1389 )
1390 if n_exposures_failed > 0 or n_ingests_failed > 0:
1391 had_failure = True
1392 self.log.info("Ingested %d distinct Butler dataset%s", *_log_msg_counter(refs))
1394 if had_failure:
1395 raise RuntimeError("Some failures encountered during ingestion")
1397 return refs