Coverage for python/lsst/obs/base/ingest.py: 17% of 351 statements (coverage.py v6.5.0, created at 2023-04-11 02:13 -0700)
1# This file is part of obs_base.
2#
3# Developed for the LSST Data Management System.
4# This product includes software developed by the LSST Project
5# (https://www.lsst.org).
6# See the COPYRIGHT file at the top-level directory of this distribution
7# for details of code ownership.
8#
9# This program is free software: you can redistribute it and/or modify
10# it under the terms of the GNU General Public License as published by
11# the Free Software Foundation, either version 3 of the License, or
12# (at your option) any later version.
13#
14# This program is distributed in the hope that it will be useful,
15# but WITHOUT ANY WARRANTY; without even the implied warranty of
16# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
17# GNU General Public License for more details.
18#
19# You should have received a copy of the GNU General Public License
20# along with this program. If not, see <http://www.gnu.org/licenses/>.
23__all__ = ("RawIngestTask", "RawIngestConfig", "makeTransferChoiceField")
25import json
26import re
27from collections import defaultdict
28from dataclasses import InitVar, dataclass
29from multiprocessing import Pool
30from typing import (
31 Any,
32 Callable,
33 ClassVar,
34 Dict,
35 Iterable,
36 Iterator,
37 List,
38 MutableMapping,
39 Optional,
40 Set,
41 Sized,
42 Tuple,
43 Type,
44 Union,
45)
47from astro_metadata_translator import MetadataTranslator, ObservationInfo, merge_headers
48from astro_metadata_translator.indexing import process_index_data, process_sidecar_data
49from lsst.afw.fits import readMetadata
50from lsst.daf.butler import (
51 Butler,
52 CollectionType,
53 DataCoordinate,
54 DatasetIdGenEnum,
55 DatasetRef,
56 DatasetType,
57 DimensionRecord,
58 DimensionUniverse,
59 FileDataset,
60 Formatter,
61 Progress,
62)
63from lsst.pex.config import ChoiceField, Config, Field
64from lsst.pipe.base import Instrument, Task
65from lsst.resources import ResourcePath, ResourcePathExpression
66from lsst.utils.timer import timeMethod
68from ._instrument import makeExposureRecordFromObsInfo
70# multiprocessing.Pool is actually a function, not a type, and the real type
71# isn't exposed, so we can't use it in annotations; we just punt on it via
72# this alias instead.
73PoolType = Any
76def _do_nothing(*args: Any, **kwargs: Any) -> None:
77 """Do nothing.
79 This is a function that accepts anything and does nothing.
80 For use as a default in callback arguments.
81 """
82 pass
85def _log_msg_counter(noun: Union[int, Sized]) -> Tuple[int, str]:
86 """Count the iterable and return the count and plural modifier.
88 Parameters
89 ----------
90 noun : `Sized` or `int`
91 Thing to count. If given an integer it is assumed to be the count
92 to use when calculating the modifier.
94 Returns
95 -------
96 num : `int`
97 Number of items found in ``noun``.
98 modifier : `str`
99 Character to append to a string referring to these items to
100 indicate whether there was a single item or not. Returns an empty
101 string if there is exactly one item, or "s" otherwise.
103 Examples
104 --------
106 .. code-block:: python
108 log.warning("Found %d file%s", *_log_msg_counter(nfiles))
109 """
110 if isinstance(noun, int):
111 num = noun
112 else:
113 num = len(noun)
114 return num, "" if num == 1 else "s"
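# Illustrative doctest-style sketch (not part of the module's tests): the
# helper accepts either a sized collection or a pre-computed count.
#
#     >>> _log_msg_counter(["a.fits", "b.fits"])
#     (2, 's')
#     >>> _log_msg_counter(1)
#     (1, '')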
117@dataclass
118class RawFileDatasetInfo:
119 """Information about a single dataset within a raw file."""
121 dataId: DataCoordinate
122 """Data ID for this file (`lsst.daf.butler.DataCoordinate`)."""
124 obsInfo: ObservationInfo
125 """Standardized observation metadata extracted directly from the file
126 headers (`astro_metadata_translator.ObservationInfo`).
127 """
130@dataclass
131class RawFileData:
132 """Information about a single raw file, used during ingest."""
134 datasets: List[RawFileDatasetInfo]
135 """The information describing each dataset within this raw file.
136 (`list` of `RawFileDatasetInfo`)
137 """
139 filename: ResourcePath
140 """URI of the file this information was extracted from
(`lsst.resources.ResourcePath`).
142 This is the path prior to ingest, not the path after ingest.
143 """
145 FormatterClass: Type[Formatter]
146 """Formatter class that should be used to ingest this file (`type`; as
147 subclass of `Formatter`).
148 """
150 instrument: Optional[Instrument]
151 """The `Instrument` instance associated with this file. Can be `None`
152 if ``datasets`` is an empty list."""
155@dataclass
156class RawExposureData:
157 """Information about a complete raw exposure, used during ingest."""
159 dataId: DataCoordinate
160 """Data ID for this exposure (`lsst.daf.butler.DataCoordinate`).
161 """
163 files: List[RawFileData]
164 """List of structures containing file-level information.
165 """
167 universe: InitVar[DimensionUniverse]
168 """Set of all known dimensions.
169 """
171 record: DimensionRecord
172 """The exposure `DimensionRecord` that must be inserted into the
173 `~lsst.daf.butler.Registry` prior to file-level ingest (`DimensionRecord`).
174 """
176 dependencyRecords: Dict[str, DimensionRecord]
177 """Additional records that must be inserted into the
178 `~lsst.daf.butler.Registry` prior to ingesting the exposure ``record``
179 (e.g., to satisfy foreign key constraints), indexed by the dimension name.
180 """
183def makeTransferChoiceField(
184 doc: str = "How to transfer files (None for no transfer).", default: str = "auto"
185) -> ChoiceField:
186 """Create a Config field with options for transferring data between repos.
188 The allowed options for the field are exactly those supported by
189 `lsst.daf.butler.Datastore.ingest`.
191 Parameters
192 ----------
193 doc : `str`
194 Documentation for the configuration field.
195 default : `str`, optional
196 Default transfer mode for the field.
198 Returns
199 -------
200 field : `lsst.pex.config.ChoiceField`
201 Configuration field.
202 """
203 return ChoiceField(
204 doc=doc,
205 dtype=str,
206 allowed={
207 "move": "move",
208 "copy": "copy",
209 "auto": "choice will depend on datastore",
210 "direct": "use URI to ingested file directly in datastore",
211 "link": "hard link falling back to symbolic link",
212 "hardlink": "hard link",
213 "symlink": "symbolic (soft) link",
214 "relsymlink": "relative symbolic link",
215 },
216 optional=True,
217 default=default,
218 )
221class RawIngestConfig(Config):
222 """Configuration class for RawIngestTask."""
224 transfer = makeTransferChoiceField()
225 failFast: Field[bool] = Field(
226 dtype=bool,
227 default=False,
228 doc="If True, stop ingest as soon as any problem is encountered with any file. "
229 "Otherwise problem files will be skipped and logged and a report issued at completion.",
230 )
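# Configuration sketch (illustrative values, nothing here is executed): the
# ``transfer`` field accepts only the modes defined in
# makeTransferChoiceField, and ``failFast`` aborts on the first problem file.
#
#     >>> config = RawIngestConfig()
#     >>> config.transfer = "symlink"
#     >>> config.failFast = True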
233class RawIngestTask(Task):
234 """Driver Task for ingesting raw data into Gen3 Butler repositories.
236 Parameters
237 ----------
238 config : `RawIngestConfig`
239 Configuration for the task.
240 butler : `~lsst.daf.butler.Butler`
241 Writeable butler instance, with ``butler.run`` set to the appropriate
242 `~lsst.daf.butler.CollectionType.RUN` collection for these raw
243 datasets.
244 on_success : `Callable`, optional
245 A callback invoked when all of the raws associated with an exposure
246 are ingested. Will be passed a list of `FileDataset` objects, each
247 containing one or more resolved `DatasetRef` objects. If this callback
248 raises it will interrupt the entire ingest process, even if
249 `RawIngestConfig.failFast` is `False`.
250 on_metadata_failure : `Callable`, optional
251 A callback invoked when a failure occurs trying to translate the
252 metadata for a file. Will be passed the URI and the exception, in
253 that order, as positional arguments. Guaranteed to be called in an
254 ``except`` block, allowing the callback to re-raise or replace (with
255 ``raise ... from``) to override the task's usual error handling (before
256 `RawIngestConfig.failFast` logic occurs).
257 on_ingest_failure : `Callable`, optional
258 A callback invoked when dimension record or dataset insertion into the
259 database fails for an exposure. Will be passed a `RawExposureData`
260 instance and the exception, in that order, as positional arguments.
261 Guaranteed to be called in an ``except`` block, allowing the callback
262 to re-raise or replace (with ``raise ... from``) to override the task's
263 usual error handling (before `RawIngestConfig.failFast` logic occurs).
264 **kwargs
265 Additional keyword arguments are forwarded to the `lsst.pipe.base.Task`
266 constructor.
268 Notes
269 -----
270 Each instance of `RawIngestTask` writes to the same Butler. Each
271 invocation of `RawIngestTask.run` ingests a list of files.
272 """
274 ConfigClass: ClassVar[Type[Config]] = RawIngestConfig
276 _DefaultName: ClassVar[str] = "ingest"
278 def getDatasetType(self) -> DatasetType:
279 """Return the default DatasetType of the datasets ingested by this
280 Task.
282 Returns
283 -------
284 datasetType : `DatasetType`
285 The default dataset type to use for the data being ingested. This
286 is only used if the relevant `~lsst.pipe.base.Instrument` does not
287 define an override.
288 """
289 return DatasetType(
290 "raw",
291 ("instrument", "detector", "exposure"),
292 "Exposure",
293 universe=self.butler.registry.dimensions,
294 )
296 # Mypy cannot determine that the config passed to super() is this type.
297 config: RawIngestConfig
299 def __init__(
300 self,
301 config: RawIngestConfig,
302 *,
303 butler: Butler,
304 on_success: Callable[[List[FileDataset]], Any] = _do_nothing,
305 on_metadata_failure: Callable[[ResourcePath, Exception], Any] = _do_nothing,
306 on_ingest_failure: Callable[[RawExposureData, Exception], Any] = _do_nothing,
307 **kwargs: Any,
308 ):
309 config.validate() # Not a CmdlineTask nor PipelineTask, so have to validate the config here.
310 super().__init__(config, **kwargs)
311 self.butler = butler
312 self.universe = self.butler.registry.dimensions
313 self.datasetType = self.getDatasetType()
314 self._on_success = on_success
315 self._on_metadata_failure = on_metadata_failure
316 self._on_ingest_failure = on_ingest_failure
317 self.progress = Progress("obs.base.RawIngestTask")
319 # Import all the instrument classes so that we ensure that we
320 # have all the relevant metadata translators loaded.
321 Instrument.importAll(self.butler.registry)
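    # Construction sketch (the repository path and run collection name are
    # hypothetical): the callbacks are optional and default to no-ops.
    #
    #     >>> butler = Butler("/path/to/repo", writeable=True, run="MyCam/raw/all")
    #     >>> task = RawIngestTask(
    #     ...     RawIngestConfig(),
    #     ...     butler=butler,
    #     ...     on_metadata_failure=lambda uri, exc: print(f"metadata failed: {uri}"),
    #     ... )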
323 def _reduce_kwargs(self) -> Dict[str, Any]:
324 # Add extra parameters to pickle.
325 return dict(
326 **super()._reduce_kwargs(),
327 butler=self.butler,
328 on_success=self._on_success,
329 on_metadata_failure=self._on_metadata_failure,
330 on_ingest_failure=self._on_ingest_failure,
331 )
333 def _determine_instrument_formatter(
334 self, dataId: DataCoordinate, filename: ResourcePath
335 ) -> Tuple[Optional[Instrument], Type[Formatter]]:
336 """Determine the instrument and formatter class.
338 Parameters
339 ----------
340 dataId : `lsst.daf.butler.DataCoordinate`
341 The dataId associated with this dataset.
342 filename : `lsst.resources.ResourcePath`
343 URI of file used for error reporting.
345 Returns
346 -------
347 instrument : `Instrument` or `None`
348 Instance of the `Instrument` associated with this dataset. `None`
349 indicates that the instrument could not be determined.
350 formatterClass : `type`
351 Class to be used as the formatter for this dataset.
352 """
353 # The data model currently assumes that whilst multiple datasets
354 # can be associated with a single file, they must all share the
355 # same formatter.
356 try:
357 instrument = Instrument.fromName(dataId["instrument"], self.butler.registry) # type: ignore
358 except LookupError as e:
359 self._on_metadata_failure(filename, e)
360 self.log.warning(
361 "Instrument %s for file %s not known to registry", dataId["instrument"], filename
362 )
363 if self.config.failFast:
364 raise RuntimeError(
365 f"Instrument {dataId['instrument']} for file {filename} not known to registry"
366 ) from e
367 FormatterClass = Formatter
368 # Indicate that we could not work out the instrument.
369 instrument = None
370 else:
371 assert instrument is not None, "Should be guaranteed by fromName succeeding."
372 FormatterClass = instrument.getRawFormatter(dataId)
373 return instrument, FormatterClass
375 def extractMetadata(self, filename: ResourcePath) -> RawFileData:
376 """Extract and process metadata from a single raw file.
378 Parameters
379 ----------
380 filename : `lsst.resources.ResourcePath`
381 URI to the file.
383 Returns
384 -------
385 data : `RawFileData`
386 A structure containing the metadata extracted from the file,
387 as well as the original filename. All fields will be populated,
388 but the `RawFileData.dataId` attribute will be a minimal
389 (unexpanded) `~lsst.daf.butler.DataCoordinate` instance. The
390 ``instrument`` field will be `None` if there is a problem
391 with metadata extraction.
393 Notes
394 -----
395 Assumes that there is a single dataset associated with the given
396 file. Instruments using a single file to store multiple datasets
397 must implement their own version of this method.
399 By default the method will catch all exceptions unless the ``failFast``
400 configuration item is `True`. If an error is encountered the
401 `_on_metadata_failure()` method will be called. If that callback does
402 not raise and an error was encountered, the returned object will have
403 a `None` instrument and no datasets.
405 This method supports sidecar JSON files which can be used to
406 extract metadata without having to read the data file itself.
407 The sidecar file is always used if found.
408 """
409 sidecar_fail_msg = "" # Requires prepended space when set.
410 try:
411 sidecar_file = filename.updatedExtension(".json")
412 if sidecar_file.exists():
413 content = json.loads(sidecar_file.read())
414 headers = [process_sidecar_data(content)]
415 sidecar_fail_msg = " (via sidecar)"
416 else:
417 # Read the metadata from the data file itself.
419 # For remote files download the entire file to get the
420 # header. This is very inefficient and it would be better
421 # to have some way of knowing where in the file the headers
422 # are and to only download those parts of the file.
423 with filename.as_local() as local_file:
424 # Read the primary. This might be sufficient.
425 header = readMetadata(local_file.ospath, 0)
427 try:
428 # Try to work out a translator class early.
429 translator_class = MetadataTranslator.determine_translator(
430 header, filename=str(filename)
431 )
432 except ValueError:
433 # Primary header was not sufficient (maybe this file
434 # has been compressed or is a MEF with minimal
435 # primary). Read second header and merge with primary.
436 header = merge_headers([header, readMetadata(local_file.ospath, 1)], mode="overwrite")
438 # Try again to work out a translator class, letting this
439 # fail.
440 translator_class = MetadataTranslator.determine_translator(header, filename=str(filename))
442 # Request the headers to use for ingest
443 headers = list(translator_class.determine_translatable_headers(local_file.ospath, header))
445 # Add each header to the dataset list
446 datasets = [self._calculate_dataset_info(h, filename) for h in headers]
448 except Exception as e:
449 self.log.debug("Problem extracting metadata from %s%s: %s", filename, sidecar_fail_msg, e)
450 # Indicate to the caller that we failed to read.
451 datasets = []
452 formatterClass = Formatter
453 instrument = None
454 self._on_metadata_failure(filename, e)
455 if self.config.failFast:
456 raise RuntimeError(
457 f"Problem extracting metadata for file {filename}{sidecar_fail_msg}"
458 ) from e
459 else:
460 self.log.debug("Extracted metadata for file %s%s", filename, sidecar_fail_msg)
461 # The data model currently assumes that whilst multiple datasets
462 # can be associated with a single file, they must all share the
463 # same formatter.
464 instrument, formatterClass = self._determine_instrument_formatter(datasets[0].dataId, filename)
465 if instrument is None:
466 datasets = []
468 return RawFileData(
469 datasets=datasets,
470 filename=filename,
471 # MyPy wants this to be a non-abstract class, which is not true
472 # for the error case where instrument is None and datasets=[].
473 FormatterClass=formatterClass, # type: ignore
474 instrument=instrument,
475 )
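    # Sidecar convention sketch (hypothetical paths): for a raw named
    # ``exposure_01.fits`` a sidecar ``exposure_01.json`` in the same
    # directory is used in preference to reading the FITS headers.
    #
    #     >>> raw = ResourcePath("file:///data/exposure_01.fits")
    #     >>> print(raw.updatedExtension(".json"))
    #     file:///data/exposure_01.json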
477 @classmethod
478 def getObservationInfoSubsets(cls) -> Tuple[Set, Set]:
479 """Return the subsets of fields in `ObservationInfo` that we care about.
481 These fields will be used in constructing an exposure record.
483 Returns
484 -------
485 required : `set`
486 Set of `ObservationInfo` field names that are required.
487 optional : `set`
488 Set of `ObservationInfo` field names we will use if they are
489 available.
490 """
491 # Marking the new properties "group_counter_*" and
492 # "has_simulated_content" as required assumes that we either
493 # recreate any existing index/sidecar files that include translated
494 # values, or else allow astro_metadata_translator to fill in
495 # defaults.
496 required = {
497 "datetime_begin",
498 "datetime_end",
499 "detector_num",
500 "exposure_id",
501 "exposure_time",
502 "group_counter_end",
503 "group_counter_start",
504 "has_simulated_content",
505 "instrument",
506 "observation_id",
507 "observation_type",
508 "physical_filter",
509 }
510 optional = {
511 "altaz_begin",
512 "boresight_rotation_coord",
513 "boresight_rotation_angle",
514 "dark_time",
515 "exposure_group",
516 "tracking_radec",
517 "object",
518 "observation_counter",
519 "observation_reason",
520 "observing_day",
521 "science_program",
522 "visit_id",
523 }
524 return required, optional
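    # Customization sketch (the subclass and the extra property name are
    # hypothetical): an instrument-specific task can extend the sets returned
    # here, and _calculate_dataset_info will honor the additions.
    #
    #     >>> class MyCamRawIngestTask(RawIngestTask):
    #     ...     @classmethod
    #     ...     def getObservationInfoSubsets(cls):
    #     ...         required, optional = super().getObservationInfoSubsets()
    #     ...         optional.add("focus_z")  # hypothetical extra field
    #     ...         return required, optional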
526 def _calculate_dataset_info(
527 self, header: Union[MutableMapping[str, Any], ObservationInfo], filename: ResourcePath
528 ) -> RawFileDatasetInfo:
529 """Calculate a RawFileDatasetInfo from the supplied information.
531 Parameters
532 ----------
533 header : Mapping or `astro_metadata_translator.ObservationInfo`
534 Header from the dataset or previously-translated content.
535 filename : `lsst.resources.ResourcePath`
536 Filename to use for error messages.
538 Returns
539 -------
540 dataset : `RawFileDatasetInfo`
541 The dataId, and observation information associated with this
542 dataset.
543 """
544 required, optional = self.getObservationInfoSubsets()
545 if isinstance(header, ObservationInfo):
546 obsInfo = header
547 missing = []
548 # Need to check the required properties are present.
549 for property in required:
550 # getattr does not need to be protected because it is using
551 # the defined list above containing properties that must exist.
552 value = getattr(obsInfo, property)
553 if value is None:
554 missing.append(property)
555 if missing:
556 raise ValueError(
557 f"Requested required properties are missing from file {filename}: {missing} (via JSON)"
558 )
560 else:
561 obsInfo = ObservationInfo(
562 header,
563 pedantic=False,
564 filename=str(filename),
565 required=required,
566 subset=required | optional,
567 )
569 dataId = DataCoordinate.standardize(
570 instrument=obsInfo.instrument,
571 exposure=obsInfo.exposure_id,
572 detector=obsInfo.detector_num,
573 universe=self.universe,
574 )
575 return RawFileDatasetInfo(obsInfo=obsInfo, dataId=dataId)
577 def locateAndReadIndexFiles(
578 self, files: Iterable[ResourcePath]
579 ) -> Tuple[Dict[ResourcePath, Any], List[ResourcePath], Set[ResourcePath], Set[ResourcePath]]:
580 """Given a list of files, look for index files and read them.
582 Index files can either be explicitly in the list of files to
583 ingest, or else located in the same directory as a file to ingest.
584 Index entries are always used if present.
586 Parameters
587 ----------
588 files : iterable over `lsst.resources.ResourcePath`
589 URIs to the files to be ingested.
591 Returns
592 -------
593 index : `dict` [`ResourcePath`, Any]
594 Merged contents of all relevant index files found. These can
595 be explicitly specified index files or ones found in the
596 directory alongside a data file to be ingested.
597 updated_files : `list` of `ResourcePath`
598 Updated list of the input files with entries removed that were
599 found listed in an index file. Order is not guaranteed to
600 match the order of the files given to this routine.
601 good_index_files : `set` [`ResourcePath`]
602 Index files that were successfully read.
603 bad_index_files : `set` [`ResourcePath`]
604 Files that looked like index files but failed to read properly.
605 """
606 # Convert the paths to absolute for easy comparison with index content.
607 # Do not convert to real paths since we have to assume that index
608 # files are in this location and not the location that they link to.
609 files = tuple(f.abspath() for f in files)
611 # Index files must be named this.
612 index_root_file = "_index.json"
614 # Group the files by directory.
615 files_by_directory = defaultdict(set)
617 for path in files:
618 directory, file_in_dir = path.split()
619 files_by_directory[directory].add(file_in_dir)
621 # All the metadata read from index files with keys of full path.
622 index_entries: Dict[ResourcePath, Any] = {}
624 # Index files we failed to read.
625 bad_index_files = set()
627 # Any good index files that were found and used.
628 good_index_files = set()
630 # Look for index files in those directories.
631 for directory, files_in_directory in files_by_directory.items():
632 possible_index_file = directory.join(index_root_file)
633 if possible_index_file.exists():
634 # If we are explicitly requesting an index file the
635 # messages should be different.
636 index_msg = "inferred"
637 is_implied = True
638 if index_root_file in files_in_directory:
639 index_msg = "explicit"
640 is_implied = False
642 # Try to read the index file and catch and report any
643 # problems.
644 try:
645 content = json.loads(possible_index_file.read())
646 index = process_index_data(content, force_dict=True)
647 # mypy should in theory know that this is a mapping
648 # from the overload type annotation of process_index_data.
649 assert isinstance(index, MutableMapping)
650 except Exception as e:
651 # Only trigger the callback if the index file
652 # was asked for explicitly. Triggering on implied file
653 # might be surprising.
654 if not is_implied:
655 self._on_metadata_failure(possible_index_file, e)
656 if self.config.failFast:
657 raise RuntimeError(
658 f"Problem reading index file from {index_msg} location {possible_index_file}"
659 ) from e
660 bad_index_files.add(possible_index_file)
661 continue
663 self.log.debug("Extracted index metadata from %s file %s", index_msg, possible_index_file)
664 good_index_files.add(possible_index_file)
666 # Go through the index adding entries for files.
667 # If we have non-index files in this directory marked for
668 # ingest we should only get index information for those.
669 # If the index file was explicit we use all entries.
670 if is_implied:
671 files_to_ingest = files_in_directory
672 else:
673 files_to_ingest = set(index)
675 # Copy relevant metadata into a single dict for all index
676 # entries.
677 for file_in_dir in files_to_ingest:
678 # Skip an explicitly specified index file.
679 # This should never happen because an explicit index
680 # file will force ingest of all files in the index
681 # and not use the explicit file list. If somehow
682 # this is not true we continue. Raising an exception
683 # seems like the wrong thing to do since this is harmless.
684 if file_in_dir == index_root_file:
685 self.log.info(
686 "Logic error found scanning directory %s. Please file ticket.", directory
687 )
688 continue
689 if file_in_dir in index:
690 file = directory.join(file_in_dir)
691 if file in index_entries:
692 # ObservationInfo overrides raw metadata
693 if isinstance(index[file_in_dir], ObservationInfo) and not isinstance(
694 index_entries[file], ObservationInfo
695 ):
696 self.log.warning(
697 "File %s already specified in an index file but overriding"
698 " with ObservationInfo content from %s",
699 file,
700 possible_index_file,
701 )
702 else:
703 self.log.warning(
704 "File %s already specified in an index file, ignoring content from %s",
705 file,
706 possible_index_file,
707 )
708 # Do nothing in this case
709 continue
711 index_entries[file] = index[file_in_dir]
713 # Remove files from list that have index entries and also
714 # any files that we determined to be explicit index files
715 # or any index files that we failed to read.
716 filtered = set(files) - set(index_entries) - good_index_files - bad_index_files
718 # The filtered list loses the initial order. Retaining the order
719 # is good for testing but does have a cost if there are many
720 # files when copying the good values out. A dict would have faster
721 # lookups (using the files as keys) but use more memory.
722 ordered = [f for f in files if f in filtered]
724 return index_entries, ordered, good_index_files, bad_index_files
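    # Index-file sketch (hypothetical directory layout): an ``_index.json``
    # sitting next to the raws is picked up automatically, and any file that
    # has an index entry is dropped from the returned ``updated_files`` list.
    #
    #     >>> files = [ResourcePath("file:///data/night1/exp_01.fits")]
    #     >>> index, remaining, good, bad = task.locateAndReadIndexFiles(files)
    #     >>> list(index)  # keys are absolute ResourcePath instances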
726 def processIndexEntries(self, index_entries: Dict[ResourcePath, Any]) -> List[RawFileData]:
727 """Convert index entries to RawFileData.
729 Parameters
730 ----------
731 index_entries : `dict` [`ResourcePath`, Any]
732 Dict indexed by the name of the file to ingest, with values that
733 are either raw metadata or translated
734 `~astro_metadata_translator.ObservationInfo`.
736 Returns
737 -------
738 data : `list` [ `RawFileData` ]
739 Structures containing the metadata extracted from the file,
740 as well as the original filename. All fields will be populated,
741 but the `RawFileData.dataId` attributes will be minimal
742 (unexpanded) `~lsst.daf.butler.DataCoordinate` instances.
743 """
744 fileData = []
745 for filename, metadata in index_entries.items():
746 try:
747 datasets = [self._calculate_dataset_info(metadata, filename)]
748 except Exception as e:
749 self.log.debug("Problem extracting metadata for file %s found in index file: %s", filename, e)
750 datasets = []
751 formatterClass = Formatter
752 instrument = None
753 self._on_metadata_failure(filename, e)
754 if self.config.failFast:
755 raise RuntimeError(
756 f"Problem extracting metadata for file {filename} found in index file"
757 ) from e
758 else:
759 instrument, formatterClass = self._determine_instrument_formatter(
760 datasets[0].dataId, filename
761 )
762 if instrument is None:
763 datasets = []
764 fileData.append(
765 RawFileData(
766 datasets=datasets,
767 filename=filename,
768 # MyPy wants this to be a non-abstract class, which is not
769 # true for the error case where instrument is None and
770 # datasets=[].
771 FormatterClass=formatterClass, # type: ignore
772 instrument=instrument,
773 )
774 )
775 return fileData
777 def groupByExposure(self, files: Iterable[RawFileData]) -> List[RawExposureData]:
778 """Group an iterable of `RawFileData` by exposure.
780 Parameters
781 ----------
782 files : iterable of `RawFileData`
783 File-level information to group.
785 Returns
786 -------
787 exposures : `list` of `RawExposureData`
788 A list of structures that group the file-level information by
789 exposure. All fields will be populated. The
790 `RawExposureData.dataId` attributes will be minimal (unexpanded)
791 `~lsst.daf.butler.DataCoordinate` instances.
792 """
793 exposureDimensions = self.universe["exposure"].graph
794 byExposure = defaultdict(list)
795 for f in files:
796 # Assume that the first dataset is representative for the file.
797 byExposure[f.datasets[0].dataId.subset(exposureDimensions)].append(f)
799 return [
800 RawExposureData(
801 dataId=dataId,
802 files=exposureFiles,
803 universe=self.universe,
804 record=self.makeExposureRecord(exposureFiles[0].datasets[0].obsInfo, self.universe),
805 dependencyRecords=self.makeDependencyRecords(
806 exposureFiles[0].datasets[0].obsInfo, self.universe
807 ),
808 )
809 for dataId, exposureFiles in byExposure.items()
810 ]
812 def makeExposureRecord(
813 self, obsInfo: ObservationInfo, universe: DimensionUniverse, **kwargs: Any
814 ) -> DimensionRecord:
815 """Construct a registry record for an exposure.
817 This is a method that subclasses will often want to customize. This can
818 often be done by calling this base class implementation with additional
819 ``kwargs``.
821 Parameters
822 ----------
823 obsInfo : `ObservationInfo`
824 Observation details for (one of the components of) the exposure.
825 universe : `DimensionUniverse`
826 Set of all known dimensions.
827 **kwargs
828 Additional field values for this record.
830 Returns
831 -------
832 record : `DimensionRecord`
833 The exposure record that must be inserted into the
834 `~lsst.daf.butler.Registry` prior to file-level ingest.
835 """
836 return makeExposureRecordFromObsInfo(obsInfo, universe, **kwargs)
838 def makeDependencyRecords(
839 self, obsInfo: ObservationInfo, universe: DimensionUniverse
840 ) -> Dict[str, DimensionRecord]:
841 """Construct dependency records.
843 These dependency records will be inserted into the
844 `~lsst.daf.butler.Registry` before the exposure records, because they
845 are dependencies of the exposure. This allows an opportunity to satisfy
846 foreign key constraints that exist because of dimensions related to the
847 exposure.
849 This is a method that subclasses may want to customize, if they've
850 added dimensions that relate to an exposure.
852 Parameters
853 ----------
854 obsInfo : `ObservationInfo`
855 Observation details for (one of the components of) the exposure.
856 universe : `DimensionUniverse`
857 Set of all known dimensions.
859 Returns
860 -------
861 records : `dict` [`str`, `DimensionRecord`]
862 The records to insert, indexed by dimension name.
863 """
864 return {}
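    # Override sketch (the ``group`` dimension and its record fields are
    # hypothetical): a subclass that adds a dimension related to the exposure
    # can return the extra record here so it is synced before the exposure
    # record itself.
    #
    #     >>> class MyCamRawIngestTask(RawIngestTask):
    #     ...     def makeDependencyRecords(self, obsInfo, universe):
    #     ...         records = super().makeDependencyRecords(obsInfo, universe)
    #     ...         records["group"] = universe["group"].RecordClass(
    #     ...             instrument=obsInfo.instrument, name=obsInfo.exposure_group
    #     ...         )
    #     ...         return records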
866 def expandDataIds(self, data: RawExposureData) -> RawExposureData:
867 """Expand the data IDs associated with a raw exposure.
869 This adds the metadata records.
871 Parameters
872 ----------
873 data : `RawExposureData`
874 A structure containing information about the exposure to be
875 ingested. Must have `RawExposureData.record` populated. Should
876 be considered consumed upon return.
878 Returns
879 -------
880 exposure : `RawExposureData`
881 An updated version of the input structure, with
882 `RawExposureData.dataId` and nested `RawFileData.dataId` attributes
883 updated to data IDs for which
884 `~lsst.daf.butler.DataCoordinate.hasRecords` returns `True`.
885 """
886 # We start by expanding the exposure-level data ID; we won't use that
887 # directly in file ingest, but this lets us do some database lookups
888 # once per exposure instead of once per file later.
889 data.dataId = self.butler.registry.expandDataId(
890 data.dataId,
891 # We pass in the records we'll be inserting shortly so they aren't
892 # looked up from the database. We do expect instrument and filter
893 # records to be retrieved from the database here (though the
894 # Registry may cache them so there isn't a lookup every time).
895 records={"exposure": data.record},
896 )
897 # Now we expand the per-file (exposure+detector) data IDs. This time
898 # we pass in the records we just retrieved from the exposure data ID
899 # expansion.
900 for file in data.files:
901 for dataset in file.datasets:
902 dataset.dataId = self.butler.registry.expandDataId(
903 dataset.dataId, records=data.dataId.records
904 )
905 return data
907 def prep(
908 self, files: Iterable[ResourcePath], *, pool: Optional[PoolType] = None, processes: int = 1
909 ) -> Tuple[Iterator[RawExposureData], List[ResourcePath]]:
910 """Perform all non-database-updating ingest preprocessing steps.
912 Parameters
913 ----------
914 files : iterable over `lsst.resources.ResourcePath`
915 URIs of the files to be ingested. Will be made absolute
916 if they are not already.
917 pool : `multiprocessing.Pool`, optional
918 If not `None`, a process pool with which to parallelize some
919 operations.
920 processes : `int`, optional
921 The number of processes to use. Ignored if ``pool`` is not `None`.
923 Returns
924 -------
925 exposures : `Iterator` [ `RawExposureData` ]
926 Data structures containing dimension records, filenames, and data
927 IDs to be ingested (one structure for each exposure).
928 bad_files : `list` of `lsst.resources.ResourcePath`
929 List of all the files that could not have metadata extracted.
930 """
931 if pool is None and processes > 1:
932 pool = Pool(processes)
933 mapFunc = map if pool is None else pool.imap_unordered
935 def _partition_good_bad(
936 file_data: Iterable[RawFileData],
937 ) -> Tuple[List[RawFileData], List[ResourcePath]]:
938 """Filter out bad files and return good with list of bad."""
939 good_files = []
940 bad_files = []
941 for fileDatum in self.progress.wrap(file_data, desc="Reading image metadata"):
942 if not fileDatum.datasets:
943 bad_files.append(fileDatum.filename)
944 else:
945 good_files.append(fileDatum)
946 return good_files, bad_files
948 # Look for index files and read them.
949 # There should be far fewer index files than data files.
950 index_entries, files, good_index_files, bad_index_files = self.locateAndReadIndexFiles(files)
951 if bad_index_files:
952 self.log.info("Failed to read the following explicitly requested index files:")
953 for bad in sorted(bad_index_files):
954 self.log.info("- %s", bad)
956 # Now convert all the index file entries to standard form for ingest.
957 processed_bad_index_files: List[ResourcePath] = []
958 indexFileData = self.processIndexEntries(index_entries)
959 if indexFileData:
960 indexFileData, processed_bad_index_files = _partition_good_bad(indexFileData)
961 self.log.info(
962 "Successfully extracted metadata for %d file%s found in %d index file%s with %d failure%s",
963 *_log_msg_counter(indexFileData),
964 *_log_msg_counter(good_index_files),
965 *_log_msg_counter(processed_bad_index_files),
966 )
968 # Extract metadata and build per-detector regions.
969 # This could run in a subprocess so collect all output
970 # before looking at failures.
971 fileData: Iterator[RawFileData] = mapFunc(self.extractMetadata, files)
973 # Filter out all the failed reads and store them for later
974 # reporting.
975 good_file_data, bad_files = _partition_good_bad(fileData)
976 self.log.info(
977 "Successfully extracted metadata from %d file%s with %d failure%s",
978 *_log_msg_counter(good_file_data),
979 *_log_msg_counter(bad_files),
980 )
982 # Combine with data from index files.
983 good_file_data.extend(indexFileData)
984 bad_files.extend(processed_bad_index_files)
985 bad_files.extend(bad_index_files)
987 # Use that metadata to group files (and extracted metadata) by
988 # exposure. Never parallelized because it's intrinsically a gather
989 # step.
990 exposureData: List[RawExposureData] = self.groupByExposure(good_file_data)
992 # The next operation operates on RawExposureData instances (one at
993 # a time) in-place and then returns the modified instance. We call it
994 # as a pass-through instead of relying on the arguments we pass in to
995 # have been modified because in the parallel case those arguments are
996 # going to be pickled and unpickled, and I'm not certain
997 # multiprocessing is careful enough with that for output arguments to
998 # work.
1000 # Expand the data IDs to include all dimension metadata; we need this
1001 # because we may need to generate path templates that rely on that
1002 # metadata.
1003 # This is the first step that involves actual database calls (but just
1004 # SELECTs), so if there's going to be a problem with connections vs.
1005 # multiple processes, or lock contention (in SQLite) slowing things
1006 # down, it'll happen here.
1007 return mapFunc(self.expandDataIds, exposureData), bad_files
1009 def ingestExposureDatasets(
1010 self,
1011 exposure: RawExposureData,
1012 datasetType: DatasetType,
1013 *,
1014 run: Optional[str] = None,
1015 skip_existing_exposures: bool = False,
1016 track_file_attrs: bool = True,
1017 ) -> List[FileDataset]:
1018 """Ingest all raw files in one exposure.
1020 Parameters
1021 ----------
1022 exposure : `RawExposureData`
1023 A structure containing information about the exposure to be
1024 ingested. Must have `RawExposureData.record` populated and all
1025 data ID attributes expanded.
1026 datasetType : `DatasetType`
1027 The dataset type associated with this exposure.
1028 run : `str`, optional
1029 Name of a RUN-type collection to write to, overriding
1030 ``self.butler.run``.
1031 skip_existing_exposures : `bool`, optional
1032 If `True` (`False` is default), skip raws that have already been
1033 ingested (i.e. raws for which we already have a dataset with the
1034 same data ID in the target collection, even if from another file).
1035 Note that this is much slower than just not passing
1036 already-ingested files as inputs, because we still need to read and
1037 process metadata to identify which exposures to search for. It
1038 also will not work reliably if multiple processes are attempting to
1039 ingest raws from the same exposure concurrently, in that different
1040 processes may still attempt to ingest the same raw and conflict,
1041 causing a failure that prevents other raws from the same exposure
1042 from being ingested.
1043 track_file_attrs : `bool`, optional
1044 Control whether file attributes such as the size or checksum should
1045 be tracked by the datastore. Whether this parameter is honored
1046 depends on the specific datastore implementation.
1048 Returns
1049 -------
1050 datasets : `list` of `lsst.daf.butler.FileDataset`
1051 Per-file structures identifying the files ingested and their
1052 dataset representation in the data repository.
1053 """
1054 if skip_existing_exposures:
1055 existing = {
1056 ref.dataId
1057 for ref in self.butler.registry.queryDatasets(
1058 datasetType,
1059 collections=[run],
1060 dataId=exposure.dataId,
1061 )
1062 }
1063 else:
1064 existing = set()
1065 datasets = []
1066 for file in exposure.files:
1067 refs = [DatasetRef(datasetType, d.dataId) for d in file.datasets if d.dataId not in existing]
1068 if refs:
1069 datasets.append(
1070 FileDataset(path=file.filename.abspath(), refs=refs, formatter=file.FormatterClass)
1071 )
1073 # Raw files are preferentially ingested using a UUID derived from
1074 # the collection name and dataId.
1075 if self.butler.registry.supportsIdGenerationMode(DatasetIdGenEnum.DATAID_TYPE_RUN):
1076 mode = DatasetIdGenEnum.DATAID_TYPE_RUN
1077 else:
1078 mode = DatasetIdGenEnum.UNIQUE
1079 self.butler.ingest(
1080 *datasets,
1081 transfer=self.config.transfer,
1082 run=run,
1083 idGenerationMode=mode,
1084 record_validation_info=track_file_attrs,
1085 )
1086 return datasets
1088 def ingestFiles(
1089 self,
1090 files: Iterable[ResourcePath],
1091 *,
1092 pool: Optional[PoolType] = None,
1093 processes: int = 1,
1094 run: Optional[str] = None,
1095 skip_existing_exposures: bool = False,
1096 update_exposure_records: bool = False,
1097 track_file_attrs: bool = True,
1098 ) -> Tuple[List[DatasetRef], List[ResourcePath], int, int, int]:
1099 """Ingest files into a Butler data repository.
1101 This creates any new exposure or visit Dimension entries needed to
1102 identify the ingested files, creates new Dataset entries in the
1103 Registry and finally ingests the files themselves into the Datastore.
1104 Any needed instrument, detector, and physical_filter Dimension entries
1105 must exist in the Registry before `run` is called.
1107 Parameters
1108 ----------
1109 files : iterable over `lsst.resources.ResourcePath`
1110 URIs to the files to be ingested.
1111 pool : `multiprocessing.Pool`, optional
1112 If not `None`, a process pool with which to parallelize some
1113 operations.
1114 processes : `int`, optional
1115 The number of processes to use. Ignored if ``pool`` is not `None`.
1116 run : `str`, optional
1117 Name of a RUN-type collection to write to, overriding
1118 the default derived from the instrument name.
1119 skip_existing_exposures : `bool`, optional
1120 If `True` (`False` is default), skip raws that have already been
1121 ingested (i.e. raws for which we already have a dataset with the
1122 same data ID in the target collection, even if from another file).
1123 Note that this is much slower than just not passing
1124 already-ingested files as inputs, because we still need to read and
1125 process metadata to identify which exposures to search for. It
1126 also will not work reliably if multiple processes are attempting to
1127 ingest raws from the same exposure concurrently, in that different
1128 processes may still attempt to ingest the same raw and conflict,
1129 causing a failure that prevents other raws from the same exposure
1130 from being ingested.
1131 update_exposure_records : `bool`, optional
1132 If `True` (`False` is default), update existing exposure records
1133 that conflict with the new ones instead of rejecting them. THIS IS
1134 AN ADVANCED OPTION THAT SHOULD ONLY BE USED TO FIX METADATA THAT IS
1135 KNOWN TO BE BAD. This should usually be combined with
1136 ``skip_existing_exposures=True``.
1137 track_file_attrs : `bool`, optional
1138 Control whether file attributes such as the size or checksum should
1139 be tracked by the datastore. Whether this parameter is honored
1140 depends on the specific datastore implementation.
1142 Returns
1143 -------
1144 refs : `list` of `lsst.daf.butler.DatasetRef`
1145 Dataset references for ingested raws.
1146 bad_files : `list` of `ResourcePath`
1147 Given paths that could not be ingested.
1148 n_exposures : `int`
1149 Number of exposures successfully ingested.
1150 n_exposures_failed : `int`
1151 Number of exposures that failed when inserting dimension data.
1152 n_ingests_failed : `int`
1153 Number of exposures that failed when ingesting raw datasets.
1154 """
1156 exposureData, bad_files = self.prep(files, pool=pool, processes=processes)
1158 # Up to this point, we haven't modified the data repository at all.
1159 # Now we finally do that, with one transaction per exposure. This is
1160 # not parallelized at present because the performance of this step is
1161 # limited by the database server. That may or may not change in the
1162 # future once we increase our usage of bulk inserts and reduce our
1163 # usage of savepoints; we've tried to get everything but the database
1164 # operations done in advance to reduce the time spent inside
1165 # transactions.
1166 refs = []
1167 runs = set()
1168 datasetTypes: dict[str, DatasetType] = {}
1169 n_exposures = 0
1170 n_exposures_failed = 0
1171 n_ingests_failed = 0
1172 for exposure in self.progress.wrap(exposureData, desc="Ingesting raw exposures"):
1173 assert exposure.record is not None, "Should be guaranteed by prep()"
1174 self.log.debug(
1175 "Attempting to ingest %d file%s from exposure %s:%s",
1176 *_log_msg_counter(exposure.files),
1177 exposure.record.instrument,
1178 exposure.record.obs_id,
1179 )
1181 try:
1182 for name, record in exposure.dependencyRecords.items():
1183 self.butler.registry.syncDimensionData(name, record, update=update_exposure_records)
1184 inserted_or_updated = self.butler.registry.syncDimensionData(
1185 "exposure",
1186 exposure.record,
1187 update=update_exposure_records,
1188 )
1189 except Exception as e:
1190 self._on_ingest_failure(exposure, e)
1191 n_exposures_failed += 1
1192 self.log.warning(
1193 "Exposure %s:%s could not be registered: %s",
1194 exposure.record.instrument,
1195 exposure.record.obs_id,
1196 e,
1197 )
1198 if self.config.failFast:
1199 raise e
1200 continue
1202 if isinstance(inserted_or_updated, dict):
1203 # Exposure is in the registry and we updated it, so
1204 # syncDimensionData returned a dict.
1205 self.log.info(
1206 "Exposure %s:%s was already present, but columns %s were updated.",
1207 exposure.record.instrument,
1208 exposure.record.obs_id,
1209 str(list(inserted_or_updated.keys())),
1210 )
1212 # Determine the instrument so we can work out the dataset type.
1213 instrument = exposure.files[0].instrument
1214 assert (
1215 instrument is not None
1216 ), "file should have been removed from this list by prep if instrument could not be found"
1218 if raw_definition := getattr(instrument, "raw_definition", None):
1219 datasetTypeName, dimensions, storageClass = raw_definition
1220 if not (datasetType := datasetTypes.get(datasetTypeName)):
1221 datasetType = DatasetType(
1222 datasetTypeName, dimensions, storageClass, universe=self.butler.registry.dimensions
1223 )
1224 else:
1225 datasetType = self.datasetType
1226 if datasetType.name not in datasetTypes:
1227 self.butler.registry.registerDatasetType(datasetType)
1228 datasetTypes[datasetType.name] = datasetType
1230 # Use the instrument's default run if none was specified explicitly.
1231 if run is None:
1232 this_run = instrument.makeDefaultRawIngestRunName()
1233 else:
1234 this_run = run
1235 if this_run not in runs:
1236 self.butler.registry.registerCollection(this_run, type=CollectionType.RUN)
1237 runs.add(this_run)
1238 try:
1239 datasets_for_exposure = self.ingestExposureDatasets(
1240 exposure,
1241 datasetType=datasetType,
1242 run=this_run,
1243 skip_existing_exposures=skip_existing_exposures,
1244 track_file_attrs=track_file_attrs,
1245 )
1246 except Exception as e:
1247 self._on_ingest_failure(exposure, e)
1248 n_ingests_failed += 1
1249 self.log.warning("Failed to ingest the following for reason: %s", e)
1250 for f in exposure.files:
1251 self.log.warning("- %s", f.filename)
1252 if self.config.failFast:
1253 raise e
1254 continue
1255 else:
1256 self._on_success(datasets_for_exposure)
1257 for dataset in datasets_for_exposure:
1258 refs.extend(dataset.refs)
1260 # Success for this exposure.
1261 n_exposures += 1
1262 self.log.info(
1263 "Exposure %s:%s ingested successfully", exposure.record.instrument, exposure.record.obs_id
1264 )
1266 return refs, bad_files, n_exposures, n_exposures_failed, n_ingests_failed
1268 @timeMethod
1269 def run(
1270 self,
1271 files: Iterable[ResourcePathExpression],
1272 *,
1273 pool: Optional[PoolType] = None,
1274 processes: int = 1,
1275 run: Optional[str] = None,
1276 file_filter: Union[str, re.Pattern] = r"\.fit[s]?\b",
1277 group_files: bool = True,
1278 skip_existing_exposures: bool = False,
1279 update_exposure_records: bool = False,
1280 track_file_attrs: bool = True,
1281 ) -> List[DatasetRef]:
1282 """Ingest files into a Butler data repository.
1284 This creates any new exposure or visit Dimension entries needed to
1285 identify the ingested files, creates new Dataset entries in the
1286 Registry and finally ingests the files themselves into the Datastore.
1287 Any needed instrument, detector, and physical_filter Dimension entries
1288 must exist in the Registry before `run` is called.
1290 Parameters
1291 ----------
1292 files : iterable of `lsst.resources.ResourcePath`, `str`, or path-like
1293 Paths to the files to be ingested. Can refer to directories.
1294 Will be made absolute if they are not already.
1295 pool : `multiprocessing.Pool`, optional
1296 If not `None`, a process pool with which to parallelize some
1297 operations.
1298 processes : `int`, optional
1299 The number of processes to use. Ignored if ``pool`` is not `None`.
1300 run : `str`, optional
1301 Name of a RUN-type collection to write to, overriding
1302 the default derived from the instrument name.
1303 file_filter : `str` or `re.Pattern`, optional
1304 Pattern to use to discover files to ingest within directories.
1305 The default is to search for FITS files. The regex applies to
1306 files within the directory.
1307 group_files : `bool`, optional
1308 Group files by directory if they have been discovered in
1309 directories. Will not affect files explicitly provided.
1310 skip_existing_exposures : `bool`, optional
1311 If `True` (`False` is default), skip raws that have already been
1312 ingested (i.e. raws for which we already have a dataset with the
1313 same data ID in the target collection, even if from another file).
1314 Note that this is much slower than just not passing
1315 already-ingested files as inputs, because we still need to read and
1316 process metadata to identify which exposures to search for. It
1317 also will not work reliably if multiple processes are attempting to
1318 ingest raws from the same exposure concurrently, in that different
1319 processes may still attempt to ingest the same raw and conflict,
1320 causing a failure that prevents other raws from the same exposure
1321 from being ingested.
1322 update_exposure_records : `bool`, optional
1323 If `True` (`False` is default), update existing exposure records
1324 that conflict with the new ones instead of rejecting them. THIS IS
1325 AN ADVANCED OPTION THAT SHOULD ONLY BE USED TO FIX METADATA THAT IS
1326 KNOWN TO BE BAD. This should usually be combined with
1327 ``skip_existing_exposures=True``.
1328 track_file_attrs : `bool`, optional
1329 Control whether file attributes such as the size or checksum should
1330 be tracked by the datastore. Whether this parameter is honored
1331 depends on the specific datastore implementation.
1333 Returns
1334 -------
1335 refs : `list` of `lsst.daf.butler.DatasetRef`
1336 Dataset references for ingested raws.
1338 Notes
1339 -----
1340 This method inserts all datasets for an exposure within a transaction,
1341 guaranteeing that partial exposures are never ingested. The exposure
1342 dimension record is inserted with `Registry.syncDimensionData` first
1343 (in its own transaction), which inserts only if a record with the same
1344 primary key does not already exist. This allows different files within
1345 the same exposure to be ingested in different runs.
1346 """
1348 refs = []
1349 bad_files = []
1350 n_exposures = 0
1351 n_exposures_failed = 0
1352 n_ingests_failed = 0
1353 if group_files:
1354 for group in ResourcePath.findFileResources(files, file_filter, group_files):
1355 new_refs, bad, n_exp, n_exp_fail, n_ingest_fail = self.ingestFiles(
1356 group,
1357 pool=pool,
1358 processes=processes,
1359 run=run,
1360 skip_existing_exposures=skip_existing_exposures,
1361 update_exposure_records=update_exposure_records,
1362 track_file_attrs=track_file_attrs,
1363 )
1364 refs.extend(new_refs)
1365 bad_files.extend(bad)
1366 n_exposures += n_exp
1367 n_exposures_failed += n_exp_fail
1368 n_ingests_failed += n_ingest_fail
1369 else:
1370 refs, bad_files, n_exposures, n_exposures_failed, n_ingests_failed = self.ingestFiles(
1371 ResourcePath.findFileResources(files, file_filter, group_files),
1372 pool=pool,
1373 processes=processes,
1374 run=run,
1375 skip_existing_exposures=skip_existing_exposures,
1376 update_exposure_records=update_exposure_records,
1377 )
1379 had_failure = False
1381 if bad_files:
1382 had_failure = True
1383 self.log.warning("Could not extract observation metadata from the following:")
1384 for f in bad_files:
1385 self.log.warning("- %s", f)
1387 self.log.info(
1388 "Successfully processed data from %d exposure%s with %d failure%s from exposure"
1389 " registration and %d failure%s from file ingest.",
1390 *_log_msg_counter(n_exposures),
1391 *_log_msg_counter(n_exposures_failed),
1392 *_log_msg_counter(n_ingests_failed),
1393 )
1394 if n_exposures_failed > 0 or n_ingests_failed > 0:
1395 had_failure = True
1396 self.log.info("Ingested %d distinct Butler dataset%s", *_log_msg_counter(refs))
1398 if had_failure:
1399 raise RuntimeError("Some failures encountered during ingestion")
1401 return refs
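# End-to-end usage sketch (repository path and file locations are
# hypothetical): the task discovers FITS files under the given locations,
# groups them by directory and exposure, and ingests them into the butler's
# run collection (or the instrument's default raw run).
#
#     >>> butler = Butler("/path/to/repo", writeable=True)
#     >>> task = RawIngestTask(RawIngestConfig(), butler=butler)
#     >>> refs = task.run(["/data/night1/", "/data/night2/exp_042.fits"])
#     >>> print(f"Ingested {len(refs)} dataset(s)")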