Coverage for python/lsst/obs/base/ingest.py: 17%
343 statements
coverage.py v6.5.0, created at 2023-02-11 02:55 -0800
1# This file is part of obs_base.
2#
3# Developed for the LSST Data Management System.
4# This product includes software developed by the LSST Project
5# (https://www.lsst.org).
6# See the COPYRIGHT file at the top-level directory of this distribution
7# for details of code ownership.
8#
9# This program is free software: you can redistribute it and/or modify
10# it under the terms of the GNU General Public License as published by
11# the Free Software Foundation, either version 3 of the License, or
12# (at your option) any later version.
13#
14# This program is distributed in the hope that it will be useful,
15# but WITHOUT ANY WARRANTY; without even the implied warranty of
16# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
17# GNU General Public License for more details.
18#
19# You should have received a copy of the GNU General Public License
20# along with this program. If not, see <http://www.gnu.org/licenses/>.
23__all__ = ("RawIngestTask", "RawIngestConfig", "makeTransferChoiceField")
25import json
26import re
27from collections import defaultdict
28from dataclasses import InitVar, dataclass
29from multiprocessing import Pool
30from typing import (
31 Any,
32 Callable,
33 ClassVar,
34 Dict,
35 Iterable,
36 Iterator,
37 List,
38 MutableMapping,
39 Optional,
40 Set,
41 Sized,
42 Tuple,
43 Type,
44 Union,
45)
47from astro_metadata_translator import MetadataTranslator, ObservationInfo, merge_headers
48from astro_metadata_translator.indexing import process_index_data, process_sidecar_data
49from lsst.afw.fits import readMetadata
50from lsst.daf.butler import (
51 Butler,
52 CollectionType,
53 DataCoordinate,
54 DatasetIdGenEnum,
55 DatasetRef,
56 DatasetType,
57 DimensionRecord,
58 DimensionUniverse,
59 FileDataset,
60 Formatter,
61 Progress,
62)
63from lsst.pex.config import ChoiceField, Config, Field
64from lsst.pipe.base import Instrument, Task
65from lsst.resources import ResourcePath, ResourcePathExpression
66from lsst.utils.timer import timeMethod
68from ._instrument import makeExposureRecordFromObsInfo
70# multiprocessing.Pool is actually a function, not a type, and the real type
71# isn't exposed, so we can't use it in annotations; we'll just punt on it via
72# this alias instead.
73PoolType = Any
76def _do_nothing(*args: Any, **kwargs: Any) -> None:
77 """Do nothing.
79 This is a function that accepts anything and does nothing.
80 For use as a default in callback arguments.
81 """
82 pass
85def _log_msg_counter(noun: Union[int, Sized]) -> Tuple[int, str]:
86 """Count the iterable and return the count and plural modifier.
88 Parameters
89 ----------
90 noun : `Sized` or `int`
91 Thing to count. If given an integer it is assumed to be the count
92 to use to calculate modifier.
94 Returns
95 -------
96 num : `int`
97 Number of items found in ``noun``.
98 modifier : `str`
99 Character to add to the end of a string referring to these items
100 to indicate whether it was a single item or not. Returns empty
101 string if there is one item or "s" otherwise.
103 Examples
104 --------
106 .. code-block:: python
108 log.warning("Found %d file%s", *_log_msg_counter(nfiles))
109 """
110 if isinstance(noun, int):
111 num = noun
112 else:
113 num = len(noun)
114 return num, "" if num == 1 else "s"
117@dataclass
118class RawFileDatasetInfo:
119 """Information about a single dataset within a raw file."""
121 dataId: DataCoordinate
122 """Data ID for this file (`lsst.daf.butler.DataCoordinate`)."""
124 obsInfo: ObservationInfo
125 """Standardized observation metadata extracted directly from the file
126 headers (`astro_metadata_translator.ObservationInfo`).
127 """
130@dataclass
131class RawFileData:
132 """Information about a single raw file, used during ingest."""
134 datasets: List[RawFileDatasetInfo]
135 """The information describing each dataset within this raw file.
136 (`list` of `RawFileDatasetInfo`)
137 """
139 filename: ResourcePath
140 """URI of the file this information was extracted from (`str`).
142 This is the path prior to ingest, not the path after ingest.
143 """
145 FormatterClass: Type[Formatter]
146 """Formatter class that should be used to ingest this file (`type`; as
147 subclass of `Formatter`).
148 """
150 instrument: Optional[Instrument]
151 """The `Instrument` instance associated with this file. Can be `None`
152 if ``datasets`` is an empty list."""
155@dataclass
156class RawExposureData:
157 """Information about a complete raw exposure, used during ingest."""
159 dataId: DataCoordinate
160 """Data ID for this exposure (`lsst.daf.butler.DataCoordinate`).
161 """
163 files: List[RawFileData]
164 """List of structures containing file-level information.
165 """
167 universe: InitVar[DimensionUniverse]
168 """Set of all known dimensions.
169 """
171 record: DimensionRecord
172 """The exposure `DimensionRecord` that must be inserted into the
173 `~lsst.daf.butler.Registry` prior to file-level ingest (`DimensionRecord`).
174 """
176 dependencyRecords: Dict[str, DimensionRecord]
177 """Additional records that must be inserted into the
178 `~lsst.daf.butler.Registry` prior to ingesting the exposure ``record``
179 (e.g., to satisfy foreign key constraints), indexed by the dimension name.
180 """
183def makeTransferChoiceField(
184 doc: str = "How to transfer files (None for no transfer).", default: str = "auto"
185) -> ChoiceField:
186 """Create a Config field with options for transferring data between repos.
188 The allowed options for the field are exactly those supported by
189 `lsst.daf.butler.Datastore.ingest`.
191 Parameters
192 ----------
193 doc : `str`
194 Documentation for the configuration field.
195 default : `str`, optional
196 Default transfer mode for the field.
198 Returns
199 -------
200 field : `lsst.pex.config.ChoiceField`
201 Configuration field.
202 """
203 return ChoiceField(
204 doc=doc,
205 dtype=str,
206 allowed={
207 "move": "move",
208 "copy": "copy",
209 "auto": "choice will depend on datastore",
210 "direct": "use URI to ingested file directly in datastore",
211 "link": "hard link falling back to symbolic link",
212 "hardlink": "hard link",
213 "symlink": "symbolic (soft) link",
214 "relsymlink": "relative symbolic link",
215 },
216 optional=True,
217 default=default,
218 )
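# Usage sketch (editorial addition; the class below is hypothetical and only
# illustrates the pattern): any `lsst.pex.config.Config` can reuse
# makeTransferChoiceField to expose the standard transfer modes, optionally
# with a different default.
class _ExampleTransferConfig(Config):
    """Minimal config illustrating reuse of makeTransferChoiceField."""

    transfer = makeTransferChoiceField(
        doc="Transfer mode used by this example.", default="symlink"
    )


# Any key of the ``allowed`` mapping ("move", "copy", "auto", "direct",
# "link", "hardlink", "symlink", "relsymlink") or `None` is then a valid
# value for ``transfer`` on an instance of this class.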
221class RawIngestConfig(Config):
222 """Configuration class for RawIngestTask."""
224 transfer = makeTransferChoiceField()
225 failFast: Field[bool] = Field(
226 dtype=bool,
227 default=False,
228 doc="If True, stop ingest as soon as any problem is encountered with any file. "
229 "Otherwise problem files will be skipped and logged and a report issued at completion.",
230 )
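# Configuration sketch (editorial addition): a helper showing typical field
# settings for RawIngestConfig; the chosen values are examples only.
def _example_ingest_config() -> RawIngestConfig:
    """Return a RawIngestConfig illustrating common settings."""
    config = RawIngestConfig()
    config.transfer = "direct"  # register existing file URIs without copying
    config.failFast = True  # stop at the first problem file
    return config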
233class RawIngestTask(Task):
234 """Driver Task for ingesting raw data into Gen3 Butler repositories.
236 Parameters
237 ----------
238 config : `RawIngestConfig`
239 Configuration for the task.
240 butler : `~lsst.daf.butler.Butler`
241 Writeable butler instance, with ``butler.run`` set to the appropriate
242 `~lsst.daf.butler.CollectionType.RUN` collection for these raw
243 datasets.
244 on_success : `Callable`, optional
245 A callback invoked when all of the raws associated with an exposure
246 are ingested. Will be passed a list of `FileDataset` objects, each
247 containing one or more resolved `DatasetRef` objects. If this callback
248 raises it will interrupt the entire ingest process, even if
249 `RawIngestConfig.failFast` is `False`.
250 on_metadata_failure : `Callable`, optional
251 A callback invoked when a failure occurs trying to translate the
252 metadata for a file. Will be passed the URI and the exception, in
253 that order, as positional arguments. Guaranteed to be called in an
254 ``except`` block, allowing the callback to re-raise or replace (with
255 ``raise ... from``) to override the task's usual error handling (before
256 `RawIngestConfig.failFast` logic occurs).
257 on_ingest_failure : `Callable`, optional
258 A callback invoked when dimension record or dataset insertion into the
259 database fails for an exposure. Will be passed a `RawExposureData`
260 instance and the exception, in that order, as positional arguments.
261 Guaranteed to be called in an ``except`` block, allowing the callback
262 to re-raise or replace (with ``raise ... from``) to override the task's
263 usual error handling (before `RawIngestConfig.failFast` logic occurs).
264 **kwargs
265 Additional keyword arguments are forwarded to the `lsst.pipe.base.Task`
266 constructor.
268 Notes
269 -----
270 Each instance of `RawIngestTask` writes to the same Butler. Each
271 invocation of `RawIngestTask.run` ingests a list of files.
272 """
274 ConfigClass: ClassVar[Type[Config]] = RawIngestConfig
276 _DefaultName: ClassVar[str] = "ingest"
278 def getDatasetType(self) -> DatasetType:
279 """Return the DatasetType of the datasets ingested by this Task."""
280 return DatasetType(
281 "raw",
282 ("instrument", "detector", "exposure"),
283 "Exposure",
284 universe=self.butler.registry.dimensions,
285 )
287 # Mypy cannot determine that the config passed to super() is this type.
288 config: RawIngestConfig
290 def __init__(
291 self,
292 config: RawIngestConfig,
293 *,
294 butler: Butler,
295 on_success: Callable[[List[FileDataset]], Any] = _do_nothing,
296 on_metadata_failure: Callable[[ResourcePath, Exception], Any] = _do_nothing,
297 on_ingest_failure: Callable[[RawExposureData, Exception], Any] = _do_nothing,
298 **kwargs: Any,
299 ):
300 config.validate() # Not a CmdlineTask nor PipelineTask, so have to validate the config here.
301 super().__init__(config, **kwargs)
302 self.butler = butler
303 self.universe = self.butler.registry.dimensions
304 self.datasetType = self.getDatasetType()
305 self._on_success = on_success
306 self._on_metadata_failure = on_metadata_failure
307 self._on_ingest_failure = on_ingest_failure
308 self.progress = Progress("obs.base.RawIngestTask")
310 # Import all the instrument classes so that we ensure that we
311 # have all the relevant metadata translators loaded.
312 Instrument.importAll(self.butler.registry)
314 def _reduce_kwargs(self) -> Dict[str, Any]:
315 # Add extra parameters to pickle.
316 return dict(
317 **super()._reduce_kwargs(),
318 butler=self.butler,
319 on_success=self._on_success,
320 on_metadata_failure=self._on_metadata_failure,
321 on_ingest_failure=self._on_ingest_failure,
322 )
324 def _determine_instrument_formatter(
325 self, dataId: DataCoordinate, filename: ResourcePath
326 ) -> Tuple[Optional[Instrument], Type[Formatter]]:
327 """Determine the instrument and formatter class.
329 Parameters
330 ----------
331 dataId : `lsst.daf.butler.DataCoordinate`
332 The dataId associated with this dataset.
333 filename : `lsst.resources.ResourcePath`
334 URI of file used for error reporting.
336 Returns
337 -------
338 instrument : `Instrument` or `None`
339 Instance of the `Instrument` associated with this dataset. `None`
340 indicates that the instrument could not be determined.
341 formatterClass : `type`
342 Class to be used as the formatter for this dataset.
343 """
344 # The data model currently assumes that whilst multiple datasets
345 # can be associated with a single file, they must all share the
346 # same formatter.
347 try:
348 instrument = Instrument.fromName(dataId["instrument"], self.butler.registry) # type: ignore
349 except LookupError as e:
350 self._on_metadata_failure(filename, e)
351 self.log.warning(
352 "Instrument %s for file %s not known to registry", dataId["instrument"], filename
353 )
354 if self.config.failFast:
355 raise RuntimeError(
356 f"Instrument {dataId['instrument']} for file {filename} not known to registry"
357 ) from e
358 FormatterClass = Formatter
359 # Indicate that we could not work out the instrument.
360 instrument = None
361 else:
362 assert instrument is not None, "Should be guaranteed by fromName succeeding."
363 FormatterClass = instrument.getRawFormatter(dataId)
364 return instrument, FormatterClass
366 def extractMetadata(self, filename: ResourcePath) -> RawFileData:
367 """Extract and process metadata from a single raw file.
369 Parameters
370 ----------
371 filename : `lsst.resources.ResourcePath`
372 URI to the file.
374 Returns
375 -------
376 data : `RawFileData`
377 A structure containing the metadata extracted from the file,
378 as well as the original filename. All fields will be populated,
379 but the `RawFileData.dataId` attribute will be a minimal
380 (unexpanded) `~lsst.daf.butler.DataCoordinate` instance. The
381 ``instrument`` field will be `None` if there is a problem
382 with metadata extraction.
384 Notes
385 -----
386 Assumes that there is a single dataset associated with the given
387 file. Instruments using a single file to store multiple datasets
388 must implement their own version of this method.
390 By default the method will catch all exceptions unless the ``failFast``
391 configuration item is `True`. If an error is encountered the
392 `_on_metadata_failure()` method will be called. If the error does not
393 result in an exception being raised, the returned object will have
394 a `None` instrument and no datasets.
396 This method supports sidecar JSON files which can be used to
397 extract metadata without having to read the data file itself.
398 The sidecar file is always used if found.
399 """
400 sidecar_fail_msg = "" # Requires prepended space when set.
401 try:
402 sidecar_file = filename.updatedExtension(".json")
403 if sidecar_file.exists():
404 content = json.loads(sidecar_file.read())
405 headers = [process_sidecar_data(content)]
406 sidecar_fail_msg = " (via sidecar)"
407 else:
408 # Read the metadata from the data file itself.
410 # For remote files download the entire file to get the
411 # header. This is very inefficient and it would be better
412 # to have some way of knowing where in the file the headers
413 # are and to only download those parts of the file.
414 with filename.as_local() as local_file:
415 # Read the primary. This might be sufficient.
416 header = readMetadata(local_file.ospath, 0)
418 try:
419 # Try to work out a translator class early.
420 translator_class = MetadataTranslator.determine_translator(
421 header, filename=str(filename)
422 )
423 except ValueError:
424 # Primary header was not sufficient (maybe this file
425 # has been compressed or is a MEF with minimal
426 # primary). Read second header and merge with primary.
427 header = merge_headers([header, readMetadata(local_file.ospath, 1)], mode="overwrite")
429 # Try again to work out a translator class, letting this
430 # fail.
431 translator_class = MetadataTranslator.determine_translator(header, filename=str(filename))
433 # Request the headers to use for ingest
434 headers = list(translator_class.determine_translatable_headers(local_file.ospath, header))
436 # Add each header to the dataset list
437 datasets = [self._calculate_dataset_info(h, filename) for h in headers]
439 except Exception as e:
440 self.log.debug("Problem extracting metadata from %s%s: %s", filename, sidecar_fail_msg, e)
441 # Indicate to the caller that we failed to read.
442 datasets = []
443 formatterClass = Formatter
444 instrument = None
445 self._on_metadata_failure(filename, e)
446 if self.config.failFast:
447 raise RuntimeError(
448 f"Problem extracting metadata for file {filename}{sidecar_fail_msg}"
449 ) from e
450 else:
451 self.log.debug("Extracted metadata for file %s%s", filename, sidecar_fail_msg)
452 # The data model currently assumes that whilst multiple datasets
453 # can be associated with a single file, they must all share the
454 # same formatter.
455 instrument, formatterClass = self._determine_instrument_formatter(datasets[0].dataId, filename)
456 if instrument is None:
457 datasets = []
459 return RawFileData(
460 datasets=datasets,
461 filename=filename,
462 # MyPy wants this to be a non-abstract class, which is not true
463 # for the error case where instrument is None and datasets=[].
464 FormatterClass=formatterClass, # type: ignore
465 instrument=instrument,
466 )
468 @classmethod
469 def getObservationInfoSubsets(cls) -> Tuple[Set, Set]:
470 """Return subsets of fields in the `ObservationInfo` that we care about
472 These fields will be used in constructing an exposure record.
474 Returns
475 -------
476 required : `set`
477 Set of `ObservationInfo` field names that are required.
478 optional : `set`
479 Set of `ObservationInfo` field names we will use if they are
480 available.
481 """
482 # Marking the new properties "group_counter_*" and
483 # "has_simulated_content" as required, assumes that we either
484 # recreate any existing index/sidecar files that include translated
485 # values, or else allow astro_metadata_translator to fill in
486 # defaults.
487 required = {
488 "datetime_begin",
489 "datetime_end",
490 "detector_num",
491 "exposure_id",
492 "exposure_time",
493 "group_counter_end",
494 "group_counter_start",
495 "has_simulated_content",
496 "instrument",
497 "observation_id",
498 "observation_type",
499 "physical_filter",
500 }
501 optional = {
502 "altaz_begin",
503 "boresight_rotation_coord",
504 "boresight_rotation_angle",
505 "dark_time",
506 "exposure_group",
507 "tracking_radec",
508 "object",
509 "observation_counter",
510 "observation_reason",
511 "observing_day",
512 "science_program",
513 "visit_id",
514 }
515 return required, optional
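    # Customization sketch (editorial addition; the subclass and the extra
    # property names are hypothetical): an instrument needing additional
    # translated properties could extend these sets, e.g.
    #
    #     class MyInstrumentRawIngestTask(RawIngestTask):
    #         @classmethod
    #         def getObservationInfoSubsets(cls):
    #             required, optional = super().getObservationInfoSubsets()
    #             optional |= {"temperature", "pressure"}
    #             return required, optional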
517 def _calculate_dataset_info(
518 self, header: Union[MutableMapping[str, Any], ObservationInfo], filename: ResourcePath
519 ) -> RawFileDatasetInfo:
520 """Calculate a RawFileDatasetInfo from the supplied information.
522 Parameters
523 ----------
524 header : Mapping or `astro_metadata_translator.ObservationInfo`
525 Header from the dataset or previously-translated content.
526 filename : `lsst.resources.ResourcePath`
527 Filename to use for error messages.
529 Returns
530 -------
531 dataset : `RawFileDatasetInfo`
532 The dataId, and observation information associated with this
533 dataset.
534 """
535 required, optional = self.getObservationInfoSubsets()
536 if isinstance(header, ObservationInfo):
537 obsInfo = header
538 missing = []
539 # Need to check the required properties are present.
540 for property in required:
541 # getattr does not need to be protected because it is using
542 # the defined list above containing properties that must exist.
543 value = getattr(obsInfo, property)
544 if value is None:
545 missing.append(property)
546 if missing:
547 raise ValueError(
548 f"Requested required properties are missing from file {filename}: {missing} (via JSON)"
549 )
551 else:
552 obsInfo = ObservationInfo(
553 header,
554 pedantic=False,
555 filename=str(filename),
556 required=required,
557 subset=required | optional,
558 )
560 dataId = DataCoordinate.standardize(
561 instrument=obsInfo.instrument,
562 exposure=obsInfo.exposure_id,
563 detector=obsInfo.detector_num,
564 universe=self.universe,
565 )
566 return RawFileDatasetInfo(obsInfo=obsInfo, dataId=dataId)
568 def locateAndReadIndexFiles(
569 self, files: Iterable[ResourcePath]
570 ) -> Tuple[Dict[ResourcePath, Any], List[ResourcePath], Set[ResourcePath], Set[ResourcePath]]:
571 """Given a list of files, look for index files and read them.
573 Index files can either be explicitly in the list of files to
574 ingest, or else located in the same directory as a file to ingest.
575 Index entries are always used if present.
577 Parameters
578 ----------
579 files : iterable over `lsst.resources.ResourcePath`
580 URIs to the files to be ingested.
582 Returns
583 -------
584 index : `dict` [`ResourcePath`, Any]
585 Merged contents of all relevant index files found. These can
586 be explicitly specified index files or ones found in the
587 directory alongside a data file to be ingested.
588 updated_files : `list` of `ResourcePath`
589 Updated list of the input files with entries removed that were
590 found listed in an index file. Order is not guaranteed to
591 match the order of the files given to this routine.
592 good_index_files : `set` [ `ResourcePath` ]
593 Index files that were successfully read.
594 bad_index_files : `set` [ `ResourcePath` ]
595 Files that looked like index files but failed to read properly.
596 """
597 # Convert the paths to absolute for easy comparison with index content.
598 # Do not convert to real paths since we have to assume that index
599 # files are in this location and not the locations to which they link.
600 files = tuple(f.abspath() for f in files)
602 # Index files must be named this.
603 index_root_file = "_index.json"
605 # Group the files by directory.
606 files_by_directory = defaultdict(set)
608 for path in files:
609 directory, file_in_dir = path.split()
610 files_by_directory[directory].add(file_in_dir)
612 # All the metadata read from index files with keys of full path.
613 index_entries: Dict[ResourcePath, Any] = {}
615 # Index files we failed to read.
616 bad_index_files = set()
618 # Any good index files that were found and used.
619 good_index_files = set()
621 # Look for index files in those directories.
622 for directory, files_in_directory in files_by_directory.items():
623 possible_index_file = directory.join(index_root_file)
624 if possible_index_file.exists():
625 # If we are explicitly requesting an index file the
626 # messages should be different.
627 index_msg = "inferred"
628 is_implied = True
629 if index_root_file in files_in_directory:
630 index_msg = "explicit"
631 is_implied = False
633 # Try to read the index file and catch and report any
634 # problems.
635 try:
636 content = json.loads(possible_index_file.read())
637 index = process_index_data(content, force_dict=True)
638 # mypy should in theory know that this is a mapping
639 # from the overload type annotation of process_index_data.
640 assert isinstance(index, MutableMapping)
641 except Exception as e:
642 # Only trigger the callback if the index file
643 # was asked for explicitly. Triggering on implied file
644 # might be surprising.
645 if not is_implied:
646 self._on_metadata_failure(possible_index_file, e)
647 if self.config.failFast:
648 raise RuntimeError(
649 f"Problem reading index file from {index_msg} location {possible_index_file}"
650 ) from e
651 bad_index_files.add(possible_index_file)
652 continue
654 self.log.debug("Extracted index metadata from %s file %s", index_msg, possible_index_file)
655 good_index_files.add(possible_index_file)
657 # Go through the index adding entries for files.
658 # If we have non-index files in this directory marked for
659 # ingest we should only get index information for those.
660 # If the index file was explicit we use all entries.
661 if is_implied:
662 files_to_ingest = files_in_directory
663 else:
664 files_to_ingest = set(index)
666 # Copy relevant metadata into a single dict for all index
667 # entries.
668 for file_in_dir in files_to_ingest:
669 # Skip an explicitly specified index file.
670 # This should never happen because an explicit index
671 # file will force ingest of all files in the index
672 # and not use the explicit file list. If somehow
673 # this is not true we continue. Raising an exception
674 # seems like the wrong thing to do since this is harmless.
675 if file_in_dir == index_root_file:
676 self.log.info(
677 "Logic error found scanning directory %s. Please file ticket.", directory
678 )
679 continue
680 if file_in_dir in index:
681 file = directory.join(file_in_dir)
682 if file in index_entries:
683 # ObservationInfo overrides raw metadata
684 if isinstance(index[file_in_dir], ObservationInfo) and not isinstance(
685 index_entries[file], ObservationInfo
686 ):
687 self.log.warning(
688 "File %s already specified in an index file but overriding"
689 " with ObservationInfo content from %s",
690 file,
691 possible_index_file,
692 )
693 else:
694 self.log.warning(
695 "File %s already specified in an index file, ignoring content from %s",
696 file,
697 possible_index_file,
698 )
699 # Do nothing in this case
700 continue
702 index_entries[file] = index[file_in_dir]
704 # Remove files from list that have index entries and also
705 # any files that we determined to be explicit index files
706 # or any index files that we failed to read.
707 filtered = set(files) - set(index_entries) - good_index_files - bad_index_files
709 # The filtered list loses the initial order. Retaining the order
710 # is good for testing but does have a cost if there are many
711 # files when copying the good values out. A dict would have faster
712 # lookups (using the files as keys) but use more memory.
713 ordered = [f for f in filtered if f in files]
715 return index_entries, ordered, good_index_files, bad_index_files
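    # Worked example (editorial addition; paths are placeholders): given an
    # ingest list of
    #
    #     /data/night1/raw_000001.fits
    #     /data/night1/raw_000002.fits
    #
    # an implied /data/night1/_index.json is consulted only for those two
    # files, whereas explicitly listing /data/night1/_index.json would ingest
    # every file recorded in that index.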
717 def processIndexEntries(self, index_entries: Dict[ResourcePath, Any]) -> List[RawFileData]:
718 """Convert index entries to RawFileData.
720 Parameters
721 ----------
722 index_entries : `dict` [`ResourcePath`, Any]
723 Dict indexed by the name of the file to ingest, with values of either
724 raw metadata or translated
725 `~astro_metadata_translator.ObservationInfo`.
727 Returns
728 -------
729 data : `list` [ `RawFileData` ]
730 Structures containing the metadata extracted from the file,
731 as well as the original filename. All fields will be populated,
732 but the `RawFileData.dataId` attributes will be minimal
733 (unexpanded) `~lsst.daf.butler.DataCoordinate` instances.
734 """
735 fileData = []
736 for filename, metadata in index_entries.items():
737 try:
738 datasets = [self._calculate_dataset_info(metadata, filename)]
739 except Exception as e:
740 self.log.debug("Problem extracting metadata for file %s found in index file: %s", filename, e)
741 datasets = []
742 formatterClass = Formatter
743 instrument = None
744 self._on_metadata_failure(filename, e)
745 if self.config.failFast:
746 raise RuntimeError(
747 f"Problem extracting metadata for file {filename} found in index file"
748 ) from e
749 else:
750 instrument, formatterClass = self._determine_instrument_formatter(
751 datasets[0].dataId, filename
752 )
753 if instrument is None:
754 datasets = []
755 fileData.append(
756 RawFileData(
757 datasets=datasets,
758 filename=filename,
759 # MyPy wants this to be a non-abstract class, which is not
760 # true for the error case where instrument is None and
761 # datasets=[].
762 FormatterClass=formatterClass, # type: ignore
763 instrument=instrument,
764 )
765 )
766 return fileData
768 def groupByExposure(self, files: Iterable[RawFileData]) -> List[RawExposureData]:
769 """Group an iterable of `RawFileData` by exposure.
771 Parameters
772 ----------
773 files : iterable of `RawFileData`
774 File-level information to group.
776 Returns
777 -------
778 exposures : `list` of `RawExposureData`
779 A list of structures that group the file-level information by
780 exposure. All fields will be populated. The
781 `RawExposureData.dataId` attributes will be minimal (unexpanded)
782 `~lsst.daf.butler.DataCoordinate` instances.
783 """
784 exposureDimensions = self.universe["exposure"].graph
785 byExposure = defaultdict(list)
786 for f in files:
787 # Assume that the first dataset is representative for the file.
788 byExposure[f.datasets[0].dataId.subset(exposureDimensions)].append(f)
790 return [
791 RawExposureData(
792 dataId=dataId,
793 files=exposureFiles,
794 universe=self.universe,
795 record=self.makeExposureRecord(exposureFiles[0].datasets[0].obsInfo, self.universe),
796 dependencyRecords=self.makeDependencyRecords(
797 exposureFiles[0].datasets[0].obsInfo, self.universe
798 ),
799 )
800 for dataId, exposureFiles in byExposure.items()
801 ]
803 def makeExposureRecord(
804 self, obsInfo: ObservationInfo, universe: DimensionUniverse, **kwargs: Any
805 ) -> DimensionRecord:
806 """Construct a registry record for an exposure
808 This is a method that subclasses will often want to customize. This can
809 often be done by calling this base class implementation with additional
810 ``kwargs``.
812 Parameters
813 ----------
814 obsInfo : `ObservationInfo`
815 Observation details for (one of the components of) the exposure.
816 universe : `DimensionUniverse`
817 Set of all known dimensions.
818 **kwargs
819 Additional field values for this record.
821 Returns
822 -------
823 record : `DimensionRecord`
824 The exposure record that must be inserted into the
825 `~lsst.daf.butler.Registry` prior to file-level ingest.
826 """
827 return makeExposureRecordFromObsInfo(obsInfo, universe, **kwargs)
829 def makeDependencyRecords(
830 self, obsInfo: ObservationInfo, universe: DimensionUniverse
831 ) -> Dict[str, DimensionRecord]:
832 """Construct dependency records
834 These dependency records will be inserted into the
835 `~lsst.daf.butler.Registry` before the exposure records, because they
836 are dependencies of the exposure. This allows an opportunity to satisfy
837 foreign key constraints that exist because of dimensions related to the
838 exposure.
840 This is a method that subclasses may want to customize, if they've
841 added dimensions that relate to an exposure.
843 Parameters
844 ----------
845 obsInfo : `ObservationInfo`
846 Observation details for (one of the components of) the exposure.
847 universe : `DimensionUniverse`
848 Set of all known dimensions.
850 Returns
851 -------
852 records : `dict` [`str`, `DimensionRecord`]
853 The records to insert, indexed by dimension name.
854 """
855 return {}
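    # Customization sketch (editorial addition; the subclass, field, and
    # dimension names are hypothetical): an obs package that adds a dimension
    # tied to the exposure could provide its record here, e.g.
    #
    #     class MyInstrumentRawIngestTask(RawIngestTask):
    #         def makeExposureRecord(self, obsInfo, universe, **kwargs):
    #             return super().makeExposureRecord(
    #                 obsInfo, universe, my_field=obsInfo.observation_reason, **kwargs
    #             )
    #
    #         def makeDependencyRecords(self, obsInfo, universe):
    #             record = universe["my_group"].RecordClass(
    #                 instrument=obsInfo.instrument, name=obsInfo.exposure_group
    #             )
    #             return {"my_group": record}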
857 def expandDataIds(self, data: RawExposureData) -> RawExposureData:
858 """Expand the data IDs associated with a raw exposure.
860 This adds the metadata records.
862 Parameters
863 ----------
864 data : `RawExposureData`
865 A structure containing information about the exposure to be
866 ingested. Must have `RawExposureData.record` populated. Should
867 be considered consumed upon return.
869 Returns
870 -------
871 exposure : `RawExposureData`
872 An updated version of the input structure, with
873 `RawExposureData.dataId` and nested `RawFileData.dataId` attributes
874 updated to data IDs for which
875 `~lsst.daf.butler.DataCoordinate.hasRecords` returns `True`.
876 """
877 # We start by expanding the exposure-level data ID; we won't use that
878 # directly in file ingest, but this lets us do some database lookups
879 # once per exposure instead of once per file later.
880 data.dataId = self.butler.registry.expandDataId(
881 data.dataId,
882 # We pass in the records we'll be inserting shortly so they aren't
883 # looked up from the database. We do expect instrument and filter
884 # records to be retrieved from the database here (though the
885 # Registry may cache them so there isn't a lookup every time).
886 records={"exposure": data.record},
887 )
888 # Now we expand the per-file (exposure+detector) data IDs. This time
889 # we pass in the records we just retrieved from the exposure data ID
890 # expansion.
891 for file in data.files:
892 for dataset in file.datasets:
893 dataset.dataId = self.butler.registry.expandDataId(
894 dataset.dataId, records=data.dataId.records
895 )
896 return data
898 def prep(
899 self, files: Iterable[ResourcePath], *, pool: Optional[PoolType] = None, processes: int = 1
900 ) -> Tuple[Iterator[RawExposureData], List[ResourcePath]]:
901 """Perform all non-database-updating ingest preprocessing steps.
903 Parameters
904 ----------
905 files : iterable over `lsst.resources.ResourcePath`
906 Paths to the files to be ingested. Will be made absolute
907 if they are not already.
908 pool : `multiprocessing.Pool`, optional
909 If not `None`, a process pool with which to parallelize some
910 operations.
911 processes : `int`, optional
912 The number of processes to use. Ignored if ``pool`` is not `None`.
914 Returns
915 -------
916 exposures : `Iterator` [ `RawExposureData` ]
917 Data structures containing dimension records, filenames, and data
918 IDs to be ingested (one structure for each exposure).
919 bad_files : `list` of `lsst.resources.ResourcePath`
920 List of all the files that could not have metadata extracted.
921 """
922 if pool is None and processes > 1:
923 pool = Pool(processes)
924 mapFunc = map if pool is None else pool.imap_unordered
926 def _partition_good_bad(
927 file_data: Iterable[RawFileData],
928 ) -> Tuple[List[RawFileData], List[ResourcePath]]:
929 """Filter out bad files and return good with list of bad."""
930 good_files = []
931 bad_files = []
932 for fileDatum in self.progress.wrap(file_data, desc="Reading image metadata"):
933 if not fileDatum.datasets:
934 bad_files.append(fileDatum.filename)
935 else:
936 good_files.append(fileDatum)
937 return good_files, bad_files
939 # Look for index files and read them.
940 # There should be far fewer index files than data files.
941 index_entries, files, good_index_files, bad_index_files = self.locateAndReadIndexFiles(files)
942 if bad_index_files:
943 self.log.info("Failed to read the following explicitly requested index files:")
944 for bad in sorted(bad_index_files):
945 self.log.info("- %s", bad)
947 # Now convert all the index file entries to standard form for ingest.
948 processed_bad_index_files: List[ResourcePath] = []
949 indexFileData = self.processIndexEntries(index_entries)
950 if indexFileData:
951 indexFileData, processed_bad_index_files = _partition_good_bad(indexFileData)
952 self.log.info(
953 "Successfully extracted metadata for %d file%s found in %d index file%s with %d failure%s",
954 *_log_msg_counter(indexFileData),
955 *_log_msg_counter(good_index_files),
956 *_log_msg_counter(processed_bad_index_files),
957 )
959 # Extract metadata and build per-detector regions.
960 # This could run in a subprocess so collect all output
961 # before looking at failures.
962 fileData: Iterator[RawFileData] = mapFunc(self.extractMetadata, files)
964 # Filter out all the failed reads and store them for later
965 # reporting.
966 good_file_data, bad_files = _partition_good_bad(fileData)
967 self.log.info(
968 "Successfully extracted metadata from %d file%s with %d failure%s",
969 *_log_msg_counter(good_file_data),
970 *_log_msg_counter(bad_files),
971 )
973 # Combine with data from index files.
974 good_file_data.extend(indexFileData)
975 bad_files.extend(processed_bad_index_files)
976 bad_files.extend(bad_index_files)
978 # Use that metadata to group files (and extracted metadata) by
979 # exposure. Never parallelized because it's intrinsically a gather
980 # step.
981 exposureData: List[RawExposureData] = self.groupByExposure(good_file_data)
983 # The next operation operates on RawExposureData instances (one at
984 # a time) in-place and then returns the modified instance. We call it
985 # as a pass-through instead of relying on the arguments we pass in to
986 # have been modified because in the parallel case those arguments are
987 # going to be pickled and unpickled, and I'm not certain
988 # multiprocessing is careful enough with that for output arguments to
989 # work.
991 # Expand the data IDs to include all dimension metadata; we need this
992 # because we may need to generate path templates that rely on that
993 # metadata.
994 # This is the first step that involves actual database calls (but just
995 # SELECTs), so if there's going to be a problem with connections vs.
996 # multiple processes, or lock contention (in SQLite) slowing things
997 # down, it'll happen here.
998 return mapFunc(self.expandDataIds, exposureData), bad_files
1000 def ingestExposureDatasets(
1001 self,
1002 exposure: RawExposureData,
1003 *,
1004 run: Optional[str] = None,
1005 skip_existing_exposures: bool = False,
1006 track_file_attrs: bool = True,
1007 ) -> List[FileDataset]:
1008 """Ingest all raw files in one exposure.
1010 Parameters
1011 ----------
1012 exposure : `RawExposureData`
1013 A structure containing information about the exposure to be
1014 ingested. Must have `RawExposureData.record` populated and all
1015 data ID attributes expanded.
1016 run : `str`, optional
1017 Name of a RUN-type collection to write to, overriding
1018 ``self.butler.run``.
1019 skip_existing_exposures : `bool`, optional
1020 If `True` (`False` is default), skip raws that have already been
1021 ingested (i.e. raws for which we already have a dataset with the
1022 same data ID in the target collection, even if from another file).
1023 Note that this is much slower than just not passing
1024 already-ingested files as inputs, because we still need to read and
1025 process metadata to identify which exposures to search for. It
1026 also will not work reliably if multiple processes are attempting to
1027 ingest raws from the same exposure concurrently, in that different
1028 processes may still attempt to ingest the same raw and conflict,
1029 causing a failure that prevents other raws from the same exposure
1030 from being ingested.
1031 track_file_attrs : `bool`, optional
1032 Control whether file attributes such as the size or checksum should
1033 be tracked by the datastore. Whether this parameter is honored
1034 depends on the specific datastore implementation.
1036 Returns
1037 -------
1038 datasets : `list` of `lsst.daf.butler.FileDataset`
1039 Per-file structures identifying the files ingested and their
1040 dataset representation in the data repository.
1041 """
1042 if skip_existing_exposures:
1043 existing = {
1044 ref.dataId
1045 for ref in self.butler.registry.queryDatasets(
1046 self.datasetType,
1047 collections=[run],
1048 dataId=exposure.dataId,
1049 )
1050 }
1051 else:
1052 existing = set()
1053 datasets = []
1054 for file in exposure.files:
1055 refs = [DatasetRef(self.datasetType, d.dataId) for d in file.datasets if d.dataId not in existing]
1056 if refs:
1057 datasets.append(
1058 FileDataset(path=file.filename.abspath(), refs=refs, formatter=file.FormatterClass)
1059 )
1061 # Raw files are preferentially ingested using a UUID derived from
1062 # the collection name and dataId.
1063 if self.butler.registry.supportsIdGenerationMode(DatasetIdGenEnum.DATAID_TYPE_RUN):
1064 mode = DatasetIdGenEnum.DATAID_TYPE_RUN
1065 else:
1066 mode = DatasetIdGenEnum.UNIQUE
1067 self.butler.ingest(
1068 *datasets,
1069 transfer=self.config.transfer,
1070 run=run,
1071 idGenerationMode=mode,
1072 record_validation_info=track_file_attrs,
1073 )
1074 return datasets
1076 def ingestFiles(
1077 self,
1078 files: Iterable[ResourcePath],
1079 *,
1080 pool: Optional[PoolType] = None,
1081 processes: int = 1,
1082 run: Optional[str] = None,
1083 skip_existing_exposures: bool = False,
1084 update_exposure_records: bool = False,
1085 track_file_attrs: bool = True,
1086 ) -> Tuple[List[DatasetRef], List[ResourcePath], int, int, int]:
1087 """Ingest files into a Butler data repository.
1089 This creates any new exposure or visit Dimension entries needed to
1090 identify the ingested files, creates new Dataset entries in the
1091 Registry and finally ingests the files themselves into the Datastore.
1092 Any needed instrument, detector, and physical_filter Dimension entries
1093 must exist in the Registry before this method is called.
1095 Parameters
1096 ----------
1097 files : iterable over `lsst.resources.ResourcePath`
1098 URIs to the files to be ingested.
1099 pool : `multiprocessing.Pool`, optional
1100 If not `None`, a process pool with which to parallelize some
1101 operations.
1102 processes : `int`, optional
1103 The number of processes to use. Ignored if ``pool`` is not `None`.
1104 run : `str`, optional
1105 Name of a RUN-type collection to write to, overriding
1106 the default derived from the instrument name.
1107 skip_existing_exposures : `bool`, optional
1108 If `True` (`False` is default), skip raws that have already been
1109 ingested (i.e. raws for which we already have a dataset with the
1110 same data ID in the target collection, even if from another file).
1111 Note that this is much slower than just not passing
1112 already-ingested files as inputs, because we still need to read and
1113 process metadata to identify which exposures to search for. It
1114 also will not work reliably if multiple processes are attempting to
1115 ingest raws from the same exposure concurrently, in that different
1116 processes may still attempt to ingest the same raw and conflict,
1117 causing a failure that prevents other raws from the same exposure
1118 from being ingested.
1119 update_exposure_records : `bool`, optional
1120 If `True` (`False` is default), update existing exposure records
1121 that conflict with the new ones instead of rejecting them. THIS IS
1122 AN ADVANCED OPTION THAT SHOULD ONLY BE USED TO FIX METADATA THAT IS
1123 KNOWN TO BE BAD. This should usually be combined with
1124 ``skip_existing_exposures=True``.
1125 track_file_attrs : `bool`, optional
1126 Control whether file attributes such as the size or checksum should
1127 be tracked by the datastore. Whether this parameter is honored
1128 depends on the specific datastore implementation.
1130 Returns
1131 -------
1132 refs : `list` of `lsst.daf.butler.DatasetRef`
1133 Dataset references for ingested raws.
1134 bad_files : `list` of `ResourcePath`
1135 Given paths that could not be ingested.
1136 n_exposures : `int`
1137 Number of exposures successfully ingested.
1138 n_exposures_failed : `int`
1139 Number of exposures that failed when inserting dimension data.
1140 n_ingests_failed : `int`
1141 Number of exposures that failed when ingesting raw datasets.
1142 """
1144 exposureData, bad_files = self.prep(files, pool=pool, processes=processes)
1146 # Up to this point, we haven't modified the data repository at all.
1147 # Now we finally do that, with one transaction per exposure. This is
1148 # not parallelized at present because the performance of this step is
1149 # limited by the database server. That may or may not change in the
1150 # future once we increase our usage of bulk inserts and reduce our
1151 # usage of savepoints; we've tried to get everything but the database
1152 # operations done in advance to reduce the time spent inside
1153 # transactions.
1154 self.butler.registry.registerDatasetType(self.datasetType)
1156 refs = []
1157 runs = set()
1158 n_exposures = 0
1159 n_exposures_failed = 0
1160 n_ingests_failed = 0
1161 for exposure in self.progress.wrap(exposureData, desc="Ingesting raw exposures"):
1162 assert exposure.record is not None, "Should be guaranteed by prep()"
1163 self.log.debug(
1164 "Attempting to ingest %d file%s from exposure %s:%s",
1165 *_log_msg_counter(exposure.files),
1166 exposure.record.instrument,
1167 exposure.record.obs_id,
1168 )
1170 try:
1171 for name, record in exposure.dependencyRecords.items():
1172 self.butler.registry.syncDimensionData(name, record, update=update_exposure_records)
1173 inserted_or_updated = self.butler.registry.syncDimensionData(
1174 "exposure",
1175 exposure.record,
1176 update=update_exposure_records,
1177 )
1178 except Exception as e:
1179 self._on_ingest_failure(exposure, e)
1180 n_exposures_failed += 1
1181 self.log.warning(
1182 "Exposure %s:%s could not be registered: %s",
1183 exposure.record.instrument,
1184 exposure.record.obs_id,
1185 e,
1186 )
1187 if self.config.failFast:
1188 raise e
1189 continue
1191 if isinstance(inserted_or_updated, dict):
1192 # Exposure is in the registry and we updated it, so
1193 # syncDimensionData returned a dict.
1194 self.log.info(
1195 "Exposure %s:%s was already present, but columns %s were updated.",
1196 exposure.record.instrument,
1197 exposure.record.obs_id,
1198 str(list(inserted_or_updated.keys())),
1199 )
1201 # Use the instrument's default run if none was specified explicitly.
1202 if run is None:
1203 instrument = exposure.files[0].instrument
1204 assert (
1205 instrument is not None
1206 ), "file should have been removed from this list by prep if instrument could not be found"
1207 this_run = instrument.makeDefaultRawIngestRunName()
1208 else:
1209 this_run = run
1210 if this_run not in runs:
1211 self.butler.registry.registerCollection(this_run, type=CollectionType.RUN)
1212 runs.add(this_run)
1213 try:
1214 datasets_for_exposure = self.ingestExposureDatasets(
1215 exposure,
1216 run=this_run,
1217 skip_existing_exposures=skip_existing_exposures,
1218 track_file_attrs=track_file_attrs,
1219 )
1220 except Exception as e:
1221 self._on_ingest_failure(exposure, e)
1222 n_ingests_failed += 1
1223 self.log.warning("Failed to ingest the following for reason: %s", e)
1224 for f in exposure.files:
1225 self.log.warning("- %s", f.filename)
1226 if self.config.failFast:
1227 raise e
1228 continue
1229 else:
1230 self._on_success(datasets_for_exposure)
1231 for dataset in datasets_for_exposure:
1232 refs.extend(dataset.refs)
1234 # Success for this exposure.
1235 n_exposures += 1
1236 self.log.info(
1237 "Exposure %s:%s ingested successfully", exposure.record.instrument, exposure.record.obs_id
1238 )
1240 return refs, bad_files, n_exposures, n_exposures_failed, n_ingests_failed
1242 @timeMethod
1243 def run(
1244 self,
1245 files: Iterable[ResourcePathExpression],
1246 *,
1247 pool: Optional[PoolType] = None,
1248 processes: int = 1,
1249 run: Optional[str] = None,
1250 file_filter: Union[str, re.Pattern] = r"\.fit[s]?\b",
1251 group_files: bool = True,
1252 skip_existing_exposures: bool = False,
1253 update_exposure_records: bool = False,
1254 track_file_attrs: bool = True,
1255 ) -> List[DatasetRef]:
1256 """Ingest files into a Butler data repository.
1258 This creates any new exposure or visit Dimension entries needed to
1259 identify the ingested files, creates new Dataset entries in the
1260 Registry and finally ingests the files themselves into the Datastore.
1261 Any needed instrument, detector, and physical_filter Dimension entries
1262 must exist in the Registry before `run` is called.
1264 Parameters
1265 ----------
1266 files : iterable of `lsst.resources.ResourcePath`, `str` or path-like
1267 Paths to the files to be ingested. Can refer to directories.
1268 Will be made absolute if they are not already.
1269 pool : `multiprocessing.Pool`, optional
1270 If not `None`, a process pool with which to parallelize some
1271 operations.
1272 processes : `int`, optional
1273 The number of processes to use. Ignored if ``pool`` is not `None`.
1274 run : `str`, optional
1275 Name of a RUN-type collection to write to, overriding
1276 the default derived from the instrument name.
1277 file_filter : `str` or `re.Pattern`, optional
1278 Pattern to use to discover files to ingest within directories.
1279 The default is to search for FITS files. The regex applies to
1280 files within the directory.
1281 group_files : `bool`, optional
1282 Group files by directory if they have been discovered in
1283 directories. Will not affect files explicitly provided.
1284 skip_existing_exposures : `bool`, optional
1285 If `True` (`False` is default), skip raws that have already been
1286 ingested (i.e. raws for which we already have a dataset with the
1287 same data ID in the target collection, even if from another file).
1288 Note that this is much slower than just not passing
1289 already-ingested files as inputs, because we still need to read and
1290 process metadata to identify which exposures to search for. It
1291 also will not work reliably if multiple processes are attempting to
1292 ingest raws from the same exposure concurrently, in that different
1293 processes may still attempt to ingest the same raw and conflict,
1294 causing a failure that prevents other raws from the same exposure
1295 from being ingested.
1296 update_exposure_records : `bool`, optional
1297 If `True` (`False` is default), update existing exposure records
1298 that conflict with the new ones instead of rejecting them. THIS IS
1299 AN ADVANCED OPTION THAT SHOULD ONLY BE USED TO FIX METADATA THAT IS
1300 KNOWN TO BE BAD. This should usually be combined with
1301 ``skip_existing_exposures=True``.
1302 track_file_attrs : `bool`, optional
1303 Control whether file attributes such as the size or checksum should
1304 be tracked by the datastore. Whether this parameter is honored
1305 depends on the specific datastore implementation.
1307 Returns
1308 -------
1309 refs : `list` of `lsst.daf.butler.DatasetRef`
1310 Dataset references for ingested raws.
1312 Notes
1313 -----
1314 This method inserts all datasets for an exposure within a transaction,
1315 guaranteeing that partial exposures are never ingested. The exposure
1316 dimension record is inserted with `Registry.syncDimensionData` first
1317 (in its own transaction), which inserts only if a record with the same
1318 primary key does not already exist. This allows different files within
1319 the same exposure to be ingested in different runs.
1320 """
1322 refs = []
1323 bad_files = []
1324 n_exposures = 0
1325 n_exposures_failed = 0
1326 n_ingests_failed = 0
1327 if group_files:
1328 for group in ResourcePath.findFileResources(files, file_filter, group_files):
1329 new_refs, bad, n_exp, n_exp_fail, n_ingest_fail = self.ingestFiles(
1330 group,
1331 pool=pool,
1332 processes=processes,
1333 run=run,
1334 skip_existing_exposures=skip_existing_exposures,
1335 update_exposure_records=update_exposure_records,
1336 track_file_attrs=track_file_attrs,
1337 )
1338 refs.extend(new_refs)
1339 bad_files.extend(bad)
1340 n_exposures += n_exp
1341 n_exposures_failed += n_exp_fail
1342 n_ingests_failed += n_ingest_fail
1343 else:
1344 refs, bad_files, n_exposures, n_exposures_failed, n_ingests_failed = self.ingestFiles(
1345 ResourcePath.findFileResources(files, file_filter, group_files),
1346 pool=pool,
1347 processes=processes,
1348 run=run,
1349 skip_existing_exposures=skip_existing_exposures,
1350 update_exposure_records=update_exposure_records,
1351 )
1353 had_failure = False
1355 if bad_files:
1356 had_failure = True
1357 self.log.warning("Could not extract observation metadata from the following:")
1358 for f in bad_files:
1359 self.log.warning("- %s", f)
1361 self.log.info(
1362 "Successfully processed data from %d exposure%s with %d failure%s from exposure"
1363 " registration and %d failure%s from file ingest.",
1364 *_log_msg_counter(n_exposures),
1365 *_log_msg_counter(n_exposures_failed),
1366 *_log_msg_counter(n_ingests_failed),
1367 )
1368 if n_exposures_failed > 0 or n_ingests_failed > 0:
1369 had_failure = True
1370 self.log.info("Ingested %d distinct Butler dataset%s", *_log_msg_counter(refs))
1372 if had_failure:
1373 raise RuntimeError("Some failures encountered during ingestion")
1375 return refs
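# End-to-end usage sketch (editorial addition; repository path, file list,
# and process count are placeholders): construct a writeable Butler, a
# config, and the task, then ingest.
def _example_raw_ingest() -> None:
    """Illustrate a typical RawIngestTask invocation."""
    butler = Butler("/path/to/repo", writeable=True)
    config = RawIngestConfig()
    config.transfer = "auto"

    def report_metadata_failure(filename: ResourcePath, exc: Exception) -> None:
        # Called inside an ``except`` block; re-raise here to abort ingest.
        print(f"Could not translate metadata for {filename}: {exc}")

    task = RawIngestTask(
        config=config,
        butler=butler,
        on_metadata_failure=report_metadata_failure,
    )
    # ``files`` may mix individual files and directories to scan for FITS.
    refs = task.run(["/path/to/raws"], processes=4)
    print(f"Ingested {len(refs)} dataset{'' if len(refs) == 1 else 's'}")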