Coverage for python/lsst/obs/base/ingest.py: 17%
1# This file is part of obs_base.
2#
3# Developed for the LSST Data Management System.
4# This product includes software developed by the LSST Project
5# (https://www.lsst.org).
6# See the COPYRIGHT file at the top-level directory of this distribution
7# for details of code ownership.
8#
9# This program is free software: you can redistribute it and/or modify
10# it under the terms of the GNU General Public License as published by
11# the Free Software Foundation, either version 3 of the License, or
12# (at your option) any later version.
13#
14# This program is distributed in the hope that it will be useful,
15# but WITHOUT ANY WARRANTY; without even the implied warranty of
16# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
17# GNU General Public License for more details.
18#
19# You should have received a copy of the GNU General Public License
20# along with this program. If not, see <http://www.gnu.org/licenses/>.
23__all__ = ("RawIngestTask", "RawIngestConfig", "makeTransferChoiceField")
25import json
26import re
27from collections import defaultdict
28from dataclasses import InitVar, dataclass
29from multiprocessing import Pool
30from typing import (
31 Any,
32 Callable,
33 ClassVar,
34 Dict,
35 Iterable,
36 Iterator,
37 List,
38 Mapping,
39 Optional,
40 Set,
41 Sized,
42 Tuple,
43 Type,
44 Union,
45)
47from astro_metadata_translator import MetadataTranslator, ObservationInfo, merge_headers
48from astro_metadata_translator.indexing import process_index_data, process_sidecar_data
49from lsst.afw.fits import readMetadata
50from lsst.daf.butler import (
51 Butler,
52 CollectionType,
53 DataCoordinate,
54 DatasetIdGenEnum,
55 DatasetRef,
56 DatasetType,
57 DimensionRecord,
58 DimensionUniverse,
59 FileDataset,
60 Formatter,
61 Progress,
62)
63from lsst.pex.config import ChoiceField, Config, Field
64from lsst.pipe.base import Instrument, Task
65from lsst.resources import ResourcePath, ResourcePathExpression
66from lsst.utils.timer import timeMethod
68from ._instrument import makeExposureRecordFromObsInfo
70# multiprocessing.Pool is actually a function, not a type, and the real type
71# isn't exposed, so we can't use it in annotations; we'll just punt on it via
72# this alias instead.
73PoolType = Any
76def _do_nothing(*args: Any, **kwargs: Any) -> None:
77 """Do nothing.
79 This is a function that accepts anything and does nothing.
80 For use as a default in callback arguments.
81 """
82 pass
85def _log_msg_counter(noun: Union[int, Sized]) -> Tuple[int, str]:
86 """Count the iterable and return the count and plural modifier.
88 Parameters
89 ----------
90 noun : `Sized` or `int`
91 Thing to count. If given an integer it is assumed to be the count
92 to use when calculating the modifier.
94 Returns
95 -------
96 num : `int`
97 Number of items found in ``noun``.
98 modifier : `str`
99 Character to add to the end of a string referring to these items
100 to indicate whether it was a single item or not. Returns empty
101 string if there is one item or "s" otherwise.
103 Examples
104 --------
106 .. code-block:: python
108 log.warning("Found %d file%s", *_log_msg_counter(nfiles))
109 """
110 if isinstance(noun, int):
111 num = noun
112 else:
113 num = len(noun)
114 return num, "" if num == 1 else "s"
117@dataclass
118class RawFileDatasetInfo:
119 """Information about a single dataset within a raw file."""
121 dataId: DataCoordinate
122 """Data ID for this file (`lsst.daf.butler.DataCoordinate`)."""
124 obsInfo: ObservationInfo
125 """Standardized observation metadata extracted directly from the file
126 headers (`astro_metadata_translator.ObservationInfo`).
127 """
130@dataclass
131class RawFileData:
132 """Information about a single raw file, used during ingest."""
134 datasets: List[RawFileDatasetInfo]
135 """The information describing each dataset within this raw file.
136 (`list` of `RawFileDatasetInfo`)
137 """
139 filename: ResourcePath
140 """URI of the file this information was extracted from
(`lsst.resources.ResourcePath`).
142 This is the path prior to ingest, not the path after ingest.
143 """
145 FormatterClass: Type[Formatter]
146 """Formatter class that should be used to ingest this file (`type`; a
147 subclass of `Formatter`).
148 """
150 instrument: Optional[Instrument]
151 """The `Instrument` instance associated with this file. Can be `None`
152 if ``datasets`` is an empty list."""
155@dataclass
156class RawExposureData:
157 """Information about a complete raw exposure, used during ingest."""
159 dataId: DataCoordinate
160 """Data ID for this exposure (`lsst.daf.butler.DataCoordinate`).
161 """
163 files: List[RawFileData]
164 """List of structures containing file-level information.
165 """
167 universe: InitVar[DimensionUniverse]
168 """Set of all known dimensions.
169 """
171 record: DimensionRecord
172 """The exposure `DimensionRecord` that must be inserted into the
173 `~lsst.daf.butler.Registry` prior to file-level ingest (`DimensionRecord`).
174 """
176 dependencyRecords: Dict[str, DimensionRecord]
177 """Additional records that must be inserted into the
178 `~lsst.daf.butler.Registry` prior to ingesting the exposure ``record``
179 (e.g., to satisfy foreign key constraints), indexed by the dimension name.
180 """
183def makeTransferChoiceField(
184 doc: str = "How to transfer files (None for no transfer).", default: str = "auto"
185) -> ChoiceField:
186 """Create a Config field with options for transferring data between repos.
188 The allowed options for the field are exactly those supported by
189 `lsst.daf.butler.Datastore.ingest`.
191 Parameters
192 ----------
193 doc : `str`
194 Documentation for the configuration field.
195 default : `str`, optional
196 Default transfer mode for the field.
198 Returns
199 -------
200 field : `lsst.pex.config.ChoiceField`
201 Configuration field.
202 """
203 return ChoiceField(
204 doc=doc,
205 dtype=str,
206 allowed={
207 "move": "move",
208 "copy": "copy",
209 "auto": "choice will depend on datastore",
210 "direct": "use URI to ingested file directly in datastore",
211 "link": "hard link falling back to symbolic link",
212 "hardlink": "hard link",
213 "symlink": "symbolic (soft) link",
214 "relsymlink": "relative symbolic link",
215 },
216 optional=True,
217 default=default,
218 )
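# Editor's note: the short example below is an editorial addition, not part of
# the original module. It is a minimal sketch of how makeTransferChoiceField is
# typically attached to a pex_config Config; ``ExampleTransferConfig`` is a
# hypothetical name.
def _example_transfer_choice_field() -> None:
    """Illustrate attaching the transfer field to a config and reading it."""

    class ExampleTransferConfig(Config):
        transfer = makeTransferChoiceField(default="link")

    example = ExampleTransferConfig()
    # "link" means hard link with a fallback to a symbolic link.
    assert example.transfer == "link"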
221class RawIngestConfig(Config):
222 """Configuration class for RawIngestTask."""
224 transfer = makeTransferChoiceField()
225 failFast = Field(
226 dtype=bool,
227 default=False,
228 doc="If True, stop ingest as soon as any problem is encountered with any file. "
229 "Otherwise problem files will be skipped and logged and a report issued at completion.",
230 )
233class RawIngestTask(Task):
234 """Driver Task for ingesting raw data into Gen3 Butler repositories.
236 Parameters
237 ----------
238 config : `RawIngestConfig`
239 Configuration for the task.
240 butler : `~lsst.daf.butler.Butler`
241 Writeable butler instance, with ``butler.run`` set to the appropriate
242 `~lsst.daf.butler.CollectionType.RUN` collection for these raw
243 datasets.
244 on_success : `Callable`, optional
245 A callback invoked when all of the raws associated with an exposure
246 are ingested. Will be passed a list of `FileDataset` objects, each
247 containing one or more resolved `DatasetRef` objects. If this callback
248 raises it will interrupt the entire ingest process, even if
249 `RawIngestConfig.failFast` is `False`.
250 on_metadata_failure : `Callable`, optional
251 A callback invoked when a failure occurs trying to translate the
252 metadata for a file. Will be passed the URI and the exception, in
253 that order, as positional arguments. Guaranteed to be called in an
254 ``except`` block, allowing the callback to re-raise or replace (with
255 ``raise ... from``) to override the task's usual error handling (before
256 `RawIngestConfig.failFast` logic occurs).
257 on_ingest_failure : `Callable`, optional
258 A callback invoked when dimension record or dataset insertion into the
259 database fails for an exposure. Will be passed a `RawExposureData`
260 instance and the exception, in that order, as positional arguments.
261 Guaranteed to be called in an ``except`` block, allowing the callback
262 to re-raise or replace (with ``raise ... from``) to override the task's
263 usual error handling (before `RawIngestConfig.failFast` logic occurs).
264 **kwargs
265 Additional keyword arguments are forwarded to the `lsst.pipe.base.Task`
266 constructor.
268 Notes
269 -----
270 Each instance of `RawIngestTask` writes to the same Butler. Each
271 invocation of `RawIngestTask.run` ingests a list of files.
272 """
274 ConfigClass: ClassVar[Type[Config]] = RawIngestConfig
276 _DefaultName: ClassVar[str] = "ingest"
278 def getDatasetType(self) -> DatasetType:
279 """Return the DatasetType of the datasets ingested by this Task."""
280 return DatasetType(
281 "raw",
282 ("instrument", "detector", "exposure"),
283 "Exposure",
284 universe=self.butler.registry.dimensions,
285 )
287 def __init__(
288 self,
289 config: RawIngestConfig,
290 *,
291 butler: Butler,
292 on_success: Callable[[List[FileDataset]], Any] = _do_nothing,
293 on_metadata_failure: Callable[[ResourcePath, Exception], Any] = _do_nothing,
294 on_ingest_failure: Callable[[RawExposureData, Exception], Any] = _do_nothing,
295 **kwargs: Any,
296 ):
297 config.validate() # Not a CmdlineTask nor PipelineTask, so have to validate the config here.
298 super().__init__(config, **kwargs)
299 self.butler = butler
300 self.universe = self.butler.registry.dimensions
301 self.datasetType = self.getDatasetType()
302 self._on_success = on_success
303 self._on_metadata_failure = on_metadata_failure
304 self._on_ingest_failure = on_ingest_failure
305 self.progress = Progress("obs.base.RawIngestTask")
307 # Import all the instrument classes so that we ensure that we
308 # have all the relevant metadata translators loaded.
309 Instrument.importAll(self.butler.registry)
311 def _reduce_kwargs(self) -> Dict[str, Any]:
312 # Add extra parameters to pickle.
313 return dict(
314 **super()._reduce_kwargs(),
315 butler=self.butler,
316 on_success=self._on_success,
317 on_metadata_failure=self._on_metadata_failure,
318 on_ingest_failure=self._on_ingest_failure,
319 )
321 def _determine_instrument_formatter(
322 self, dataId: DataCoordinate, filename: ResourcePath
323 ) -> Tuple[Optional[Instrument], Type[Formatter]]:
324 """Determine the instrument and formatter class.
326 Parameters
327 ----------
328 dataId : `lsst.daf.butler.DataCoordinate`
329 The dataId associated with this dataset.
330 filename : `lsst.resources.ResourcePath`
331 URI of file used for error reporting.
333 Returns
334 -------
335 instrument : `Instrument` or `None`
336 Instance of the `Instrument` associated with this dataset. `None`
337 indicates that the instrument could not be determined.
338 formatterClass : `type`
339 Class to be used as the formatter for this dataset.
340 """
341 # The data model currently assumes that whilst multiple datasets
342 # can be associated with a single file, they must all share the
343 # same formatter.
344 try:
345 instrument = Instrument.fromName(dataId["instrument"], self.butler.registry) # type: ignore
346 except LookupError as e:
347 self._on_metadata_failure(filename, e)
348 self.log.warning(
349 "Instrument %s for file %s not known to registry", dataId["instrument"], filename
350 )
351 if self.config.failFast:
352 raise RuntimeError(
353 f"Instrument {dataId['instrument']} for file {filename} not known to registry"
354 ) from e
355 FormatterClass = Formatter
356 # Indicate that we could not work out the instrument.
357 instrument = None
358 else:
359 assert instrument is not None, "Should be guaranteed by fromName succeeding."
360 FormatterClass = instrument.getRawFormatter(dataId)
361 return instrument, FormatterClass
363 def extractMetadata(self, filename: ResourcePath) -> RawFileData:
364 """Extract and process metadata from a single raw file.
366 Parameters
367 ----------
368 filename : `lsst.resources.ResourcePath`
369 URI to the file.
371 Returns
372 -------
373 data : `RawFileData`
374 A structure containing the metadata extracted from the file,
375 as well as the original filename. All fields will be populated,
376 but the `RawFileData.dataId` attribute will be a minimal
377 (unexpanded) `~lsst.daf.butler.DataCoordinate` instance. The
378 ``instrument`` field will be `None` if there is a problem
379 with metadata extraction.
381 Notes
382 -----
383 Assumes that there is a single dataset associated with the given
384 file. Instruments using a single file to store multiple datasets
385 must implement their own version of this method.
387 By default the method will catch all exceptions unless the ``failFast``
388 configuration item is `True`. If an error is encountered the
389 `_on_metadata_failure()` method will be called. If no exceptions
390 result (i.e. the error is not re-raised), the returned object will have
391 its instrument set to `None` and an empty dataset list.
393 This method supports sidecar JSON files which can be used to
394 extract metadata without having to read the data file itself.
395 The sidecar file is always used if found.
396 """
397 sidecar_fail_msg = "" # Requires prepended space when set.
398 try:
399 sidecar_file = filename.updatedExtension(".json")
400 if sidecar_file.exists():
401 content = json.loads(sidecar_file.read())
402 headers = [process_sidecar_data(content)]
403 sidecar_fail_msg = " (via sidecar)"
404 else:
405 # Read the metadata from the data file itself.
407 # For remote files download the entire file to get the
408 # header. This is very inefficient and it would be better
409 # to have some way of knowing where in the file the headers
410 # are and to only download those parts of the file.
411 with filename.as_local() as local_file:
412 # Read the primary. This might be sufficient.
413 header = readMetadata(local_file.ospath, 0)
415 try:
416 # Try to work out a translator class early.
417 translator_class = MetadataTranslator.determine_translator(header, filename=filename)
418 except ValueError:
419 # Primary header was not sufficient (maybe this file
420 # has been compressed or is a MEF with minimal
421 # primary). Read second header and merge with primary.
422 header = merge_headers([header, readMetadata(local_file.ospath, 1)], mode="overwrite")
424 # Try again to work out a translator class, letting this
425 # fail.
426 translator_class = MetadataTranslator.determine_translator(header, filename=filename)
428 # Request the headers to use for ingest
429 headers = translator_class.determine_translatable_headers(filename.ospath, header)
431 # Add each header to the dataset list
432 datasets = [self._calculate_dataset_info(h, filename) for h in headers]
434 except Exception as e:
435 self.log.debug("Problem extracting metadata from %s%s: %s", filename, sidecar_fail_msg, e)
436 # Indicate to the caller that we failed to read.
437 datasets = []
438 formatterClass = Formatter
439 instrument = None
440 self._on_metadata_failure(filename, e)
441 if self.config.failFast:
442 raise RuntimeError(
443 f"Problem extracting metadata for file {filename}{sidecar_fail_msg}"
444 ) from e
445 else:
446 self.log.debug("Extracted metadata for file %s%s", filename, sidecar_fail_msg)
447 # The data model currently assumes that whilst multiple datasets
448 # can be associated with a single file, they must all share the
449 # same formatter.
450 instrument, formatterClass = self._determine_instrument_formatter(datasets[0].dataId, filename)
451 if instrument is None:
452 datasets = []
454 return RawFileData(
455 datasets=datasets,
456 filename=filename,
457 # MyPy wants this to be a non-abstract class, which is not true
458 # for the error case where instrument is None and datasets=[].
459 FormatterClass=formatterClass, # type: ignore
460 instrument=instrument,
461 )
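    # Editor's note: illustrative example method added by the editor; it is not
    # part of the original class. It spells out the sidecar convention used by
    # extractMetadata() above; the raw-file path is hypothetical.
    @staticmethod
    def _example_sidecar_path() -> ResourcePath:
        """Return the JSON sidecar that would be preferred for a raw file."""
        raw_file = ResourcePath("file:///data/raws/exposure_0001.fits")
        # The sidecar shares the raw file's name with the extension replaced by
        # .json, and is read instead of the FITS headers when it exists.
        return raw_file.updatedExtension(".json")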
463 @classmethod
464 def getObservationInfoSubsets(cls) -> Tuple[Set, Set]:
465 """Return subsets of fields in the `ObservationInfo` that we care about
467 These fields will be used in constructing an exposure record.
469 Returns
470 -------
471 required : `set`
472 Set of `ObservationInfo` field names that are required.
473 optional : `set`
474 Set of `ObservationInfo` field names we will use if they are
475 available.
476 """
477 required = {
478 "datetime_begin",
479 "datetime_end",
480 "detector_num",
481 "exposure_id",
482 "exposure_time",
483 "instrument",
484 "observation_id",
485 "observation_type",
486 "physical_filter",
487 }
488 optional = {
489 "altaz_begin",
490 "boresight_rotation_coord",
491 "boresight_rotation_angle",
492 "dark_time",
493 "exposure_group",
494 "tracking_radec",
495 "object",
496 "observation_counter",
497 "observation_reason",
498 "observing_day",
499 "science_program",
500 "visit_id",
501 }
502 return required, optional
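    # Editor's note: illustrative example method added by the editor; it is not
    # part of the original class. It shows how the subsets above are consumed
    # when translating headers (mirroring _calculate_dataset_info below);
    # ``example_header`` stands in for a real FITS header mapping.
    @classmethod
    def _example_translate_subsets(cls, example_header: Mapping[str, Any]) -> ObservationInfo:
        """Translate only the ObservationInfo fields named by the subsets."""
        required, optional = cls.getObservationInfoSubsets()
        return ObservationInfo(
            example_header,
            pedantic=False,
            required=required,
            subset=required | optional,
        )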
504 def _calculate_dataset_info(
505 self, header: Union[Mapping[str, Any], ObservationInfo], filename: ResourcePath
506 ) -> RawFileDatasetInfo:
507 """Calculate a RawFileDatasetInfo from the supplied information.
509 Parameters
510 ----------
511 header : Mapping or `astro_metadata_translator.ObservationInfo`
512 Header from the dataset or previously-translated content.
513 filename : `lsst.resources.ResourcePath`
514 Filename to use for error messages.
516 Returns
517 -------
518 dataset : `RawFileDatasetInfo`
519 The dataId and observation information associated with this
520 dataset.
521 """
522 required, optional = self.getObservationInfoSubsets()
523 if isinstance(header, ObservationInfo):
524 obsInfo = header
525 missing = []
526 # Need to check the required properties are present.
527 for property in required:
528 # getattr does not need to be protected because it is using
529 # the defined list above containing properties that must exist.
530 value = getattr(obsInfo, property)
531 if value is None:
532 missing.append(property)
533 if missing:
534 raise ValueError(
535 f"Requested required properties are missing from file {filename}:"
536 f" {missing} (via JSON)"
537 )
539 else:
540 obsInfo = ObservationInfo(
541 header,
542 pedantic=False,
543 filename=str(filename),
544 required=required,
545 subset=required | optional,
546 )
548 dataId = DataCoordinate.standardize(
549 instrument=obsInfo.instrument,
550 exposure=obsInfo.exposure_id,
551 detector=obsInfo.detector_num,
552 universe=self.universe,
553 )
554 return RawFileDatasetInfo(obsInfo=obsInfo, dataId=dataId)
556 def locateAndReadIndexFiles(
557 self, files: Iterable[ResourcePath]
558 ) -> Tuple[Dict[ResourcePath, Any], List[ResourcePath], Set[ResourcePath], Set[ResourcePath]]:
559 """Given a list of files, look for index files and read them.
561 Index files can either be explicitly in the list of files to
562 ingest, or else located in the same directory as a file to ingest.
563 Index entries are always used if present.
565 Parameters
566 ----------
567 files : iterable over `lsst.resources.ResourcePath`
568 URIs to the files to be ingested.
570 Returns
571 -------
572 index : `dict` [`ResourcePath`, Any]
573 Merged contents of all relevant index files found. These can
574 be explicitly specified index files or ones found in the
575 directory alongside a data file to be ingested.
576 updated_files : `list` of `ResourcePath`
577 Updated list of the input files with entries removed that were
578 found listed in an index file. Order is not guaranteed to
579 match the order of the files given to this routine.
580 good_index_files : `set` [ `ResourcePath` ]
581 Index files that were successfully read.
582 bad_index_files : `set` [ `ResourcePath` ]
583 Files that looked like index files but failed to read properly.
584 """
585 # Convert the paths to absolute for easy comparison with index content.
586 # Do not convert to real paths since we have to assume that index
587 files are in this location and not the location they link to.
588 files = tuple(f.abspath() for f in files)
590 # Index files must be named this.
591 index_root_file = "_index.json"
593 # Group the files by directory.
594 files_by_directory = defaultdict(set)
596 for path in files:
597 directory, file_in_dir = path.split()
598 files_by_directory[directory].add(file_in_dir)
600 # All the metadata read from index files with keys of full path.
601 index_entries: Dict[ResourcePath, Any] = {}
603 # Index files we failed to read.
604 bad_index_files = set()
606 # Any good index files that were found and used.
607 good_index_files = set()
609 # Look for index files in those directories.
610 for directory, files_in_directory in files_by_directory.items():
611 possible_index_file = directory.join(index_root_file)
612 if possible_index_file.exists():
613 # If we are explicitly requesting an index file the
614 # messages should be different.
615 index_msg = "inferred"
616 is_implied = True
617 if index_root_file in files_in_directory:
618 index_msg = "explicit"
619 is_implied = False
621 # Try to read the index file and catch and report any
622 # problems.
623 try:
624 content = json.loads(possible_index_file.read())
625 index = process_index_data(content, force_dict=True)
626 except Exception as e:
627 # Only trigger the callback if the index file
628 # was asked for explicitly. Triggering on implied file
629 # might be surprising.
630 if not is_implied:
631 self._on_metadata_failure(possible_index_file, e)
632 if self.config.failFast:
633 raise RuntimeError(
634 f"Problem reading index file from {index_msg} location {possible_index_file}"
635 ) from e
636 bad_index_files.add(possible_index_file)
637 continue
639 self.log.debug("Extracted index metadata from %s file %s", index_msg, possible_index_file)
640 good_index_files.add(possible_index_file)
642 # Go through the index adding entries for files.
643 # If we have non-index files in this directory marked for
644 # ingest we should only get index information for those.
645 # If the index file was explicit we use all entries.
646 if is_implied:
647 files_to_ingest = files_in_directory
648 else:
649 files_to_ingest = set(index)
651 # Copy relevant metadata into a single dict for all index
652 # entries.
653 for file_in_dir in files_to_ingest:
654 # Skip an explicitly specified index file.
655 # This should never happen because an explicit index
656 # file will force ingest of all files in the index
657 # and not use the explicit file list. If somehow
658 # this is not true we continue. Raising an exception
659 # seems like the wrong thing to do since this is harmless.
660 if file_in_dir == index_root_file:
661 self.log.info(
662 "Logic error found scanning directory %s. Please file ticket.", directory
663 )
664 continue
665 if file_in_dir in index:
666 file = directory.join(file_in_dir)
667 if file in index_entries:
668 # ObservationInfo overrides raw metadata
669 if isinstance(index[file_in_dir], ObservationInfo) and not isinstance(
670 index_entries[file], ObservationInfo
671 ):
672 self.log.warning(
673 "File %s already specified in an index file but overriding"
674 " with ObservationInfo content from %s",
675 file,
676 possible_index_file,
677 )
678 else:
679 self.log.warning(
680 "File %s already specified in an index file, ignoring content from %s",
681 file,
682 possible_index_file,
683 )
684 # Do nothing in this case
685 continue
687 index_entries[file] = index[file_in_dir]
689 # Remove files from list that have index entries and also
690 # any files that we determined to be explicit index files
691 # or any index files that we failed to read.
692 filtered = set(files) - set(index_entries) - good_index_files - bad_index_files
694 # The filtered list loses the initial order. Retaining the order
695 # is good for testing but does have a cost if there are many
696 # files when copying the good values out. A dict would have faster
697 # lookups (using the files as keys) but use more memory.
698 ordered = [f for f in filtered if f in files]
700 return index_entries, ordered, good_index_files, bad_index_files
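    # Editor's note: illustrative example method added by the editor; it is not
    # part of the original class. It shows the per-directory index-file naming
    # convention relied on above; the data file passed in is hypothetical.
    @staticmethod
    def _example_index_file_for(data_file: ResourcePath) -> ResourcePath:
        """Return the _index.json file that would be checked for a data file."""
        directory, _ = data_file.split()
        # Entries from this file, when present, are used in place of reading
        # the data file's own headers.
        return directory.join("_index.json")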
702 def processIndexEntries(self, index_entries: Dict[ResourcePath, Any]) -> List[RawFileData]:
703 """Convert index entries to RawFileData.
705 Parameters
706 ----------
707 index_entries : `dict` [`ResourcePath`, Any]
708 Dict indexed by name of file to ingest and with values either
709 raw metadata or translated
710 `~astro_metadata_translator.ObservationInfo`.
712 Returns
713 -------
714 data : `list` [ `RawFileData` ]
715 Structures containing the metadata extracted from the file,
716 as well as the original filename. All fields will be populated,
717 but the `RawFileData.dataId` attributes will be minimal
718 (unexpanded) `~lsst.daf.butler.DataCoordinate` instances.
719 """
720 fileData = []
721 for filename, metadata in index_entries.items():
722 try:
723 datasets = [self._calculate_dataset_info(metadata, filename)]
724 except Exception as e:
725 self.log.debug("Problem extracting metadata for file %s found in index file: %s", filename, e)
726 datasets = []
727 formatterClass = Formatter
728 instrument = None
729 self._on_metadata_failure(filename, e)
730 if self.config.failFast:
731 raise RuntimeError(
732 f"Problem extracting metadata for file {filename} found in index file"
733 ) from e
734 else:
735 instrument, formatterClass = self._determine_instrument_formatter(
736 datasets[0].dataId, filename
737 )
738 if instrument is None:
739 datasets = []
740 fileData.append(
741 RawFileData(
742 datasets=datasets,
743 filename=filename,
744 # MyPy wants this to be a non-abstract class, which is not
745 # true for the error case where instrument is None and
746 # datasets=[].
747 FormatterClass=formatterClass, # type: ignore
748 instrument=instrument,
749 )
750 )
751 return fileData
753 def groupByExposure(self, files: Iterable[RawFileData]) -> List[RawExposureData]:
754 """Group an iterable of `RawFileData` by exposure.
756 Parameters
757 ----------
758 files : iterable of `RawFileData`
759 File-level information to group.
761 Returns
762 -------
763 exposures : `list` of `RawExposureData`
764 A list of structures that group the file-level information by
765 exposure. All fields will be populated. The
766 `RawExposureData.dataId` attributes will be minimal (unexpanded)
767 `~lsst.daf.butler.DataCoordinate` instances.
768 """
769 exposureDimensions = self.universe["exposure"].graph
770 byExposure = defaultdict(list)
771 for f in files:
772 # Assume that the first dataset is representative for the file.
773 byExposure[f.datasets[0].dataId.subset(exposureDimensions)].append(f)
775 return [
776 RawExposureData(
777 dataId=dataId,
778 files=exposureFiles,
779 universe=self.universe,
780 record=self.makeExposureRecord(exposureFiles[0].datasets[0].obsInfo, self.universe),
781 dependencyRecords=self.makeDependencyRecords(
782 exposureFiles[0].datasets[0].obsInfo, self.universe
783 ),
784 )
785 for dataId, exposureFiles in byExposure.items()
786 ]
788 def makeExposureRecord(
789 self, obsInfo: ObservationInfo, universe: DimensionUniverse, **kwargs: Any
790 ) -> DimensionRecord:
791 """Construct a registry record for an exposure.
793 This is a method that subclasses will often want to customize. This can
794 often be done by calling this base class implementation with additional
795 ``kwargs``.
797 Parameters
798 ----------
799 obsInfo : `ObservationInfo`
800 Observation details for (one of the components of) the exposure.
801 universe : `DimensionUniverse`
802 Set of all known dimensions.
803 **kwargs
804 Additional field values for this record.
806 Returns
807 -------
808 record : `DimensionRecord`
809 The exposure record that must be inserted into the
810 `~lsst.daf.butler.Registry` prior to file-level ingest.
811 """
812 return makeExposureRecordFromObsInfo(obsInfo, universe, **kwargs)
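    # Editor's note: illustrative example method added by the editor; it is not
    # part of the original class. It shows the customization pattern described
    # in the docstring above: re-call the base implementation with extra field
    # values. ``has_simulated`` is only a placeholder; a subclass would pass
    # whatever exposure fields its instrument actually needs to set.
    def _example_exposure_record_with_extras(
        self, obsInfo: ObservationInfo, universe: DimensionUniverse
    ) -> DimensionRecord:
        """Build an exposure record while supplying one extra field value."""
        # In a subclass this call would live inside makeExposureRecord() itself.
        return self.makeExposureRecord(obsInfo, universe, has_simulated=False)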
814 def makeDependencyRecords(
815 self, obsInfo: ObservationInfo, universe: DimensionUniverse
816 ) -> Dict[str, DimensionRecord]:
817 """Construct dependency records.
819 These dependency records will be inserted into the
820 `~lsst.daf.butler.Registry` before the exposure records, because they
821 are dependencies of the exposure. This allows an opportunity to satisfy
822 foreign key constraints that exist because of dimensions related to the
823 exposure.
825 This is a method that subclasses may want to customize, if they've
826 added dimensions that relate to an exposure.
828 Parameters
829 ----------
830 obsInfo : `ObservationInfo`
831 Observation details for (one of the components of) the exposure.
832 universe : `DimensionUniverse`
833 Set of all known dimensions.
835 Returns
836 -------
837 records : `dict` [`str`, `DimensionRecord`]
838 The records to insert, indexed by dimension name.
839 """
840 return {}
842 def expandDataIds(self, data: RawExposureData) -> RawExposureData:
843 """Expand the data IDs associated with a raw exposure.
845 This adds the metadata records.
847 Parameters
848 ----------
849 data : `RawExposureData`
850 A structure containing information about the exposure to be
851 ingested. Must have `RawExposureData.record` populated. Should
852 be considered consumed upon return.
854 Returns
855 -------
856 exposure : `RawExposureData`
857 An updated version of the input structure, with
858 `RawExposureData.dataId` and nested `RawFileData.dataId` attributes
859 updated to data IDs for which
860 `~lsst.daf.butler.DataCoordinate.hasRecords` returns `True`.
861 """
862 # We start by expanding the exposure-level data ID; we won't use that
863 # directly in file ingest, but this lets us do some database lookups
864 # once per exposure instead of once per file later.
865 data.dataId = self.butler.registry.expandDataId(
866 data.dataId,
867 # We pass in the records we'll be inserting shortly so they aren't
868 # looked up from the database. We do expect instrument and filter
869 # records to be retrieved from the database here (though the
870 # Registry may cache them so there isn't a lookup every time).
871 records={"exposure": data.record},
872 )
873 # Now we expand the per-file (exposure+detector) data IDs. This time
874 # we pass in the records we just retrieved from the exposure data ID
875 # expansion.
876 for file in data.files:
877 for dataset in file.datasets:
878 dataset.dataId = self.butler.registry.expandDataId(
879 dataset.dataId, records=data.dataId.records
880 )
881 return data
883 def prep(
884 self, files: Iterable[ResourcePath], *, pool: Optional[PoolType] = None, processes: int = 1
885 ) -> Tuple[Iterator[RawExposureData], List[ResourcePath]]:
886 """Perform all non-database-updating ingest preprocessing steps.
888 Parameters
889 ----------
890 files : iterable over `lsst.resources.ResourcePath`
891 URIs to the files to be ingested. Will be made absolute
892 if they are not already.
893 pool : `multiprocessing.Pool`, optional
894 If not `None`, a process pool with which to parallelize some
895 operations.
896 processes : `int`, optional
897 The number of processes to use. Ignored if ``pool`` is not `None`.
899 Returns
900 -------
901 exposures : `Iterator` [ `RawExposureData` ]
902 Data structures containing dimension records, filenames, and data
903 IDs to be ingested (one structure for each exposure).
904 bad_files : `list` of `lsst.resources.ResourcePath`
905 List of all the files that could not have metadata extracted.
906 """
907 if pool is None and processes > 1:
908 pool = Pool(processes)
909 mapFunc = map if pool is None else pool.imap_unordered
911 def _partition_good_bad(
912 file_data: Iterable[RawFileData],
913 ) -> Tuple[List[RawFileData], List[ResourcePath]]:
914 """Filter out bad files, returning the good files and a list of the bad."""
915 good_files = []
916 bad_files = []
917 for fileDatum in self.progress.wrap(file_data, desc="Reading image metadata"):
918 if not fileDatum.datasets:
919 bad_files.append(fileDatum.filename)
920 else:
921 good_files.append(fileDatum)
922 return good_files, bad_files
924 # Look for index files and read them.
925 # There should be far fewer index files than data files.
926 index_entries, files, good_index_files, bad_index_files = self.locateAndReadIndexFiles(files)
927 if bad_index_files:
928 self.log.info("Failed to read the following explicitly requested index files:")
929 for bad in sorted(bad_index_files):
930 self.log.info("- %s", bad)
932 # Now convert all the index file entries to standard form for ingest.
933 processed_bad_index_files: List[ResourcePath] = []
934 indexFileData = self.processIndexEntries(index_entries)
935 if indexFileData:
936 indexFileData, processed_bad_index_files = _partition_good_bad(indexFileData)
937 self.log.info(
938 "Successfully extracted metadata for %d file%s found in %d index file%s with %d failure%s",
939 *_log_msg_counter(indexFileData),
940 *_log_msg_counter(good_index_files),
941 *_log_msg_counter(processed_bad_index_files),
942 )
944 # Extract metadata and build per-detector regions.
945 # This could run in a subprocess so collect all output
946 # before looking at failures.
947 fileData: Iterator[RawFileData] = mapFunc(self.extractMetadata, files)
949 # Filter out all the failed reads and store them for later
950 # reporting.
951 good_file_data, bad_files = _partition_good_bad(fileData)
952 self.log.info(
953 "Successfully extracted metadata from %d file%s with %d failure%s",
954 *_log_msg_counter(good_file_data),
955 *_log_msg_counter(bad_files),
956 )
958 # Combine with data from index files.
959 good_file_data.extend(indexFileData)
960 bad_files.extend(processed_bad_index_files)
961 bad_files.extend(bad_index_files)
963 # Use that metadata to group files (and extracted metadata) by
964 # exposure. Never parallelized because it's intrinsically a gather
965 # step.
966 exposureData: List[RawExposureData] = self.groupByExposure(good_file_data)
968 # The next operation operates on RawExposureData instances (one at
969 # a time) in-place and then returns the modified instance. We call it
970 # as a pass-through instead of relying on the arguments we pass in to
971 # have been modified because in the parallel case those arguments are
972 # going to be pickled and unpickled, and I'm not certain
973 # multiprocessing is careful enough with that for output arguments to
974 # work.
976 # Expand the data IDs to include all dimension metadata; we need this
977 # because we may need to generate path templates that rely on that
978 # metadata.
979 # This is the first step that involves actual database calls (but just
980 # SELECTs), so if there's going to be a problem with connections vs.
981 # multiple processes, or lock contention (in SQLite) slowing things
982 # down, it'll happen here.
983 return mapFunc(self.expandDataIds, exposureData), bad_files
985 def ingestExposureDatasets(
986 self,
987 exposure: RawExposureData,
988 *,
989 run: Optional[str] = None,
990 skip_existing_exposures: bool = False,
991 track_file_attrs: bool = True,
992 ) -> List[FileDataset]:
993 """Ingest all raw files in one exposure.
995 Parameters
996 ----------
997 exposure : `RawExposureData`
998 A structure containing information about the exposure to be
999 ingested. Must have `RawExposureData.record` populated and all
1000 data ID attributes expanded.
1001 run : `str`, optional
1002 Name of a RUN-type collection to write to, overriding
1003 ``self.butler.run``.
1004 skip_existing_exposures : `bool`, optional
1005 If `True` (`False` is default), skip raws that have already been
1006 ingested (i.e. raws for which we already have a dataset with the
1007 same data ID in the target collection, even if from another file).
1008 Note that this is much slower than just not passing
1009 already-ingested files as inputs, because we still need to read and
1010 process metadata to identify which exposures to search for. It
1011 also will not work reliably if multiple processes are attempting to
1012 ingest raws from the same exposure concurrently, in that different
1013 processes may still attempt to ingest the same raw and conflict,
1014 causing a failure that prevents other raws from the same exposure
1015 from being ingested.
1016 track_file_attrs : `bool`, optional
1017 Control whether file attributes such as the size or checksum should
1018 be tracked by the datastore. Whether this parameter is honored
1019 depends on the specific datastore implementation.
1021 Returns
1022 -------
1023 datasets : `list` of `lsst.daf.butler.FileDataset`
1024 Per-file structures identifying the files ingested and their
1025 dataset representation in the data repository.
1026 """
1027 if skip_existing_exposures:
1028 existing = {
1029 ref.dataId
1030 for ref in self.butler.registry.queryDatasets(
1031 self.datasetType,
1032 collections=[run],
1033 dataId=exposure.dataId,
1034 )
1035 }
1036 else:
1037 existing = set()
1038 datasets = []
1039 for file in exposure.files:
1040 refs = [DatasetRef(self.datasetType, d.dataId) for d in file.datasets if d.dataId not in existing]
1041 if refs:
1042 datasets.append(
1043 FileDataset(path=file.filename.abspath(), refs=refs, formatter=file.FormatterClass)
1044 )
1046 # Raw files are preferentially ingested using a UUID derived from
1047 # the collection name and dataId.
1048 if self.butler.registry.supportsIdGenerationMode(DatasetIdGenEnum.DATAID_TYPE_RUN):
1049 mode = DatasetIdGenEnum.DATAID_TYPE_RUN
1050 else:
1051 mode = DatasetIdGenEnum.UNIQUE
1052 self.butler.ingest(
1053 *datasets,
1054 transfer=self.config.transfer,
1055 run=run,
1056 idGenerationMode=mode,
1057 record_validation_info=track_file_attrs,
1058 )
1059 return datasets
1061 def ingestFiles(
1062 self,
1063 files: Iterable[ResourcePath],
1064 *,
1065 pool: Optional[PoolType] = None,
1066 processes: int = 1,
1067 run: Optional[str] = None,
1068 skip_existing_exposures: bool = False,
1069 update_exposure_records: bool = False,
1070 track_file_attrs: bool = True,
1071 ) -> Tuple[List[DatasetRef], List[ResourcePath], int, int, int]:
1072 """Ingest files into a Butler data repository.
1074 This creates any new exposure or visit Dimension entries needed to
1075 identify the ingested files, creates new Dataset entries in the
1076 Registry and finally ingests the files themselves into the Datastore.
1077 Any needed instrument, detector, and physical_filter Dimension entries
1078 must exist in the Registry before `run` is called.
1080 Parameters
1081 ----------
1082 files : iterable over `lsst.resources.ResourcePath`
1083 URIs to the files to be ingested.
1084 pool : `multiprocessing.Pool`, optional
1085 If not `None`, a process pool with which to parallelize some
1086 operations.
1087 processes : `int`, optional
1088 The number of processes to use. Ignored if ``pool`` is not `None`.
1089 run : `str`, optional
1090 Name of a RUN-type collection to write to, overriding
1091 the default derived from the instrument name.
1092 skip_existing_exposures : `bool`, optional
1093 If `True` (`False` is default), skip raws that have already been
1094 ingested (i.e. raws for which we already have a dataset with the
1095 same data ID in the target collection, even if from another file).
1096 Note that this is much slower than just not passing
1097 already-ingested files as inputs, because we still need to read and
1098 process metadata to identify which exposures to search for. It
1099 also will not work reliably if multiple processes are attempting to
1100 ingest raws from the same exposure concurrently, in that different
1101 processes may still attempt to ingest the same raw and conflict,
1102 causing a failure that prevents other raws from the same exposure
1103 from being ingested.
1104 update_exposure_records : `bool`, optional
1105 If `True` (`False` is default), update existing exposure records
1106 that conflict with the new ones instead of rejecting them. THIS IS
1107 AN ADVANCED OPTION THAT SHOULD ONLY BE USED TO FIX METADATA THAT IS
1108 KNOWN TO BE BAD. This should usually be combined with
1109 ``skip_existing_exposures=True``.
1110 track_file_attrs : `bool`, optional
1111 Control whether file attributes such as the size or checksum should
1112 be tracked by the datastore. Whether this parameter is honored
1113 depends on the specific datastore implementation.
1115 Returns
1116 -------
1117 refs : `list` of `lsst.daf.butler.DatasetRef`
1118 Dataset references for ingested raws.
1119 bad_files : `list` of `ResourcePath`
1120 Given paths that could not be ingested.
1121 n_exposures : `int`
1122 Number of exposures successfully ingested.
1123 n_exposures_failed : `int`
1124 Number of exposures that failed when inserting dimension data.
1125 n_ingests_failed : `int`
1126 Number of exposures that failed when ingesting raw datasets.
1127 """
1129 exposureData, bad_files = self.prep(files, pool=pool, processes=processes)
1131 # Up to this point, we haven't modified the data repository at all.
1132 # Now we finally do that, with one transaction per exposure. This is
1133 # not parallelized at present because the performance of this step is
1134 # limited by the database server. That may or may not change in the
1135 # future once we increase our usage of bulk inserts and reduce our
1136 # usage of savepoints; we've tried to get everything but the database
1137 # operations done in advance to reduce the time spent inside
1138 # transactions.
1139 self.butler.registry.registerDatasetType(self.datasetType)
1141 refs = []
1142 runs = set()
1143 n_exposures = 0
1144 n_exposures_failed = 0
1145 n_ingests_failed = 0
1146 for exposure in self.progress.wrap(exposureData, desc="Ingesting raw exposures"):
1147 assert exposure.record is not None, "Should be guaranteed by prep()"
1148 self.log.debug(
1149 "Attempting to ingest %d file%s from exposure %s:%s",
1150 *_log_msg_counter(exposure.files),
1151 exposure.record.instrument,
1152 exposure.record.obs_id,
1153 )
1155 try:
1156 for name, record in exposure.dependencyRecords.items():
1157 self.butler.registry.syncDimensionData(name, record, update=update_exposure_records)
1158 inserted_or_updated = self.butler.registry.syncDimensionData(
1159 "exposure",
1160 exposure.record,
1161 update=update_exposure_records,
1162 )
1163 except Exception as e:
1164 self._on_ingest_failure(exposure, e)
1165 n_exposures_failed += 1
1166 self.log.warning(
1167 "Exposure %s:%s could not be registered: %s",
1168 exposure.record.instrument,
1169 exposure.record.obs_id,
1170 e,
1171 )
1172 if self.config.failFast:
1173 raise e
1174 continue
1176 if isinstance(inserted_or_updated, dict):
1177 # Exposure is in the registry and we updated it, so
1178 # syncDimensionData returned a dict.
1179 self.log.info(
1180 "Exposure %s:%s was already present, but columns %s were updated.",
1181 exposure.record.instrument,
1182 exposure.record.obs_id,
1183 str(list(inserted_or_updated.keys())),
1184 )
1186 # Override default run if nothing specified explicitly.
1187 if run is None:
1188 instrument = exposure.files[0].instrument
1189 assert (
1190 instrument is not None
1191 ), "file should have been removed from this list by prep if instrument could not be found"
1192 this_run = instrument.makeDefaultRawIngestRunName()
1193 else:
1194 this_run = run
1195 if this_run not in runs:
1196 self.butler.registry.registerCollection(this_run, type=CollectionType.RUN)
1197 runs.add(this_run)
1198 try:
1199 datasets_for_exposure = self.ingestExposureDatasets(
1200 exposure,
1201 run=this_run,
1202 skip_existing_exposures=skip_existing_exposures,
1203 track_file_attrs=track_file_attrs,
1204 )
1205 except Exception as e:
1206 self._on_ingest_failure(exposure, e)
1207 n_ingests_failed += 1
1208 self.log.warning("Failed to ingest the following for reason: %s", e)
1209 for f in exposure.files:
1210 self.log.warning("- %s", f.filename)
1211 if self.config.failFast:
1212 raise e
1213 continue
1214 else:
1215 self._on_success(datasets_for_exposure)
1216 for dataset in datasets_for_exposure:
1217 refs.extend(dataset.refs)
1219 # Success for this exposure.
1220 n_exposures += 1
1221 self.log.info(
1222 "Exposure %s:%s ingested successfully", exposure.record.instrument, exposure.record.obs_id
1223 )
1225 return refs, bad_files, n_exposures, n_exposures_failed, n_ingests_failed
1227 @timeMethod
1228 def run(
1229 self,
1230 files: Iterable[ResourcePathExpression],
1231 *,
1232 pool: Optional[PoolType] = None,
1233 processes: int = 1,
1234 run: Optional[str] = None,
1235 file_filter: Union[str, re.Pattern] = r"\.fit[s]?\b",
1236 group_files: bool = True,
1237 skip_existing_exposures: bool = False,
1238 update_exposure_records: bool = False,
1239 track_file_attrs: bool = True,
1240 ) -> List[DatasetRef]:
1241 """Ingest files into a Butler data repository.
1243 This creates any new exposure or visit Dimension entries needed to
1244 identify the ingested files, creates new Dataset entries in the
1245 Registry and finally ingests the files themselves into the Datastore.
1246 Any needed instrument, detector, and physical_filter Dimension entries
1247 must exist in the Registry before `run` is called.
1249 Parameters
1250 ----------
1251 files : iterable of `lsst.resources.ResourcePath`, `str` or path-like
1252 Paths to the files to be ingested. Can refer to directories.
1253 Will be made absolute if they are not already.
1254 pool : `multiprocessing.Pool`, optional
1255 If not `None`, a process pool with which to parallelize some
1256 operations.
1257 processes : `int`, optional
1258 The number of processes to use. Ignored if ``pool`` is not `None`.
1259 run : `str`, optional
1260 Name of a RUN-type collection to write to, overriding
1261 the default derived from the instrument name.
1262 file_filter : `str` or `re.Pattern`, optional
1263 Pattern to use to discover files to ingest within directories.
1264 The default is to search for FITS files. The regex applies to
1265 files within the directory.
1266 group_files : `bool`, optional
1267 Group files by directory if they have been discovered in
1268 directories. Will not affect files explicitly provided.
1269 skip_existing_exposures : `bool`, optional
1270 If `True` (`False` is default), skip raws that have already been
1271 ingested (i.e. raws for which we already have a dataset with the
1272 same data ID in the target collection, even if from another file).
1273 Note that this is much slower than just not passing
1274 already-ingested files as inputs, because we still need to read and
1275 process metadata to identify which exposures to search for. It
1276 also will not work reliably if multiple processes are attempting to
1277 ingest raws from the same exposure concurrently, in that different
1278 processes may still attempt to ingest the same raw and conflict,
1279 causing a failure that prevents other raws from the same exposure
1280 from being ingested.
1281 update_exposure_records : `bool`, optional
1282 If `True` (`False` is default), update existing exposure records
1283 that conflict with the new ones instead of rejecting them. THIS IS
1284 AN ADVANCED OPTION THAT SHOULD ONLY BE USED TO FIX METADATA THAT IS
1285 KNOWN TO BE BAD. This should usually be combined with
1286 ``skip_existing_exposures=True``.
1287 track_file_attrs : `bool`, optional
1288 Control whether file attributes such as the size or checksum should
1289 be tracked by the datastore. Whether this parameter is honored
1290 depends on the specific datastore implementation.
1292 Returns
1293 -------
1294 refs : `list` of `lsst.daf.butler.DatasetRef`
1295 Dataset references for ingested raws.
1297 Notes
1298 -----
1299 This method inserts all datasets for an exposure within a transaction,
1300 guaranteeing that partial exposures are never ingested. The exposure
1301 dimension record is inserted with `Registry.syncDimensionData` first
1302 (in its own transaction), which inserts only if a record with the same
1303 primary key does not already exist. This allows different files within
1304 the same exposure to be ingested in different runs.
1305 """
1307 refs = []
1308 bad_files = []
1309 n_exposures = 0
1310 n_exposures_failed = 0
1311 n_ingests_failed = 0
1312 if group_files:
1313 for group in ResourcePath.findFileResources(files, file_filter, group_files):
1314 new_refs, bad, n_exp, n_exp_fail, n_ingest_fail = self.ingestFiles(
1315 group,
1316 pool=pool,
1317 processes=processes,
1318 run=run,
1319 skip_existing_exposures=skip_existing_exposures,
1320 update_exposure_records=update_exposure_records,
1321 track_file_attrs=track_file_attrs,
1322 )
1323 refs.extend(new_refs)
1324 bad_files.extend(bad)
1325 n_exposures += n_exp
1326 n_exposures_failed += n_exp_fail
1327 n_ingests_failed += n_ingest_fail
1328 else:
1329 refs, bad_files, n_exposures, n_exposures_failed, n_ingests_failed = self.ingestFiles(
1330 ResourcePath.findFileResources(files, file_filter, group_files),
1331 pool=pool,
1332 processes=processes,
1333 run=run,
1334 skip_existing_exposures=skip_existing_exposures,
1335 update_exposure_records=update_exposure_records,
track_file_attrs=track_file_attrs,
1336 )
1338 had_failure = False
1340 if bad_files:
1341 had_failure = True
1342 self.log.warning("Could not extract observation metadata from the following:")
1343 for f in bad_files:
1344 self.log.warning("- %s", f)
1346 self.log.info(
1347 "Successfully processed data from %d exposure%s with %d failure%s from exposure"
1348 " registration and %d failure%s from file ingest.",
1349 *_log_msg_counter(n_exposures),
1350 *_log_msg_counter(n_exposures_failed),
1351 *_log_msg_counter(n_ingests_failed),
1352 )
1353 if n_exposures_failed > 0 or n_ingests_failed > 0:
1354 had_failure = True
1355 self.log.info("Ingested %d distinct Butler dataset%s", *_log_msg_counter(refs))
1357 if had_failure:
1358 raise RuntimeError("Some failures encountered during ingestion")
1360 return refs
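# Editor's note: an end-to-end usage sketch added by the editor; it is not part
# of the original module. The repository path, raw-data directory, and run
# collection name are hypothetical, and a writeable repository with the
# relevant instrument already registered is assumed.
def _example_ingest_run() -> None:
    """Ingest a directory of raw FITS files into a hypothetical repository."""
    butler = Butler("/repo/example", writeable=True)

    config = RawIngestConfig()
    config.transfer = "symlink"

    task = RawIngestTask(config=config, butler=butler)
    # Directories are searched for FITS files using the default file_filter;
    # a RuntimeError is raised if any file fails to ingest.
    refs = task.run(["/data/raws/2022-01-01"], run="ExampleCam/raw/all")
    print(f"Ingested {len(refs)} dataset{'' if len(refs) == 1 else 's'}")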