1# This file is part of obs_base.
2#
3# Developed for the LSST Data Management System.
4# This product includes software developed by the LSST Project
5# (https://www.lsst.org).
6# See the COPYRIGHT file at the top-level directory of this distribution
7# for details of code ownership.
8#
9# This program is free software: you can redistribute it and/or modify
10# it under the terms of the GNU General Public License as published by
11# the Free Software Foundation, either version 3 of the License, or
12# (at your option) any later version.
13#
14# This program is distributed in the hope that it will be useful,
15# but WITHOUT ANY WARRANTY; without even the implied warranty of
16# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
17# GNU General Public License for more details.
18#
19# You should have received a copy of the GNU General Public License
20# along with this program. If not, see <http://www.gnu.org/licenses/>.
23__all__ = ("RawIngestTask", "RawIngestConfig", "makeTransferChoiceField")
25import json
26import re
27from collections import defaultdict
28from dataclasses import InitVar, dataclass
29from multiprocessing import Pool
30from typing import (
31 Any,
32 Callable,
33 ClassVar,
34 Dict,
35 Iterable,
36 Iterator,
37 List,
38 Mapping,
39 Optional,
40 Set,
41 Sized,
42 Tuple,
43 Type,
44 Union,
45)
47from astro_metadata_translator import MetadataTranslator, ObservationInfo, merge_headers
48from astro_metadata_translator.indexing import process_index_data, process_sidecar_data
49from lsst.afw.fits import readMetadata
50from lsst.daf.butler import (
51 Butler,
52 CollectionType,
53 DataCoordinate,
54 DatasetIdGenEnum,
55 DatasetRef,
56 DatasetType,
57 DimensionRecord,
58 DimensionUniverse,
59 FileDataset,
60 Formatter,
61 Progress,
62)
63from lsst.pex.config import ChoiceField, Config, Field
64from lsst.pipe.base import Instrument, Task
65from lsst.resources import ResourcePath, ResourcePathExpression
66from lsst.utils.timer import timeMethod
68from ._instrument import makeExposureRecordFromObsInfo
70# multiprocessing.Pool is actually a function, not a type, and the real type
71# isn't exposed, so we can't use it in annotations; we'll just punt on it via
72# this alias instead.
73PoolType = Any
76def _do_nothing(*args: Any, **kwargs: Any) -> None:
77 """Do nothing.
79 This is a function that accepts anything and does nothing.
80 For use as a default in callback arguments.
81 """
82 pass
85def _log_msg_counter(noun: Union[int, Sized]) -> Tuple[int, str]:
86 """Count the iterable and return the count and plural modifier.
88 Parameters
89 ----------
90 noun : `Sized` or `int`
91 Thing to count. If given an integer it is assumed to be the count
92 to use to calculate the modifier.
94 Returns
95 -------
96 num : `int`
97 Number of items found in ``noun``.
98 modifier : `str`
99 Character to add to the end of a string referring to these items
100 to indicate whether it was a single item or not. Returns empty
101 string if there is one item or "s" otherwise.
103 Examples
104 --------
106 .. code-block:: python
108 log.warning("Found %d file%s", *_log_msg_counter(nfiles))
109 """
110 if isinstance(noun, int):
111 num = noun
112 else:
113 num = len(noun)
114 return num, "" if num == 1 else "s"
117@dataclass
118class RawFileDatasetInfo:
119 """Information about a single dataset within a raw file."""
121 dataId: DataCoordinate
122 """Data ID for this file (`lsst.daf.butler.DataCoordinate`)."""
124 obsInfo: ObservationInfo
125 """Standardized observation metadata extracted directly from the file
126 headers (`astro_metadata_translator.ObservationInfo`).
127 """
130@dataclass
131class RawFileData:
132 """Information about a single raw file, used during ingest."""
134 datasets: List[RawFileDatasetInfo]
135 """The information describing each dataset within this raw file.
136 (`list` of `RawFileDatasetInfo`)
137 """
139 filename: ResourcePath
140 """URI of the file this information was extracted from (`str`).
142 This is the path prior to ingest, not the path after ingest.
143 """
145 FormatterClass: Type[Formatter]
146 """Formatter class that should be used to ingest this file (`type`; as
147 subclass of `Formatter`).
148 """
150 instrument: Optional[Instrument]
151 """The `Instrument` instance associated with this file. Can be `None`
152 if ``datasets`` is an empty list."""
155@dataclass
156class RawExposureData:
157 """Information about a complete raw exposure, used during ingest."""
159 dataId: DataCoordinate
160 """Data ID for this exposure (`lsst.daf.butler.DataCoordinate`).
161 """
163 files: List[RawFileData]
164 """List of structures containing file-level information.
165 """
167 universe: InitVar[DimensionUniverse]
168 """Set of all known dimensions.
169 """
171 record: Optional[DimensionRecord] = None
172 """The exposure `DimensionRecord` that must be inserted into the
173 `~lsst.daf.butler.Registry` prior to file-level ingest (`DimensionRecord`).
174 """
176 def __post_init__(self, universe: DimensionUniverse) -> None:
177 # We don't care which file or dataset we read metadata from, because
178 # we're assuming they'll all be the same; just use the first ones.
179 self.record = makeExposureRecordFromObsInfo(self.files[0].datasets[0].obsInfo, universe)
182def makeTransferChoiceField(
183 doc: str = "How to transfer files (None for no transfer).", default: str = "auto"
184) -> ChoiceField:
185 """Create a Config field with options for transferring data between repos.
187 The allowed options for the field are exactly those supported by
188 `lsst.daf.butler.Datastore.ingest`.
190 Parameters
191 ----------
192 doc : `str`
193 Documentation for the configuration field.
194 default : `str`, optional
195 Default transfer mode for the field.
197 Returns
198 -------
199 field : `lsst.pex.config.ChoiceField`
200 Configuration field.
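Examples
--------
A minimal sketch of declaring the field on a config class; the class name
``MyIngestConfig`` is hypothetical.

.. code-block:: python

    from lsst.pex.config import Config

    from lsst.obs.base.ingest import makeTransferChoiceField


    class MyIngestConfig(Config):
        """Hypothetical config using the transfer choice field."""

        transfer = makeTransferChoiceField(default="symlink")


    config = MyIngestConfig()
    config.transfer = "copy"  # any key of ``allowed``, or `None` for no transfer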
201 """
202 return ChoiceField(
203 doc=doc,
204 dtype=str,
205 allowed={
206 "move": "move",
207 "copy": "copy",
208 "auto": "choice will depend on datastore",
209 "direct": "use URI to ingested file directly in datastore",
210 "link": "hard link falling back to symbolic link",
211 "hardlink": "hard link",
212 "symlink": "symbolic (soft) link",
213 "relsymlink": "relative symbolic link",
214 },
215 optional=True,
216 default=default,
217 )
220class RawIngestConfig(Config):
221 """Configuration class for RawIngestTask."""
223 transfer = makeTransferChoiceField()
224 failFast = Field(
225 dtype=bool,
226 default=False,
227 doc="If True, stop ingest as soon as any problem is encountered with any file. "
228 "Otherwise problem files will be skipped and logged and a report issued at completion.",
229 )
232class RawIngestTask(Task):
233 """Driver Task for ingesting raw data into Gen3 Butler repositories.
235 Parameters
236 ----------
237 config : `RawIngestConfig`
238 Configuration for the task.
239 butler : `~lsst.daf.butler.Butler`
240 Writeable butler instance, with ``butler.run`` set to the appropriate
241 `~lsst.daf.butler.CollectionType.RUN` collection for these raw
242 datasets.
243 on_success : `Callable`, optional
244 A callback invoked when all of the raws associated with an exposure
245 are ingested. Will be passed a list of `FileDataset` objects, each
246 containing one or more resolved `DatasetRef` objects. If this callback
247 raises it will interrupt the entire ingest process, even if
248 `RawIngestConfig.failFast` is `False`.
249 on_metadata_failure : `Callable`, optional
250 A callback invoked when a failure occurs trying to translate the
251 metadata for a file. Will be passed the URI and the exception, in
252 that order, as positional arguments. Guaranteed to be called in an
253 ``except`` block, allowing the callback to re-raise or replace (with
254 ``raise ... from``) to override the task's usual error handling (before
255 `RawIngestConfig.failFast` logic occurs).
256 on_ingest_failure : `Callable`, optional
257 A callback invoked when dimension record or dataset insertion into the
258 database fails for an exposure. Will be passed a `RawExposureData`
259 instance and the exception, in that order, as positional arguments.
260 Guaranteed to be called in an ``except`` block, allowing the callback
261 to re-raise or replace (with ``raise ... from``) to override the task's
262 usual error handling (before `RawIngestConfig.failFast` logic occurs).
263 **kwargs
264 Additional keyword arguments are forwarded to the `lsst.pipe.base.Task`
265 constructor.
267 Notes
268 -----
269 Each instance of `RawIngestTask` writes to the same Butler. Each
270 invocation of `RawIngestTask.run` ingests a list of files.
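Examples
--------
A minimal sketch of constructing and running the task, assuming a writeable
repository already exists at the (hypothetical) path ``/path/to/repo`` and
that raw files live under the (hypothetical) directory ``/data/raws``.

.. code-block:: python

    from lsst.daf.butler import Butler
    from lsst.obs.base import RawIngestConfig, RawIngestTask

    butler = Butler("/path/to/repo", writeable=True)
    config = RawIngestConfig()
    config.transfer = "symlink"
    task = RawIngestTask(config=config, butler=butler)
    refs = task.run(["/data/raws/"])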
271 """
273 ConfigClass: ClassVar[Type[Config]] = RawIngestConfig
275 _DefaultName: ClassVar[str] = "ingest"
277 def getDatasetType(self) -> DatasetType:
278 """Return the DatasetType of the datasets ingested by this Task."""
279 return DatasetType(
280 "raw",
281 ("instrument", "detector", "exposure"),
282 "Exposure",
283 universe=self.butler.registry.dimensions,
284 )
286 def __init__(
287 self,
288 config: RawIngestConfig,
289 *,
290 butler: Butler,
291 on_success: Callable[[List[FileDataset]], Any] = _do_nothing,
292 on_metadata_failure: Callable[[ResourcePath, Exception], Any] = _do_nothing,
293 on_ingest_failure: Callable[[RawExposureData, Exception], Any] = _do_nothing,
294 **kwargs: Any,
295 ):
296 config.validate() # Not a CmdlineTask nor PipelineTask, so have to validate the config here.
297 super().__init__(config, **kwargs)
298 self.butler = butler
299 self.universe = self.butler.registry.dimensions
300 self.datasetType = self.getDatasetType()
301 self._on_success = on_success
302 self._on_metadata_failure = on_metadata_failure
303 self._on_ingest_failure = on_ingest_failure
304 self.progress = Progress("obs.base.RawIngestTask")
306 # Import all the instrument classes so that we ensure that we
307 # have all the relevant metadata translators loaded.
308 Instrument.importAll(self.butler.registry)
310 def _reduce_kwargs(self) -> Dict[str, Any]:
311 # Add extra parameters to pickle.
312 return dict(
313 **super()._reduce_kwargs(),
314 butler=self.butler,
315 on_success=self._on_success,
316 on_metadata_failure=self._on_metadata_failure,
317 on_ingest_failure=self._on_ingest_failure,
318 )
320 def _determine_instrument_formatter(
321 self, dataId: DataCoordinate, filename: ResourcePath
322 ) -> Tuple[Optional[Instrument], Type[Formatter]]:
323 """Determine the instrument and formatter class.
325 Parameters
326 ----------
327 dataId : `lsst.daf.butler.DataCoordinate`
328 The dataId associated with this dataset.
329 filename : `lsst.resources.ResourcePath`
330 URI of file used for error reporting.
332 Returns
333 -------
334 instrument : `Instrument` or `None`
335 Instance of the `Instrument` associated with this dataset. `None`
336 indicates that the instrument could not be determined.
337 formatterClass : `type`
338 Class to be used as the formatter for this dataset.
339 """
340 # The data model currently assumes that whilst multiple datasets
341 # can be associated with a single file, they must all share the
342 # same formatter.
343 try:
344 instrument = Instrument.fromName(dataId["instrument"], self.butler.registry) # type: ignore
345 except LookupError as e:
346 self._on_metadata_failure(filename, e)
347 self.log.warning(
348 "Instrument %s for file %s not known to registry", dataId["instrument"], filename
349 )
350 if self.config.failFast:
351 raise RuntimeError(
352 f"Instrument {dataId['instrument']} for file {filename} not known to registry"
353 ) from e
354 FormatterClass = Formatter
355 # Indicate that we could not work out the instrument.
356 instrument = None
357 else:
358 assert instrument is not None, "Should be guaranteed by fromName succeeding."
359 FormatterClass = instrument.getRawFormatter(dataId)
360 return instrument, FormatterClass
362 def extractMetadata(self, filename: ResourcePath) -> RawFileData:
363 """Extract and process metadata from a single raw file.
365 Parameters
366 ----------
367 filename : `lsst.resources.ResourcePath`
368 URI to the file.
370 Returns
371 -------
372 data : `RawFileData`
373 A structure containing the metadata extracted from the file,
374 as well as the original filename. All fields will be populated,
375 but the `RawFileData.dataId` attribute will be a minimal
376 (unexpanded) `~lsst.daf.butler.DataCoordinate` instance. The
377 ``instrument`` field will be `None` if there is a problem
378 with metadata extraction.
380 Notes
381 -----
382 Assumes that there is a single dataset associated with the given
383 file. Instruments using a single file to store multiple datasets
384 must implement their own version of this method.
386 By default the method will catch all exceptions unless the ``failFast``
387 configuration item is `True`. If an error is encountered the
388 `_on_metadata_failure()` callback will be called. If the error is not
389 re-raised (by the callback or by ``failFast``), the returned object will
390 have ``instrument`` set to `None` and an empty ``datasets`` list.
392 This method supports sidecar JSON files which can be used to
393 extract metadata without having to read the data file itself.
394 The sidecar file is always used if found.
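Examples
--------
A sketch of extracting metadata for a single (hypothetical) file, assuming
``task`` is a constructed `RawIngestTask`.

.. code-block:: python

    from lsst.resources import ResourcePath

    file_data = task.extractMetadata(ResourcePath("file:///data/raws/exp_0001.fits"))
    if file_data.datasets:
        print(file_data.datasets[0].dataId)
    else:
        print("Metadata extraction failed for", file_data.filename)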
395 """
396 sidecar_fail_msg = "" # Requires prepended space when set.
397 try:
398 sidecar_file = filename.updatedExtension(".json")
399 if sidecar_file.exists():
400 content = json.loads(sidecar_file.read())
401 headers = [process_sidecar_data(content)]
402 sidecar_fail_msg = " (via sidecar)"
403 else:
404 # Read the metadata from the data file itself.
406 # For remote files download the entire file to get the
407 # header. This is very inefficient and it would be better
408 # to have some way of knowing where in the file the headers
409 # are and to only download those parts of the file.
410 with filename.as_local() as local_file:
411 # Read the primary. This might be sufficient.
412 header = readMetadata(local_file.ospath, 0)
414 try:
415 # Try to work out a translator class early.
416 translator_class = MetadataTranslator.determine_translator(header, filename=filename)
417 except ValueError:
418 # Primary header was not sufficient (maybe this file
419 # has been compressed or is a MEF with minimal
420 # primary). Read second header and merge with primary.
421 header = merge_headers([header, readMetadata(local_file.ospath, 1)], mode="overwrite")
423 # Try again to work out a translator class, letting this
424 # fail.
425 translator_class = MetadataTranslator.determine_translator(header, filename=filename)
427 # Request the headers to use for ingest
428 headers = translator_class.determine_translatable_headers(filename.ospath, header)
430 # Add each header to the dataset list
431 datasets = [self._calculate_dataset_info(h, filename) for h in headers]
433 except Exception as e:
434 self.log.debug("Problem extracting metadata from %s%s: %s", filename, sidecar_fail_msg, e)
435 # Indicate to the caller that we failed to read.
436 datasets = []
437 formatterClass = Formatter
438 instrument = None
439 self._on_metadata_failure(filename, e)
440 if self.config.failFast:
441 raise RuntimeError(
442 f"Problem extracting metadata for file {filename}{sidecar_fail_msg}"
443 ) from e
444 else:
445 self.log.debug("Extracted metadata for file %s%s", filename, sidecar_fail_msg)
446 # The data model currently assumes that whilst multiple datasets
447 # can be associated with a single file, they must all share the
448 # same formatter.
449 instrument, formatterClass = self._determine_instrument_formatter(datasets[0].dataId, filename)
450 if instrument is None:
451 datasets = []
453 return RawFileData(
454 datasets=datasets,
455 filename=filename,
456 # MyPy wants this to be a non-abstract class, which is not true
457 # for the error case where instrument is None and datasets=[].
458 FormatterClass=formatterClass, # type: ignore
459 instrument=instrument,
460 )
462 def _calculate_dataset_info(
463 self, header: Union[Mapping[str, Any], ObservationInfo], filename: ResourcePath
464 ) -> RawFileDatasetInfo:
465 """Calculate a RawFileDatasetInfo from the supplied information.
467 Parameters
468 ----------
469 header : Mapping or `astro_metadata_translator.ObservationInfo`
470 Header from the dataset or previously-translated content.
471 filename : `lsst.resources.ResourcePath`
472 Filename to use for error messages.
474 Returns
475 -------
476 dataset : `RawFileDatasetInfo`
477 The dataId, and observation information associated with this
478 dataset.
479 """
480 # To ensure we aren't slowed down for no reason, explicitly
481 # list here the properties we need for the schema.
482 # Use a dict with boolean values, where True indicates that
483 # the property is required to be calculated.
484 ingest_subset = {
485 "altaz_begin": False,
486 "boresight_rotation_coord": False,
487 "boresight_rotation_angle": False,
488 "dark_time": False,
489 "datetime_begin": True,
490 "datetime_end": True,
491 "detector_num": True,
492 "exposure_group": False,
493 "exposure_id": True,
494 "exposure_time": True,
495 "instrument": True,
496 "tracking_radec": False,
497 "object": False,
498 "observation_counter": False,
499 "observation_id": True,
500 "observation_reason": False,
501 "observation_type": True,
502 "observing_day": False,
503 "physical_filter": True,
504 "science_program": False,
505 "visit_id": False,
506 }
508 if isinstance(header, ObservationInfo):
509 obsInfo = header
510 missing = []
511 # Need to check the required properties are present.
512 for property, required in ingest_subset.items():
513 if not required:
514 continue
515 # getattr does not need to be protected because it is using
516 # the defined list above containing properties that must exist.
517 value = getattr(obsInfo, property)
518 if value is None:
519 missing.append(property)
520 if missing:
521 raise ValueError(
522 f"Requested required properties are missing from file {filename}:"
523 f" {missing} (via JSON)"
524 )
526 else:
527 obsInfo = ObservationInfo(
528 header,
529 pedantic=False,
530 filename=str(filename),
531 required={k for k in ingest_subset if ingest_subset[k]},
532 subset=set(ingest_subset),
533 )
535 dataId = DataCoordinate.standardize(
536 instrument=obsInfo.instrument,
537 exposure=obsInfo.exposure_id,
538 detector=obsInfo.detector_num,
539 universe=self.universe,
540 )
541 return RawFileDatasetInfo(obsInfo=obsInfo, dataId=dataId)
543 def locateAndReadIndexFiles(
544 self, files: Iterable[ResourcePath]
545 ) -> Tuple[Dict[ResourcePath, Any], List[ResourcePath], Set[ResourcePath], Set[ResourcePath]]:
546 """Given a list of files, look for index files and read them.
548 Index files can either be explicitly in the list of files to
549 ingest, or else located in the same directory as a file to ingest.
550 Index entries are always used if present.
552 Parameters
553 ----------
554 files : iterable over `lsst.resources.ResourcePath`
555 URIs to the files to be ingested.
557 Returns
558 -------
559 index : `dict` [`ResourcePath`, Any]
560 Merged contents of all relevant index files found. These can
561 be explicitly specified index files or ones found in the
562 directory alongside a data file to be ingested.
563 updated_files : `list` of `ResourcePath`
564 Updated list of the input files with entries removed that were
565 found listed in an index file. Order is not guaranteed to
566 match the order of the files given to this routine.
567 good_index_files : `set` [`ResourcePath`]
568 Index files that were successfully read.
569 bad_index_files : `set` [`ResourcePath`]
570 Files that looked like index files but failed to read properly.
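Examples
--------
A sketch of reading index information ahead of per-file metadata extraction,
assuming ``task`` is a constructed `RawIngestTask` and hypothetical paths.

.. code-block:: python

    from lsst.resources import ResourcePath

    files = [
        ResourcePath("file:///data/raws/exp_0001.fits"),
        ResourcePath("file:///data/raws/_index.json"),
    ]
    index, remaining, good, bad = task.locateAndReadIndexFiles(files)
    for uri in index:
        print("Found index entry for", uri)
    print("Files still requiring header reads:", remaining)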
571 """
572 # Convert the paths to absolute for easy comparison with index content.
573 # Do not convert to real paths since we have to assume that index
574 # files are in this location and not the location to which they link.
575 files = tuple(f.abspath() for f in files)
577 # Index files must be named this.
578 index_root_file = "_index.json"
580 # Group the files by directory.
581 files_by_directory = defaultdict(set)
583 for path in files:
584 directory, file_in_dir = path.split()
585 files_by_directory[directory].add(file_in_dir)
587 # All the metadata read from index files with keys of full path.
588 index_entries: Dict[ResourcePath, Any] = {}
590 # Index files we failed to read.
591 bad_index_files = set()
593 # Any good index files that were found and used.
594 good_index_files = set()
596 # Look for index files in those directories.
597 for directory, files_in_directory in files_by_directory.items():
598 possible_index_file = directory.join(index_root_file)
599 if possible_index_file.exists():
600 # If we are explicitly requesting an index file the
601 # messages should be different.
602 index_msg = "inferred"
603 is_implied = True
604 if index_root_file in files_in_directory:
605 index_msg = "explicit"
606 is_implied = False
608 # Try to read the index file and catch and report any
609 # problems.
610 try:
611 content = json.loads(possible_index_file.read())
612 index = process_index_data(content, force_dict=True)
613 except Exception as e:
614 # Only trigger the callback if the index file
615 # was asked for explicitly. Triggering on implied file
616 # might be surprising.
617 if not is_implied:
618 self._on_metadata_failure(possible_index_file, e)
619 if self.config.failFast:
620 raise RuntimeError(
621 f"Problem reading index file from {index_msg} location {possible_index_file}"
622 ) from e
623 bad_index_files.add(possible_index_file)
624 continue
626 self.log.debug("Extracted index metadata from %s file %s", index_msg, possible_index_file)
627 good_index_files.add(possible_index_file)
629 # Go through the index adding entries for files.
630 # If we have non-index files in this directory marked for
631 # ingest we should only get index information for those.
632 # If the index file was explicit we use all entries.
633 if is_implied:
634 files_to_ingest = files_in_directory
635 else:
636 files_to_ingest = set(index)
638 # Copy relevant metadata into a single dict for all index
639 # entries.
640 for file_in_dir in files_to_ingest:
641 # Skip an explicitly specified index file.
642 # This should never happen because an explicit index
643 # file will force ingest of all files in the index
644 # and not use the explicit file list. If somehow
645 # this is not true we continue. Raising an exception
646 # seems like the wrong thing to do since this is harmless.
647 if file_in_dir == index_root_file:
648 self.log.info(
649 "Logic error found scanning directory %s. Please file ticket.", directory
650 )
651 continue
652 if file_in_dir in index:
653 file = directory.join(file_in_dir)
654 if file in index_entries:
655 # ObservationInfo overrides raw metadata
656 if isinstance(index[file_in_dir], ObservationInfo) and not isinstance(
657 index_entries[file], ObservationInfo
658 ):
659 self.log.warning(
660 "File %s already specified in an index file but overriding"
661 " with ObservationInfo content from %s",
662 file,
663 possible_index_file,
664 )
665 else:
666 self.log.warning(
667 "File %s already specified in an index file, ignoring content from %s",
668 file,
669 possible_index_file,
670 )
671 # Do nothing in this case
672 continue
674 index_entries[file] = index[file_in_dir]
676 # Remove files from list that have index entries and also
677 # any files that we determined to be explicit index files
678 # or any index files that we failed to read.
679 filtered = set(files) - set(index_entries) - good_index_files - bad_index_files
681 # The filtered list loses the initial order. Retaining the order
682 # is good for testing but does have a cost if there are many
683 # files when copying the good values out. A dict would have faster
684 # lookups (using the files as keys) but use more memory.
685 ordered = [f for f in filtered if f in files]
687 return index_entries, ordered, good_index_files, bad_index_files
689 def processIndexEntries(self, index_entries: Dict[ResourcePath, Any]) -> List[RawFileData]:
690 """Convert index entries to RawFileData.
692 Parameters
693 ----------
694 index_entries : `dict` [`ResourcePath`, Any]
695 Dict indexed by name of file to ingest and with values either
696 raw metadata or translated
697 `~astro_metadata_translator.ObservationInfo`.
699 Returns
700 -------
701 data : `list` [ `RawFileData` ]
702 Structures containing the metadata extracted from the file,
703 as well as the original filename. All fields will be populated,
704 but the `RawFileData.dataId` attributes will be minimal
705 (unexpanded) `~lsst.daf.butler.DataCoordinate` instances.
706 """
707 fileData = []
708 for filename, metadata in index_entries.items():
709 try:
710 datasets = [self._calculate_dataset_info(metadata, filename)]
711 except Exception as e:
712 self.log.debug("Problem extracting metadata for file %s found in index file: %s", filename, e)
713 datasets = []
714 formatterClass = Formatter
715 instrument = None
716 self._on_metadata_failure(filename, e)
717 if self.config.failFast:
718 raise RuntimeError(
719 f"Problem extracting metadata for file {filename} found in index file"
720 ) from e
721 else:
722 instrument, formatterClass = self._determine_instrument_formatter(
723 datasets[0].dataId, filename
724 )
725 if instrument is None:
726 datasets = []
727 fileData.append(
728 RawFileData(
729 datasets=datasets,
730 filename=filename,
731 # MyPy wants this to be a non-abstract class, which is not
732 # true for the error case where instrument is None and
733 # datasets=[].
734 FormatterClass=formatterClass, # type: ignore
735 instrument=instrument,
736 )
737 )
738 return fileData
740 def groupByExposure(self, files: Iterable[RawFileData]) -> List[RawExposureData]:
741 """Group an iterable of `RawFileData` by exposure.
743 Parameters
744 ----------
745 files : iterable of `RawFileData`
746 File-level information to group.
748 Returns
749 -------
750 exposures : `list` of `RawExposureData`
751 A list of structures that group the file-level information by
752 exposure. All fields will be populated. The
753 `RawExposureData.dataId` attributes will be minimal (unexpanded)
754 `~lsst.daf.butler.DataCoordinate` instances.
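Examples
--------
A sketch of grouping previously-extracted metadata, assuming ``task`` is a
constructed `RawIngestTask` and ``uris`` is a hypothetical list of
`~lsst.resources.ResourcePath` objects.

.. code-block:: python

    file_data = [task.extractMetadata(uri) for uri in uris]
    exposures = task.groupByExposure(f for f in file_data if f.datasets)
    for exposure in exposures:
        print(exposure.dataId, len(exposure.files), "file(s)")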
755 """
756 exposureDimensions = self.universe["exposure"].graph
757 byExposure = defaultdict(list)
758 for f in files:
759 # Assume that the first dataset is representative for the file.
760 byExposure[f.datasets[0].dataId.subset(exposureDimensions)].append(f)
762 return [
763 RawExposureData(dataId=dataId, files=exposureFiles, universe=self.universe)
764 for dataId, exposureFiles in byExposure.items()
765 ]
767 def expandDataIds(self, data: RawExposureData) -> RawExposureData:
768 """Expand the data IDs associated with a raw exposure.
770 This adds the metadata records.
772 Parameters
773 ----------
774 data : `RawExposureData`
775 A structure containing information about the exposure to be
776 ingested. Must have `RawExposureData.record` populated. Should
777 be considered consumed upon return.
779 Returns
780 -------
781 exposure : `RawExposureData`
782 An updated version of the input structure, with
783 `RawExposureData.dataId` and nested `RawFileData.dataId` attributes
784 updated to data IDs for which
785 `~lsst.daf.butler.DataCoordinate.hasRecords` returns `True`.
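Examples
--------
A sketch of expanding grouped exposures one at a time, assuming ``task`` is a
constructed `RawIngestTask` and ``good_file_data`` is a hypothetical list of
`RawFileData` with non-empty ``datasets``.

.. code-block:: python

    for exposure in task.groupByExposure(good_file_data):
        exposure = task.expandDataIds(exposure)
        assert exposure.dataId.hasRecords()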
786 """
787 # We start by expanding the exposure-level data ID; we won't use that
788 # directly in file ingest, but this lets us do some database lookups
789 # once per exposure instead of once per file later.
790 data.dataId = self.butler.registry.expandDataId(
791 data.dataId,
792 # We pass in the records we'll be inserting shortly so they aren't
793 # looked up from the database. We do expect instrument and filter
794 # records to be retrieved from the database here (though the
795 # Registry may cache them so there isn't a lookup every time).
796 records={"exposure": data.record},
797 )
798 # Now we expand the per-file (exposure+detector) data IDs. This time
799 # we pass in the records we just retrieved from the exposure data ID
800 # expansion.
801 for file in data.files:
802 for dataset in file.datasets:
803 dataset.dataId = self.butler.registry.expandDataId(
804 dataset.dataId, records=data.dataId.records
805 )
806 return data
808 def prep(
809 self, files: Iterable[ResourcePath], *, pool: Optional[PoolType] = None, processes: int = 1
810 ) -> Tuple[Iterator[RawExposureData], List[ResourcePath]]:
811 """Perform all non-database-updating ingest preprocessing steps.
813 Parameters
814 ----------
815 files : iterable over `lsst.resources.ResourcePath`
816 URIs of the files to be ingested. Will be made absolute
817 if they are not already.
818 pool : `multiprocessing.Pool`, optional
819 If not `None`, a process pool with which to parallelize some
820 operations.
821 processes : `int`, optional
822 The number of processes to use. Ignored if ``pool`` is not `None`.
824 Returns
825 -------
826 exposures : `Iterator` [ `RawExposureData` ]
827 Data structures containing dimension records, filenames, and data
828 IDs to be ingested (one structure for each exposure).
829 bad_files : `list` of `lsst.resources.ResourcePath`
830 List of all the files from which metadata could not be extracted.
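Examples
--------
A sketch of running only the preprocessing step, assuming ``task`` is a
constructed `RawIngestTask` and a hypothetical file URI.

.. code-block:: python

    from lsst.resources import ResourcePath

    uris = [ResourcePath("file:///data/raws/exp_0001.fits")]
    exposures, bad_files = task.prep(uris, processes=1)
    for exposure in exposures:  # ``exposures`` is a lazy iterator
        print(exposure.dataId, "has records:", exposure.dataId.hasRecords())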
831 """
832 if pool is None and processes > 1:
833 pool = Pool(processes)
834 mapFunc = map if pool is None else pool.imap_unordered
836 def _partition_good_bad(
837 file_data: Iterable[RawFileData],
838 ) -> Tuple[List[RawFileData], List[ResourcePath]]:
839 """Filter out bad files and return good with list of bad."""
840 good_files = []
841 bad_files = []
842 for fileDatum in self.progress.wrap(file_data, desc="Reading image metadata"):
843 if not fileDatum.datasets:
844 bad_files.append(fileDatum.filename)
845 else:
846 good_files.append(fileDatum)
847 return good_files, bad_files
849 # Look for index files and read them.
850 # There should be far fewer index files than data files.
851 index_entries, files, good_index_files, bad_index_files = self.locateAndReadIndexFiles(files)
852 if bad_index_files:
853 self.log.info("Failed to read the following index files:")
854 for bad in sorted(bad_index_files):
855 self.log.info("- %s", bad)
857 # Now convert all the index file entries to standard form for ingest.
858 processed_bad_index_files: List[ResourcePath] = []
859 indexFileData = self.processIndexEntries(index_entries)
860 if indexFileData:
861 indexFileData, processed_bad_index_files = _partition_good_bad(indexFileData)
862 self.log.info(
863 "Successfully extracted metadata for %d file%s found in %d index file%s with %d failure%s",
864 *_log_msg_counter(indexFileData),
865 *_log_msg_counter(good_index_files),
866 *_log_msg_counter(processed_bad_index_files),
867 )
869 # Extract metadata and build per-detector regions.
870 # This could run in a subprocess so collect all output
871 # before looking at failures.
872 fileData: Iterator[RawFileData] = mapFunc(self.extractMetadata, files)
874 # Filter out all the failed reads and store them for later
875 # reporting.
876 good_file_data, bad_files = _partition_good_bad(fileData)
877 self.log.info(
878 "Successfully extracted metadata from %d file%s with %d failure%s",
879 *_log_msg_counter(good_file_data),
880 *_log_msg_counter(bad_files),
881 )
883 # Combine with data from index files.
884 good_file_data.extend(indexFileData)
885 bad_files.extend(processed_bad_index_files)
886 bad_files.extend(bad_index_files)
888 # Use that metadata to group files (and extracted metadata) by
889 # exposure. Never parallelized because it's intrinsically a gather
890 # step.
891 exposureData: List[RawExposureData] = self.groupByExposure(good_file_data)
893 # The next operation operates on RawExposureData instances (one at
894 # a time) in-place and then returns the modified instance. We call it
895 # as a pass-through instead of relying on the arguments we pass in to
896 # have been modified because in the parallel case those arguments are
897 # going to be pickled and unpickled, and I'm not certain
898 # multiprocessing is careful enough with that for output arguments to
899 # work.
901 # Expand the data IDs to include all dimension metadata; we need this
902 # because we may need to generate path templates that rely on that
903 # metadata.
904 # This is the first step that involves actual database calls (but just
905 # SELECTs), so if there's going to be a problem with connections vs.
906 # multiple processes, or lock contention (in SQLite) slowing things
907 # down, it'll happen here.
908 return mapFunc(self.expandDataIds, exposureData), bad_files
910 def ingestExposureDatasets(
911 self,
912 exposure: RawExposureData,
913 *,
914 run: Optional[str] = None,
915 skip_existing_exposures: bool = False,
916 track_file_attrs: bool = True,
917 ) -> List[FileDataset]:
918 """Ingest all raw files in one exposure.
920 Parameters
921 ----------
922 exposure : `RawExposureData`
923 A structure containing information about the exposure to be
924 ingested. Must have `RawExposureData.record` populated and all
925 data ID attributes expanded.
926 run : `str`, optional
927 Name of a RUN-type collection to write to, overriding
928 ``self.butler.run``.
929 skip_existing_exposures : `bool`, optional
930 If `True` (`False` is default), skip raws that have already been
931 ingested (i.e. raws for which we already have a dataset with the
932 same data ID in the target collection, even if from another file).
933 Note that this is much slower than just not passing
934 already-ingested files as inputs, because we still need to read and
935 process metadata to identify which exposures to search for. It
936 also will not work reliably if multiple processes are attempting to
937 ingest raws from the same exposure concurrently, in that different
938 processes may still attempt to ingest the same raw and conflict,
939 causing a failure that prevents other raws from the same exposure
940 from being ingested.
941 track_file_attrs : `bool`, optional
942 Control whether file attributes such as the size or checksum should
943 be tracked by the datastore. Whether this parameter is honored
944 depends on the specific datastore implementation.
946 Returns
947 -------
948 datasets : `list` of `lsst.daf.butler.FileDataset`
949 Per-file structures identifying the files ingested and their
950 dataset representation in the data repository.
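Examples
--------
A sketch of ingesting one exposure at a time, assuming ``task`` is a
constructed `RawIngestTask`, that the exposure dimension record and the RUN
collection (here the hypothetical ``"MyCam/raw/all"``) have already been
registered (as `ingestFiles` does), and that ``exposure`` came from `prep`.

.. code-block:: python

    datasets = task.ingestExposureDatasets(exposure, run="MyCam/raw/all")
    for dataset in datasets:
        print("Ingested", dataset.path, "as", [ref.dataId for ref in dataset.refs])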
951 """
952 if skip_existing_exposures:
953 existing = {
954 ref.dataId
955 for ref in self.butler.registry.queryDatasets(
956 self.datasetType,
957 collections=[run],
958 dataId=exposure.dataId,
959 )
960 }
961 else:
962 existing = set()
963 datasets = []
964 for file in exposure.files:
965 refs = [DatasetRef(self.datasetType, d.dataId) for d in file.datasets if d.dataId not in existing]
966 if refs:
967 datasets.append(
968 FileDataset(path=file.filename.abspath(), refs=refs, formatter=file.FormatterClass)
969 )
971 # Raw files are preferentially ingested using a UUID derived from
972 # the collection name and dataId.
973 if self.butler.registry.supportsIdGenerationMode(DatasetIdGenEnum.DATAID_TYPE_RUN):
974 mode = DatasetIdGenEnum.DATAID_TYPE_RUN
975 else:
976 mode = DatasetIdGenEnum.UNIQUE
977 self.butler.ingest(
978 *datasets,
979 transfer=self.config.transfer,
980 run=run,
981 idGenerationMode=mode,
982 record_validation_info=track_file_attrs,
983 )
984 return datasets
986 def ingestFiles(
987 self,
988 files: Iterable[ResourcePath],
989 *,
990 pool: Optional[PoolType] = None,
991 processes: int = 1,
992 run: Optional[str] = None,
993 skip_existing_exposures: bool = False,
994 update_exposure_records: bool = False,
995 track_file_attrs: bool = True,
996 ) -> Tuple[List[DatasetRef], List[ResourcePath], int, int, int]:
997 """Ingest files into a Butler data repository.
999 This creates any new exposure or visit Dimension entries needed to
1000 identify the ingested files, creates new Dataset entries in the
1001 Registry and finally ingests the files themselves into the Datastore.
1002 Any needed instrument, detector, and physical_filter Dimension entries
1003 must exist in the Registry before this method is called.
1005 Parameters
1006 ----------
1007 files : iterable over `lsst.resources.ResourcePath`
1008 URIs to the files to be ingested.
1009 pool : `multiprocessing.Pool`, optional
1010 If not `None`, a process pool with which to parallelize some
1011 operations.
1012 processes : `int`, optional
1013 The number of processes to use. Ignored if ``pool`` is not `None`.
1014 run : `str`, optional
1015 Name of a RUN-type collection to write to, overriding
1016 the default derived from the instrument name.
1017 skip_existing_exposures : `bool`, optional
1018 If `True` (`False` is default), skip raws that have already been
1019 ingested (i.e. raws for which we already have a dataset with the
1020 same data ID in the target collection, even if from another file).
1021 Note that this is much slower than just not passing
1022 already-ingested files as inputs, because we still need to read and
1023 process metadata to identify which exposures to search for. It
1024 also will not work reliably if multiple processes are attempting to
1025 ingest raws from the same exposure concurrently, in that different
1026 processes may still attempt to ingest the same raw and conflict,
1027 causing a failure that prevents other raws from the same exposure
1028 from being ingested.
1029 update_exposure_records : `bool`, optional
1030 If `True` (`False` is default), update existing exposure records
1031 that conflict with the new ones instead of rejecting them. THIS IS
1032 AN ADVANCED OPTION THAT SHOULD ONLY BE USED TO FIX METADATA THAT IS
1033 KNOWN TO BE BAD. This should usually be combined with
1034 ``skip_existing_exposures=True``.
1035 track_file_attrs : `bool`, optional
1036 Control whether file attributes such as the size or checksum should
1037 be tracked by the datastore. Whether this parameter is honored
1038 depends on the specific datastore implementation.
1040 Returns
1041 -------
1042 refs : `list` of `lsst.daf.butler.DatasetRef`
1043 Dataset references for ingested raws.
1044 bad_files : `list` of `ResourcePath`
1045 Given paths that could not be ingested.
1046 n_exposures : `int`
1047 Number of exposures successfully ingested.
1048 n_exposures_failed : `int`
1049 Number of exposures that failed when inserting dimension data.
1050 n_ingests_failed : `int`
1051 Number of exposures that failed when ingesting raw datasets.
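Examples
--------
A sketch of ingesting explicit file URIs (no directory expansion), assuming
``task`` is a constructed `RawIngestTask` and a hypothetical file path.

.. code-block:: python

    from lsst.resources import ResourcePath

    uris = [ResourcePath("file:///data/raws/exp_0001.fits")]
    refs, bad, n_ok, n_reg_fail, n_ingest_fail = task.ingestFiles(uris, processes=1)
    print(f"Ingested {len(refs)} dataset(s) from {n_ok} exposure(s)")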
1052 """
1054 exposureData, bad_files = self.prep(files, pool=pool, processes=processes)
1056 # Up to this point, we haven't modified the data repository at all.
1057 # Now we finally do that, with one transaction per exposure. This is
1058 # not parallelized at present because the performance of this step is
1059 # limited by the database server. That may or may not change in the
1060 # future once we increase our usage of bulk inserts and reduce our
1061 # usage of savepoints; we've tried to get everything but the database
1062 # operations done in advance to reduce the time spent inside
1063 # transactions.
1064 self.butler.registry.registerDatasetType(self.datasetType)
1066 refs = []
1067 runs = set()
1068 n_exposures = 0
1069 n_exposures_failed = 0
1070 n_ingests_failed = 0
1071 for exposure in self.progress.wrap(exposureData, desc="Ingesting raw exposures"):
1072 assert exposure.record is not None, "Should be guaranteed by prep()"
1073 self.log.debug(
1074 "Attempting to ingest %d file%s from exposure %s:%s",
1075 *_log_msg_counter(exposure.files),
1076 exposure.record.instrument,
1077 exposure.record.obs_id,
1078 )
1080 try:
1081 inserted_or_updated = self.butler.registry.syncDimensionData(
1082 "exposure",
1083 exposure.record,
1084 update=update_exposure_records,
1085 )
1086 except Exception as e:
1087 self._on_ingest_failure(exposure, e)
1088 n_exposures_failed += 1
1089 self.log.warning(
1090 "Exposure %s:%s could not be registered: %s",
1091 exposure.record.instrument,
1092 exposure.record.obs_id,
1093 e,
1094 )
1095 if self.config.failFast:
1096 raise e
1097 continue
1099 if isinstance(inserted_or_updated, dict):
1100 # Exposure is in the registry and we updated it, so
1101 # syncDimensionData returned a dict.
1102 self.log.info(
1103 "Exposure %s:%s was already present, but columns %s were updated.",
1104 exposure.record.instrument,
1105 exposure.record.obs_id,
1106 str(list(inserted_or_updated.keys())),
1107 )
1109 # Override default run if nothing specified explicitly.
1110 if run is None:
1111 instrument = exposure.files[0].instrument
1112 assert (
1113 instrument is not None
1114 ), "file should have been removed from this list by prep if instrument could not be found"
1115 this_run = instrument.makeDefaultRawIngestRunName()
1116 else:
1117 this_run = run
1118 if this_run not in runs:
1119 self.butler.registry.registerCollection(this_run, type=CollectionType.RUN)
1120 runs.add(this_run)
1121 try:
1122 datasets_for_exposure = self.ingestExposureDatasets(
1123 exposure,
1124 run=this_run,
1125 skip_existing_exposures=skip_existing_exposures,
1126 track_file_attrs=track_file_attrs,
1127 )
1128 except Exception as e:
1129 self._on_ingest_failure(exposure, e)
1130 n_ingests_failed += 1
1131 self.log.warning("Failed to ingest the following for reason: %s", e)
1132 for f in exposure.files:
1133 self.log.warning("- %s", f.filename)
1134 if self.config.failFast:
1135 raise e
1136 continue
1137 else:
1138 self._on_success(datasets_for_exposure)
1139 for dataset in datasets_for_exposure:
1140 refs.extend(dataset.refs)
1142 # Success for this exposure.
1143 n_exposures += 1
1144 self.log.info(
1145 "Exposure %s:%s ingested successfully", exposure.record.instrument, exposure.record.obs_id
1146 )
1148 return refs, bad_files, n_exposures, n_exposures_failed, n_ingests_failed
1150 @timeMethod
1151 def run(
1152 self,
1153 files: Iterable[ResourcePathExpression],
1154 *,
1155 pool: Optional[PoolType] = None,
1156 processes: int = 1,
1157 run: Optional[str] = None,
1158 file_filter: Union[str, re.Pattern] = r"\.fit[s]?\b",
1159 group_files: bool = True,
1160 skip_existing_exposures: bool = False,
1161 update_exposure_records: bool = False,
1162 track_file_attrs: bool = True,
1163 ) -> List[DatasetRef]:
1164 """Ingest files into a Butler data repository.
1166 This creates any new exposure or visit Dimension entries needed to
1167 identify the ingested files, creates new Dataset entries in the
1168 Registry and finally ingests the files themselves into the Datastore.
1169 Any needed instrument, detector, and physical_filter Dimension entries
1170 must exist in the Registry before `run` is called.
1172 Parameters
1173 ----------
1174 files : iterable of `lsst.resources.ResourcePath`, `str` or path-like
1175 Paths to the files to be ingested. Can refer to directories.
1176 Will be made absolute if they are not already.
1177 pool : `multiprocessing.Pool`, optional
1178 If not `None`, a process pool with which to parallelize some
1179 operations.
1180 processes : `int`, optional
1181 The number of processes to use. Ignored if ``pool`` is not `None`.
1182 run : `str`, optional
1183 Name of a RUN-type collection to write to, overriding
1184 the default derived from the instrument name.
1185 file_filter : `str` or `re.Pattern`, optional
1186 Pattern to use to discover files to ingest within directories.
1187 The default is to search for FITS files. The regex applies to
1188 files within the directory.
1189 group_files : `bool`, optional
1190 Group files by directory if they have been discovered in
1191 directories. Will not affect files explicitly provided.
1192 skip_existing_exposures : `bool`, optional
1193 If `True` (`False` is default), skip raws that have already been
1194 ingested (i.e. raws for which we already have a dataset with the
1195 same data ID in the target collection, even if from another file).
1196 Note that this is much slower than just not passing
1197 already-ingested files as inputs, because we still need to read and
1198 process metadata to identify which exposures to search for. It
1199 also will not work reliably if multiple processes are attempting to
1200 ingest raws from the same exposure concurrently, in that different
1201 processes may still attempt to ingest the same raw and conflict,
1202 causing a failure that prevents other raws from the same exposure
1203 from being ingested.
1204 update_exposure_records : `bool`, optional
1205 If `True` (`False` is default), update existing exposure records
1206 that conflict with the new ones instead of rejecting them. THIS IS
1207 AN ADVANCED OPTION THAT SHOULD ONLY BE USED TO FIX METADATA THAT IS
1208 KNOWN TO BE BAD. This should usually be combined with
1209 ``skip_existing_exposures=True``.
1210 track_file_attrs : `bool`, optional
1211 Control whether file attributes such as the size or checksum should
1212 be tracked by the datastore. Whether this parameter is honored
1213 depends on the specific datastore implementation.
1215 Returns
1216 -------
1217 refs : `list` of `lsst.daf.butler.DatasetRef`
1218 Dataset references for ingested raws.
1220 Notes
1221 -----
1222 This method inserts all datasets for an exposure within a transaction,
1223 guaranteeing that partial exposures are never ingested. The exposure
1224 dimension record is inserted with `Registry.syncDimensionData` first
1225 (in its own transaction), which inserts only if a record with the same
1226 primary key does not already exist. This allows different files within
1227 the same exposure to be ingested in different runs.
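Examples
--------
A sketch of the typical entry point, assuming ``task`` is a constructed
`RawIngestTask` (see the class docstring) and a hypothetical data directory.

.. code-block:: python

    refs = task.run(
        ["/data/raws/2020-01-01/"],    # directories are searched with ``file_filter``
        processes=4,                   # parallelize metadata extraction
        skip_existing_exposures=True,  # tolerate re-running over the same files
    )
    print(f"Ingested {len(refs)} raw dataset(s)")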
1228 """
1230 refs = []
1231 bad_files = []
1232 n_exposures = 0
1233 n_exposures_failed = 0
1234 n_ingests_failed = 0
1235 if group_files:
1236 for group in ResourcePath.findFileResources(files, file_filter, group_files):
1237 new_refs, bad, n_exp, n_exp_fail, n_ingest_fail = self.ingestFiles(
1238 group,
1239 pool=pool,
1240 processes=processes,
1241 run=run,
1242 skip_existing_exposures=skip_existing_exposures,
1243 update_exposure_records=update_exposure_records,
1244 track_file_attrs=track_file_attrs,
1245 )
1246 refs.extend(new_refs)
1247 bad_files.extend(bad)
1248 n_exposures += n_exp
1249 n_exposures_failed += n_exp_fail
1250 n_ingests_failed += n_ingest_fail
1251 else:
1252 refs, bad_files, n_exposures, n_exposures_failed, n_ingests_failed = self.ingestFiles(
1253 ResourcePath.findFileResources(files, file_filter, group_files),
1254 pool=pool,
1255 processes=processes,
1256 run=run,
1257 skip_existing_exposures=skip_existing_exposures,
1258 update_exposure_records=update_exposure_records, track_file_attrs=track_file_attrs,
1259 )
1261 had_failure = False
1263 if bad_files:
1264 had_failure = True
1265 self.log.warning("Could not extract observation metadata from the following:")
1266 for f in bad_files:
1267 self.log.warning("- %s", f)
1269 self.log.info(
1270 "Successfully processed data from %d exposure%s with %d failure%s from exposure"
1271 " registration and %d failure%s from file ingest.",
1272 *_log_msg_counter(n_exposures),
1273 *_log_msg_counter(n_exposures_failed),
1274 *_log_msg_counter(n_ingests_failed),
1275 )
1276 if n_exposures_failed > 0 or n_ingests_failed > 0:
1277 had_failure = True
1278 self.log.info("Ingested %d distinct Butler dataset%s", *_log_msg_counter(refs))
1280 if had_failure:
1281 raise RuntimeError("Some failures encountered during ingestion")
1283 return refs