Coverage for python/lsst/obs/base/ingest.py : 15%

1 # This file is part of obs_base.
2 #
3 # Developed for the LSST Data Management System.
4 # This product includes software developed by the LSST Project
5 # (https://www.lsst.org).
6 # See the COPYRIGHT file at the top-level directory of this distribution
7 # for details of code ownership.
8 #
9 # This program is free software: you can redistribute it and/or modify
10 # it under the terms of the GNU General Public License as published by
11 # the Free Software Foundation, either version 3 of the License, or
12 # (at your option) any later version.
13 #
14 # This program is distributed in the hope that it will be useful,
15 # but WITHOUT ANY WARRANTY; without even the implied warranty of
16 # MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
17 # GNU General Public License for more details.
18 #
19 # You should have received a copy of the GNU General Public License
20 # along with this program. If not, see <http://www.gnu.org/licenses/>.
23 __all__ = ("RawIngestTask", "RawIngestConfig", "makeTransferChoiceField")
25 import json
26 import re
27 from dataclasses import dataclass, InitVar
28 from typing import Callable, List, Iterator, Iterable, Tuple, Type, Optional, Any, Union
29 from collections import defaultdict
30 from multiprocessing import Pool
32 from astro_metadata_translator import ObservationInfo, merge_headers, MetadataTranslator
33 from astro_metadata_translator.indexing import process_sidecar_data, process_index_data
34 from lsst.afw.fits import readMetadata
35 from lsst.daf.butler import (
36 Butler,
37 ButlerURI,
38 CollectionType,
39 DataCoordinate,
40 DatasetIdGenEnum,
41 DatasetRef,
42 DatasetType,
43 DimensionRecord,
44 DimensionUniverse,
45 FileDataset,
46 Formatter,
47 Progress,
48)
49 from lsst.daf.butler.registry import UnsupportedIdGeneratorError
50 from lsst.pex.config import Config, ChoiceField, Field
51 from lsst.pipe.base import Task, timeMethod
53 from ._instrument import Instrument, makeExposureRecordFromObsInfo
54 from ._fitsRawFormatterBase import FitsRawFormatterBase
57 def _do_nothing(*args, **kwargs) -> None:
58 """Do nothing.
60 This is a function that accepts anything and does nothing.
61 For use as a default in callback arguments.
62 """
63 pass
66 def _log_msg_counter(noun: Union[int, Iterable]) -> Tuple[int, str]:
67 """Count the iterable and return the count and plural modifier.
69 Parameters
70 ----------
71 noun : Iterable or `int`
72 Thing to count. If given an integer it is assumed to be the count
73 to use to calculate the modifier.
75 Returns
76 -------
77 num : `int`
78 Number of items found in ``noun``.
79 modifier : `str`
80 Character to add to the end of a string referring to these items
81 to indicate whether it was a single item or not. Returns empty
82 string if there is one item or "s" otherwise.
84 Examples
85 --------
87 .. code-block:: python
89 log.warning("Found %d file%s", *_log_msg_counter(nfiles))
90 """
91 if isinstance(noun, int):
92 num = noun
93 else:
94 num = len(noun)
95 return num, "" if num == 1 else "s"
98 @dataclass
99class RawFileDatasetInfo:
100 """Information about a single dataset within a raw file."""
102 dataId: DataCoordinate
103 """Data ID for this file (`lsst.daf.butler.DataCoordinate`)."""
105 obsInfo: ObservationInfo
106 """Standardized observation metadata extracted directly from the file
107 headers (`astro_metadata_translator.ObservationInfo`).
108 """
111 @dataclass
112class RawFileData:
113 """Information about a single raw file, used during ingest."""
115 datasets: List[RawFileDatasetInfo]
116 """The information describing each dataset within this raw file.
117 (`list` of `RawFileDatasetInfo`)
118 """
120 filename: ButlerURI
121 """URI of the file this information was extracted from (`str`).
123 This is the path prior to ingest, not the path after ingest.
124 """
126 FormatterClass: Type[FitsRawFormatterBase]
127 """Formatter class that should be used to ingest this file (`type`; as
128 subclass of `FitsRawFormatterBase`).
129 """
131 instrument: Optional[Instrument]
132 """The `Instrument` instance associated with this file. Can be `None`
133 if ``datasets`` is an empty list."""
136 @dataclass
137class RawExposureData:
138 """Information about a complete raw exposure, used during ingest."""
140 dataId: DataCoordinate
141 """Data ID for this exposure (`lsst.daf.butler.DataCoordinate`).
142 """
144 files: List[RawFileData]
145 """List of structures containing file-level information.
146 """
148 universe: InitVar[DimensionUniverse]
149 """Set of all known dimensions.
150 """
152 record: Optional[DimensionRecord] = None
153 """The exposure `DimensionRecord` that must be inserted into the
154 `~lsst.daf.butler.Registry` prior to file-level ingest (`DimensionRecord`).
155 """
157 def __post_init__(self, universe: DimensionUniverse):
158 # We don't care which file or dataset we read metadata from, because
159 # we're assuming they'll all be the same; just use the first ones.
160 self.record = makeExposureRecordFromObsInfo(self.files[0].datasets[0].obsInfo, universe)
163 def makeTransferChoiceField(doc="How to transfer files (None for no transfer).", default="auto"):
164 """Create a Config field with options for transferring data between repos.
166 The allowed options for the field are exactly those supported by
167 `lsst.daf.butler.Datastore.ingest`.
169 Parameters
170 ----------
171 doc : `str`
172 Documentation for the configuration field.
174 Returns
175 -------
176 field : `lsst.pex.config.ChoiceField`
177 Configuration field.
178 """
179 return ChoiceField(
180 doc=doc,
181 dtype=str,
182 allowed={"move": "move",
183 "copy": "copy",
184 "auto": "choice will depend on datastore",
185 "direct": "use URI to ingested file directly in datastore",
186 "link": "hard link falling back to symbolic link",
187 "hardlink": "hard link",
188 "symlink": "symbolic (soft) link",
189 "relsymlink": "relative symbolic link",
190 },
191 optional=True,
192 default=default
193 )
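# A minimal sketch (not part of the original module) of how a config class can
# reuse makeTransferChoiceField with a different default; "copy" is an
# arbitrary illustrative choice and any key of the allowed dict above works.
class _ExampleCopyIngestConfig(Config):
    """Example config whose raw files are copied into the datastore."""
    transfer = makeTransferChoiceField(default="copy")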
196 class RawIngestConfig(Config):
197 """Configuration class for RawIngestTask."""
199 transfer = makeTransferChoiceField()
200 failFast = Field(
201 dtype=bool,
202 default=False,
203 doc="If True, stop ingest as soon as any problem is encountered with any file. "
204 "Otherwise problems files will be skipped and logged and a report issued at completion.",
205 )
208 class RawIngestTask(Task):
209 """Driver Task for ingesting raw data into Gen3 Butler repositories.
211 Parameters
212 ----------
213 config : `RawIngestConfig`
214 Configuration for the task.
215 butler : `~lsst.daf.butler.Butler`
216 Writeable butler instance, with ``butler.run`` set to the appropriate
217 `~lsst.daf.butler.CollectionType.RUN` collection for these raw
218 datasets.
219 on_success : `Callable`, optional
220 A callback invoked when all of the raws associated with an exposure
221 are ingested. Will be passed a list of `FileDataset` objects, each
222 containing one or more resolved `DatasetRef` objects. If this callback
223 raises it will interrupt the entire ingest process, even if
224 `RawIngestConfig.failFast` is `False`.
225 on_metadata_failure : `Callable`, optional
226 A callback invoked when a failure occurs trying to translate the
227 metadata for a file. Will be passed the URI and the exception, in
228 that order, as positional arguments. Guaranteed to be called in an
229 ``except`` block, allowing the callback to re-raise or replace (with
230 ``raise ... from``) to override the task's usual error handling (before
231 `RawIngestConfig.failFast` logic occurs).
232 on_ingest_failure : `Callable`, optional
233 A callback invoked when dimension record or dataset insertion into the
234 database fails for an exposure. Will be passed a `RawExposureData`
235 instance and the exception, in that order, as positional arguments.
236 Guaranteed to be called in an ``except`` block, allowing the callback
237 to re-raise or replace (with ``raise ... from``) to override the task's
238 usual error handling (before `RawIngestConfig.failFast` logic occurs).
239 **kwargs
240 Additional keyword arguments are forwarded to the `lsst.pipe.base.Task`
241 constructor.
243 Notes
244 -----
245 Each instance of `RawIngestTask` writes to the same Butler. Each
246 invocation of `RawIngestTask.run` ingests a list of files.
247 """
249 ConfigClass = RawIngestConfig
251 _DefaultName = "ingest"
253 def getDatasetType(self):
254 """Return the DatasetType of the datasets ingested by this Task."""
255 return DatasetType("raw", ("instrument", "detector", "exposure"), "Exposure",
256 universe=self.butler.registry.dimensions)
258 def __init__(self, config: Optional[RawIngestConfig] = None, *, butler: Butler,
259 on_success: Callable[[List[FileDataset]], Any] = _do_nothing,
260 on_metadata_failure: Callable[[str, Exception], Any] = _do_nothing,
261 on_ingest_failure: Callable[[RawExposureData, Exception], Any] = _do_nothing,
262 **kwargs: Any):
263 config.validate() # Not a CmdlineTask nor PipelineTask, so have to validate the config here.
264 super().__init__(config, **kwargs)
265 self.butler = butler
266 self.universe = self.butler.registry.dimensions
267 self.datasetType = self.getDatasetType()
268 self._on_success = on_success
269 self._on_metadata_failure = on_metadata_failure
270 self._on_ingest_failure = on_ingest_failure
271 self.progress = Progress("obs.base.RawIngestTask")
273 # Import all the instrument classes so that we ensure that we
274 # have all the relevant metadata translators loaded.
275 Instrument.importAll(self.butler.registry)
277 def _reduce_kwargs(self):
278 # Add extra parameters to pickle.
279 return dict(**super()._reduce_kwargs(), butler=self.butler, on_success=self._on_success,
280 on_metadata_failure=self._on_metadata_failure, on_ingest_failure=self._on_ingest_failure)
282 def _determine_instrument_formatter(self, dataId, filename):
283 """Determine the instrument and formatter class.
285 Parameters
286 ----------
287 dataId : `lsst.daf.butler.DataCoordinate`
288 The dataId associated with this dataset.
289 filename : `ButlerURI`
290 URI of file used for error reporting.
292 Returns
293 -------
294 instrument : `Instrument` or `None`
295 Instance of the `Instrument` associated with this dataset. `None`
296 indicates that the instrument could not be determined.
297 formatterClass : `type`
298 Class to be used as the formatter for this dataset.
299 """
300 # The data model currently assumes that whilst multiple datasets
301 # can be associated with a single file, they must all share the
302 # same formatter.
303 try:
304 instrument = Instrument.fromName(dataId["instrument"], self.butler.registry)
305 except LookupError as e:
306 self._on_metadata_failure(filename, e)
307 self.log.warning("Instrument %s for file %s not known to registry",
308 dataId["instrument"], filename)
309 if self.config.failFast:
310 raise RuntimeError(f"Instrument {dataId['instrument']} for"
311 f" file {filename} not known to registry") from e
312 FormatterClass = Formatter
313 # Indicate that we could not work out the instrument.
314 instrument = None
315 else:
316 FormatterClass = instrument.getRawFormatter(dataId)
317 return instrument, FormatterClass
319 def extractMetadata(self, filename: ButlerURI) -> RawFileData:
320 """Extract and process metadata from a single raw file.
322 Parameters
323 ----------
324 filename : `ButlerURI`
325 URI to the file.
327 Returns
328 -------
329 data : `RawFileData`
330 A structure containing the metadata extracted from the file,
331 as well as the original filename. All fields will be populated,
332 but the `RawFileData.dataId` attribute will be a minimal
333 (unexpanded) `~lsst.daf.butler.DataCoordinate` instance. The
334 ``instrument`` field will be `None` if there is a problem
335 with metadata extraction.
337 Notes
338 -----
339 Assumes that there is a single dataset associated with the given
340 file. Instruments using a single file to store multiple datasets
341 must implement their own version of this method.
343 By default the method will catch all exceptions unless the ``failFast``
344 configuration item is `True`. If an error is encountered the
345 `_on_metadata_failure()` method will be called. If no exception is
346 raised but an error was encountered, the returned object will have
347 a `None` instrument and no datasets.
349 This method supports sidecar JSON files which can be used to
350 extract metadata without having to read the data file itself.
351 The sidecar file is always used if found.
352 """
353 sidecar_fail_msg = "" # Requires prepended space when set.
354 try:
355 sidecar_file = filename.updatedExtension(".json")
356 if sidecar_file.exists():
357 content = json.loads(sidecar_file.read())
358 headers = [process_sidecar_data(content)]
359 sidecar_fail_msg = " (via sidecar)"
360 else:
361 # Read the metadata from the data file itself.
363 # For remote files download the entire file to get the
364 # header. This is very inefficient and it would be better
365 # to have some way of knowing where in the file the headers
366 # are and to only download those parts of the file.
367 with filename.as_local() as local_file:
368 # Read the primary. This might be sufficient.
369 header = readMetadata(local_file.ospath, 0)
371 try:
372 # Try to work out a translator class early.
373 translator_class = MetadataTranslator.determine_translator(header, filename=filename)
374 except ValueError:
375 # Primary header was not sufficient (maybe this file
376 # has been compressed or is a MEF with minimal
377 # primary). Read second header and merge with primary.
378 header = merge_headers([header, readMetadata(local_file.ospath, 1)], mode="overwrite")
380 # Try again to work out a translator class, letting this
381 # fail.
382 translator_class = MetadataTranslator.determine_translator(header, filename=filename)
384 # Request the headers to use for ingest
385 headers = translator_class.determine_translatable_headers(filename.ospath, header)
387 # Add each header to the dataset list
388 datasets = [self._calculate_dataset_info(h, filename) for h in headers]
390 except Exception as e:
391 self.log.debug("Problem extracting metadata from %s%s: %s", filename, sidecar_fail_msg, e)
392 # Indicate to the caller that we failed to read.
393 datasets = []
394 formatterClass = Formatter
395 instrument = None
396 self._on_metadata_failure(filename, e)
397 if self.config.failFast:
398 raise RuntimeError("Problem extracting metadata for file "
399 f"{filename}{sidecar_fail_msg}") from e
400 else:
401 self.log.debug("Extracted metadata for file %s%s", filename, sidecar_fail_msg)
402 # The data model currently assumes that whilst multiple datasets
403 # can be associated with a single file, they must all share the
404 # same formatter.
405 instrument, formatterClass = self._determine_instrument_formatter(datasets[0].dataId, filename)
406 if instrument is None:
407 datasets = []
409 return RawFileData(datasets=datasets, filename=filename,
410 FormatterClass=formatterClass,
411 instrument=instrument)
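# Illustrative sketch (assumed paths and names, not from this module) of the
# sidecar convention described above: for a raw file "data/raw_001.fits", a
# sidecar "data/raw_001.json" in the same directory is used in preference to
# reading the FITS headers themselves.
#
#     file_data = task.extractMetadata(ButlerURI("data/raw_001.fits"))
#     if not file_data.datasets:
#         ...  # extraction failed; _on_metadata_failure has already been called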
413 def _calculate_dataset_info(self, header, filename):
414 """Calculate a RawFileDatasetInfo from the supplied information.
416 Parameters
417 ----------
418 header : Mapping or `astro_metadata_translator.ObservationInfo`
419 Header from the dataset or previously-translated content.
420 filename : `ButlerURI`
421 Filename to use for error messages.
423 Returns
424 -------
425 dataset : `RawFileDatasetInfo`
426 The dataId, and observation information associated with this
427 dataset.
428 """
429 # To ensure we aren't slowed down for no reason, explicitly
430 # list here the properties we need for the schema.
431 # Use a dict with values a boolean where True indicates
432 # that it is required that we calculate this property.
433 ingest_subset = {
434 "altaz_begin": False,
435 "boresight_rotation_coord": False,
436 "boresight_rotation_angle": False,
437 "dark_time": False,
438 "datetime_begin": True,
439 "datetime_end": True,
440 "detector_num": True,
441 "exposure_group": False,
442 "exposure_id": True,
443 "exposure_time": True,
444 "instrument": True,
445 "tracking_radec": False,
446 "object": False,
447 "observation_counter": False,
448 "observation_id": True,
449 "observation_reason": False,
450 "observation_type": True,
451 "observing_day": False,
452 "physical_filter": True,
453 "science_program": False,
454 "visit_id": False,
455 }
457 if isinstance(header, ObservationInfo):
458 obsInfo = header
459 missing = []
460 # Need to check the required properties are present.
461 for property, required in ingest_subset.items():
462 if not required:
463 continue
464 # getattr does not need to be protected because it is using
465 # the defined list above containing properties that must exist.
466 value = getattr(obsInfo, property)
467 if value is None:
468 missing.append(property)
469 if missing:
470 raise ValueError(f"Requested required properties are missing from file {filename}:"
471 f" {missing} (via JSON)")
473 else:
474 obsInfo = ObservationInfo(header, pedantic=False, filename=str(filename),
475 required={k for k in ingest_subset if ingest_subset[k]},
476 subset=set(ingest_subset))
478 dataId = DataCoordinate.standardize(instrument=obsInfo.instrument,
479 exposure=obsInfo.exposure_id,
480 detector=obsInfo.detector_num,
481 universe=self.universe)
482 return RawFileDatasetInfo(obsInfo=obsInfo, dataId=dataId)
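# Sketch of the two header forms this helper accepts (names hypothetical):
#
#     # Already-translated content, e.g. from a sidecar or index file.
#     info = self._calculate_dataset_info(observation_info, file_uri)
#
#     # Raw header mapping read from the data file itself.
#     info = self._calculate_dataset_info(header_dict, file_uri)
#
# Either way the result pairs the ObservationInfo with a minimal data ID built
# from instrument, exposure_id and detector_num.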
484 def locateAndReadIndexFiles(self, files):
485 """Given a list of files, look for index files and read them.
487 Index files can either be explicitly in the list of files to
488 ingest, or else located in the same directory as a file to ingest.
489 Index entries are always used if present.
491 Parameters
492 ----------
493 files : iterable over `ButlerURI`
494 URIs to the files to be ingested.
496 Returns
497 -------
498 index : `dict` [`str`, Any]
499 Merged contents of all relevant index files found. These can
500 be explicitly specified index files or ones found in the
501 directory alongside a data file to be ingested.
502 updated_files : iterable of `str`
503 Updated list of the input files with entries removed that were
504 found listed in an index file. Order is not guaranteed to
505 match the order of the files given to this routine.
good_index_files : `set` [`str`]
Index files that were successfully read and used.
506 bad_index_files : `set` [`str`]
507 Files that looked like index files but failed to read properly.
508 """
509 # Convert the paths to absolute for easy comparison with index content.
510 # Do not convert to real paths since we have to assume that index
511 # files are in this location and not the location which it links to.
512 files = tuple(f.abspath() for f in files)
514 # Index files must be named this.
515 index_root_file = "_index.json"
517 # Group the files by directory.
518 files_by_directory = defaultdict(set)
520 for path in files:
521 directory, file_in_dir = path.split()
522 files_by_directory[directory].add(file_in_dir)
524 # All the metadata read from index files with keys of full path.
525 index_entries = {}
527 # Index files we failed to read.
528 bad_index_files = set()
530 # Any good index files that were found and used.
531 good_index_files = set()
533 # Look for index files in those directories.
534 for directory, files_in_directory in files_by_directory.items():
535 possible_index_file = directory.join(index_root_file)
536 if possible_index_file.exists():
537 # If we are explicitly requesting an index file the
538 # messages should be different.
539 index_msg = "inferred"
540 is_implied = True
541 if index_root_file in files_in_directory:
542 index_msg = "explicit"
543 is_implied = False
545 # Try to read the index file and catch and report any
546 # problems.
547 try:
548 content = json.loads(possible_index_file.read())
549 index = process_index_data(content, force_dict=True)
550 except Exception as e:
551 # Only trigger the callback if the index file
552 # was asked for explicitly. Triggering on implied file
553 # might be surprising.
554 if not is_implied:
555 self._on_metadata_failure(possible_index_file, e)
556 if self.config.failFast:
557 raise RuntimeError(f"Problem reading index file from {index_msg} "
558 f"location {possible_index_file}") from e
559 bad_index_files.add(possible_index_file)
560 continue
562 self.log.debug("Extracted index metadata from %s file %s", index_msg, possible_index_file)
563 good_index_files.add(possible_index_file)
565 # Go through the index adding entries for files.
566 # If we have non-index files in this directory marked for
567 # ingest we should only get index information for those.
568 # If the index file was explicit we use all entries.
569 if is_implied:
570 files_to_ingest = files_in_directory
571 else:
572 files_to_ingest = set(index)
574 # Copy relevant metadata into a single dict for all index
575 # entries.
576 for file_in_dir in files_to_ingest:
577 # Skip an explicitly specified index file.
578 # This should never happen because an explicit index
579 # file will force ingest of all files in the index
580 # and not use the explicit file list. If somehow
581 # this is not true we continue. Raising an exception
582 # seems like the wrong thing to do since this is harmless.
583 if file_in_dir == index_root_file:
584 self.log.info("Logic error found scanning directory %s. Please file ticket.",
585 directory)
586 continue
587 if file_in_dir in index:
588 file = directory.join(file_in_dir)
589 if file in index_entries:
590 # ObservationInfo overrides raw metadata
591 if isinstance(index[file_in_dir], ObservationInfo) \
592 and not isinstance(index_entries[file], ObservationInfo):
593 self.log.warning("File %s already specified in an index file but overriding"
594 " with ObservationInfo content from %s",
595 file, possible_index_file)
596 else:
597 self.log.warning("File %s already specified in an index file, "
598 "ignoring content from %s", file, possible_index_file)
599 # Do nothing in this case
600 continue
602 index_entries[file] = index[file_in_dir]
604 # Remove files from list that have index entries and also
605 # any files that we determined to be explicit index files
606 # or any index files that we failed to read.
607 filtered = set(files) - set(index_entries) - good_index_files - bad_index_files
609 # The filtered list loses the initial order. Retaining the order
610 # is good for testing but does have a cost if there are many
611 # files when copying the good values out. A dict would have faster
612 # lookups (using the files as keys) but use more memory.
613 ordered = [f for f in files if f in filtered]
615 return index_entries, ordered, good_index_files, bad_index_files
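# Sketch of the index-file convention in practice (paths are hypothetical):
# given ["night1/raw_001.fits", "night1/_index.json"], the explicitly listed
# "night1/_index.json" supplies metadata for every file it mentions, those
# files and the index file itself are removed from the returned file list, and
# the caller unpacks the result as
#
#     index_entries, files, good_index_files, bad_index_files = \
#         self.locateAndReadIndexFiles(files)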
617 def processIndexEntries(self, index_entries):
618 """Convert index entries to RawFileData.
620 Parameters
621 ----------
622 index_entries : `dict` [`str`, Any]
623 Dict indexed by name of file to ingest and with values either
624 raw metadata or translated
625 `~astro_metadata_translator.ObservationInfo`.
627 Returns
628 -------
629 data : `list` of `RawFileData`
630 Structures containing the metadata extracted from each file,
631 as well as the original filenames. All fields will be populated,
632 but the nested data ID attributes will be minimal
633 (unexpanded) `~lsst.daf.butler.DataCoordinate` instances.
634 """
635 fileData = []
636 for filename, metadata in index_entries.items():
637 try:
638 datasets = [self._calculate_dataset_info(metadata, filename)]
639 except Exception as e:
640 self.log.debug("Problem extracting metadata for file %s found in index file: %s",
641 filename, e)
642 datasets = []
643 formatterClass = Formatter
644 instrument = None
645 self._on_metadata_failure(filename, e)
646 if self.config.failFast:
647 raise RuntimeError(f"Problem extracting metadata for file {filename} "
648 "found in index file") from e
649 else:
650 instrument, formatterClass = self._determine_instrument_formatter(datasets[0].dataId,
651 filename)
652 if instrument is None:
653 datasets = []
654 fileData.append(RawFileData(datasets=datasets, filename=filename,
655 FormatterClass=formatterClass, instrument=instrument))
656 return fileData
658 def groupByExposure(self, files: Iterable[RawFileData]) -> List[RawExposureData]:
659 """Group an iterable of `RawFileData` by exposure.
661 Parameters
662 ----------
663 files : iterable of `RawFileData`
664 File-level information to group.
666 Returns
667 -------
668 exposures : `list` of `RawExposureData`
669 A list of structures that group the file-level information by
670 exposure. All fields will be populated. The
671 `RawExposureData.dataId` attributes will be minimal (unexpanded)
672 `~lsst.daf.butler.DataCoordinate` instances.
673 """
674 exposureDimensions = self.universe["exposure"].graph
675 byExposure = defaultdict(list)
676 for f in files:
677 # Assume that the first dataset is representative for the file.
678 byExposure[f.datasets[0].dataId.subset(exposureDimensions)].append(f)
680 return [RawExposureData(dataId=dataId, files=exposureFiles, universe=self.universe)
681 for dataId, exposureFiles in byExposure.items()]
683 def expandDataIds(self, data: RawExposureData) -> RawExposureData:
684 """Expand the data IDs associated with a raw exposure.
686 This adds the metadata records.
688 Parameters
689 ----------
690 data : `RawExposureData`
691 A structure containing information about the exposure to be
692 ingested. Must have `RawExposureData.record` populated. Should
693 be considered consumed upon return.
695 Returns
696 -------
697 exposure : `RawExposureData`
698 An updated version of the input structure, with
699 `RawExposureData.dataId` and nested `RawFileData.dataId` attributes
700 updated to data IDs for which
701 `~lsst.daf.butler.DataCoordinate.hasRecords` returns `True`.
702 """
703 # We start by expanding the exposure-level data ID; we won't use that
704 # directly in file ingest, but this lets us do some database lookups
705 # once per exposure instead of once per file later.
706 data.dataId = self.butler.registry.expandDataId(
707 data.dataId,
708 # We pass in the records we'll be inserting shortly so they aren't
709 # looked up from the database. We do expect instrument and filter
710 # records to be retrieved from the database here (though the
711 # Registry may cache them so there isn't a lookup every time).
712 records={
713 self.butler.registry.dimensions["exposure"]: data.record,
714 }
715 )
716 # Now we expand the per-file (exposure+detector) data IDs. This time
717 # we pass in the records we just retrieved from the exposure data ID
718 # expansion.
719 for file in data.files:
720 for dataset in file.datasets:
721 dataset.dataId = self.butler.registry.expandDataId(
722 dataset.dataId,
723 records=dict(data.dataId.records)
724 )
725 return data
727 def prep(self, files, *, pool: Optional[Pool] = None, processes: int = 1
728 ) -> Tuple[Iterator[RawExposureData], List[str]]:
729 """Perform all non-database-updating ingest preprocessing steps.
731 Parameters
732 ----------
733 files : iterable over `str` or path-like objects
734 Paths to the files to be ingested. Will be made absolute
735 if they are not already.
736 pool : `multiprocessing.Pool`, optional
737 If not `None`, a process pool with which to parallelize some
738 operations.
739 processes : `int`, optional
740 The number of processes to use. Ignored if ``pool`` is not `None`.
742 Returns
743 -------
744 exposures : `Iterator` [ `RawExposureData` ]
745 Data structures containing dimension records, filenames, and data
746 IDs to be ingested (one structure for each exposure).
747 bad_files : `list` of `str`
748 List of all the files that could not have metadata extracted.
749 """
750 if pool is None and processes > 1:
751 pool = Pool(processes)
752 mapFunc = map if pool is None else pool.imap_unordered
754 def _partition_good_bad(file_data: Iterable[RawFileData]) -> Tuple[List[RawFileData], List[str]]:
755 """Filter out bad files and return good with list of bad."""
756 good_files = []
757 bad_files = []
758 for fileDatum in self.progress.wrap(file_data, desc="Reading image metadata", total=len(files)):
759 if not fileDatum.datasets:
760 bad_files.append(fileDatum.filename)
761 else:
762 good_files.append(fileDatum)
763 return good_files, bad_files
765 # Look for index files and read them.
766 # There should be far fewer index files than data files.
767 index_entries, files, good_index_files, bad_index_files = self.locateAndReadIndexFiles(files)
768 if bad_index_files:
769 self.log.info("Failed to read the following explicitly requested index files:"),
770 for bad in sorted(bad_index_files):
771 self.log.info("- %s", bad)
773 # Now convert all the index file entries to standard form for ingest.
774 bad_index_file_data = []
775 indexFileData = self.processIndexEntries(index_entries)
776 if indexFileData:
777 indexFileData, bad_index_file_data = _partition_good_bad(indexFileData)
778 self.log.info("Successfully extracted metadata for %d file%s found in %d index file%s"
779 " with %d failure%s",
780 *_log_msg_counter(indexFileData),
781 *_log_msg_counter(good_index_files),
782 *_log_msg_counter(bad_index_file_data))
784 # Extract metadata and build per-detector regions.
785 # This could run in a subprocess so collect all output
786 # before looking at failures.
787 fileData: Iterator[RawFileData] = mapFunc(self.extractMetadata, files)
789 # Filter out all the failed reads and store them for later
790 # reporting.
791 fileData, bad_files = _partition_good_bad(fileData)
792 self.log.info("Successfully extracted metadata from %d file%s with %d failure%s",
793 *_log_msg_counter(fileData),
794 *_log_msg_counter(bad_files))
796 # Combine with data from index files.
797 fileData.extend(indexFileData)
798 bad_files.extend(bad_index_file_data)
799 bad_files.extend(bad_index_files)
801 # Use that metadata to group files (and extracted metadata) by
802 # exposure. Never parallelized because it's intrinsically a gather
803 # step.
804 exposureData: List[RawExposureData] = self.groupByExposure(fileData)
806 # The next operation operates on RawExposureData instances (one at
807 # a time) in-place and then returns the modified instance. We call it
808 # as a pass-through instead of relying on the arguments we pass in to
809 # have been modified because in the parallel case those arguments are
810 # going to be pickled and unpickled, and I'm not certain
811 # multiprocessing is careful enough with that for output arguments to
812 # work.
814 # Expand the data IDs to include all dimension metadata; we need this
815 # because we may need to generate path templates that rely on that
816 # metadata.
817 # This is the first step that involves actual database calls (but just
818 # SELECTs), so if there's going to be a problem with connections vs.
819 # multiple processes, or lock contention (in SQLite) slowing things
820 # down, it'll happen here.
821 return mapFunc(self.expandDataIds, exposureData), bad_files
823 def ingestExposureDatasets(self, exposure: RawExposureData, *, run: Optional[str] = None
824 ) -> List[FileDataset]:
825 """Ingest all raw files in one exposure.
827 Parameters
828 ----------
829 exposure : `RawExposureData`
830 A structure containing information about the exposure to be
831 ingested. Must have `RawExposureData.record` populated and all
832 data ID attributes expanded.
833 run : `str`, optional
834 Name of a RUN-type collection to write to, overriding
835 ``self.butler.run``.
837 Returns
838 -------
839 datasets : `list` of `lsst.daf.butler.FileDataset`
840 Per-file structures identifying the files ingested and their
841 dataset representation in the data repository.
842 """
843 datasets = [FileDataset(path=file.filename.abspath(),
844 refs=[DatasetRef(self.datasetType, d.dataId) for d in file.datasets],
845 formatter=file.FormatterClass)
846 for file in exposure.files]
848 # Raw files are preferentially ingested using a UUID derived from
849 # the collection name and dataId.
850 # We do not know if this registry can support UUID so try it
851 # and fall back to the UNIQUE option if that fails.
852 try:
853 self.butler.ingest(*datasets, transfer=self.config.transfer, run=run,
854 idGenerationMode=DatasetIdGenEnum.DATAID_TYPE_RUN)
855 except UnsupportedIdGeneratorError:
856 self.butler.ingest(*datasets, transfer=self.config.transfer, run=run,
857 idGenerationMode=DatasetIdGenEnum.UNIQUE)
858 return datasets
860 def ingestFiles(self, files, *, pool: Optional[Pool] = None, processes: int = 1,
861 run: Optional[str] = None,
862 skip_existing_exposures: bool = False,
863 update_exposure_records: bool = False):
864 """Ingest files into a Butler data repository.
866 This creates any new exposure or visit Dimension entries needed to
867 identify the ingested files, creates new Dataset entries in the
868 Registry and finally ingests the files themselves into the Datastore.
869 Any needed instrument, detector, and physical_filter Dimension entries
870 must exist in the Registry before `run` is called.
872 Parameters
873 ----------
874 files : iterable over `ButlerURI`
875 URIs to the files to be ingested.
876 pool : `multiprocessing.Pool`, optional
877 If not `None`, a process pool with which to parallelize some
878 operations.
879 processes : `int`, optional
880 The number of processes to use. Ignored if ``pool`` is not `None`.
881 run : `str`, optional
882 Name of a RUN-type collection to write to, overriding
883 the default derived from the instrument name.
884 skip_existing_exposures : `bool`, optional
885 If `True` (`False` is default), skip ingestion for any files for
886 which the exposure record already exists (even if this is only
887 because other raws from the same exposure have been ingested).
888 Note that this is much slower than just not passing
889 already-ingested files as inputs, because we still need to read and
890 process metadata to identify which exposures to search for.
891 update_exposure_records : `bool`, optional
892 If `True` (`False` is default), update existing exposure records
893 that conflict with the new ones instead of rejecting them. THIS IS
894 AN ADVANCED OPTION THAT SHOULD ONLY BE USED TO FIX METADATA THAT IS
895 KNOWN TO BE BAD. This should usually be combined with
896 ``skip_existing_exposures=True``.
898 Returns
899 -------
900 refs : `list` of `lsst.daf.butler.DatasetRef`
901 Dataset references for ingested raws.
902 """
904 exposureData, bad_files = self.prep(files, pool=pool, processes=processes)
906 # Up to this point, we haven't modified the data repository at all.
907 # Now we finally do that, with one transaction per exposure. This is
908 # not parallelized at present because the performance of this step is
909 # limited by the database server. That may or may not change in the
910 # future once we increase our usage of bulk inserts and reduce our
911 # usage of savepoints; we've tried to get everything but the database
912 # operations done in advance to reduce the time spent inside
913 # transactions.
914 self.butler.registry.registerDatasetType(self.datasetType)
916 refs = []
917 runs = set()
918 n_exposures = 0
919 n_exposures_failed = 0
920 n_ingests_failed = 0
921 for exposure in self.progress.wrap(exposureData, desc="Ingesting raw exposures"):
923 self.log.debug("Attempting to ingest %d file%s from exposure %s:%s",
924 *_log_msg_counter(exposure.files),
925 exposure.record.instrument, exposure.record.obs_id)
927 try:
928 inserted_or_updated = self.butler.registry.syncDimensionData(
929 "exposure",
930 exposure.record,
931 update=update_exposure_records,
932 )
933 except Exception as e:
934 self._on_ingest_failure(exposure, e)
935 n_exposures_failed += 1
936 self.log.warning("Exposure %s:%s could not be registered: %s",
937 exposure.record.instrument, exposure.record.obs_id, e)
938 if self.config.failFast:
939 raise e
940 continue
942 if isinstance(inserted_or_updated, dict):
943 # Exposure is in the registry and we updated it, so
944 # syncDimensionData returned a dict.
945 self.log.info(
946 "Exposure %s:%s was already present, but columns %s were updated.",
947 exposure.record.instrument,
948 exposure.record.obs_id,
949 str(list(inserted_or_updated.keys()))
950 )
951 if skip_existing_exposures:
952 continue
953 elif not inserted_or_updated and skip_existing_exposures:
954 # Exposure is already in the registry, with the right metadata,
955 # and we're configured to skip file ingest if that's the case.
956 self.log.info(
957 "Exposure %s:%s was already present; skipping file ingestion as requested.",
958 exposure.record.instrument,
959 exposure.record.obs_id,
960 )
961 continue
963 # Use the instrument's default raw run name if no run was specified explicitly.
964 if run is None:
965 instrument = exposure.files[0].instrument
966 this_run = instrument.makeDefaultRawIngestRunName()
967 else:
968 this_run = run
969 if this_run not in runs:
970 self.butler.registry.registerCollection(this_run, type=CollectionType.RUN)
971 runs.add(this_run)
972 try:
973 datasets_for_exposure = self.ingestExposureDatasets(exposure, run=this_run)
974 except Exception as e:
975 self._on_ingest_failure(exposure, e)
976 n_ingests_failed += 1
977 self.log.warning("Failed to ingest the following for reason: %s", e)
978 for f in exposure.files:
979 self.log.warning("- %s", f.filename)
980 if self.config.failFast:
981 raise e
982 continue
983 else:
984 self._on_success(datasets_for_exposure)
985 for dataset in datasets_for_exposure:
986 refs.extend(dataset.refs)
988 # Success for this exposure.
989 n_exposures += 1
990 self.log.info("Exposure %s:%s ingested successfully",
991 exposure.record.instrument, exposure.record.obs_id)
993 return refs, bad_files, n_exposures, n_exposures_failed, n_ingests_failed
995 @timeMethod
996 def run(self, files, *, pool: Optional[Pool] = None, processes: int = 1, run: Optional[str] = None,
997 file_filter: Union[str, re.Pattern] = r"\.fit[s]?\b", group_files: bool = True,
998 skip_existing_exposures: bool = False, update_exposure_records: bool = False):
999 """Ingest files into a Butler data repository.
1001 This creates any new exposure or visit Dimension entries needed to
1002 identify the ingested files, creates new Dataset entries in the
1003 Registry and finally ingests the files themselves into the Datastore.
1004 Any needed instrument, detector, and physical_filter Dimension entries
1005 must exist in the Registry before `run` is called.
1007 Parameters
1008 ----------
1009 files : iterable over `ButlerURI`, `str` or path-like objects
1010 Paths to the files to be ingested. Can refer to directories.
1011 Will be made absolute if they are not already.
1012 pool : `multiprocessing.Pool`, optional
1013 If not `None`, a process pool with which to parallelize some
1014 operations.
1015 processes : `int`, optional
1016 The number of processes to use. Ignored if ``pool`` is not `None`.
1017 run : `str`, optional
1018 Name of a RUN-type collection to write to, overriding
1019 the default derived from the instrument name.
1020 file_filter : `str` or `re.Pattern`, optional
1021 Pattern to use to discover files to ingest within directories.
1022 The default is to search for FITS files. The regex applies to
1023 files within the directory.
1024 group_files : `bool`, optional
1025 Group files by directory if they have been discovered in
1026 directories. Will not affect files explicitly provided.
1027 skip_existing_exposures : `bool`, optional
1028 If `True` (`False` is default), skip ingestion for any files for
1029 which the exposure record already exists (even if this is only
1030 because other raws from the same exposure have been ingested).
1031 Note that this is much slower than just not passing
1032 already-ingested files as inputs, because we still need to read and
1033 process metadata to identify which exposures to search for.
1034 update_exposure_records : `bool`, optional
1035 If `True` (`False` is default), update existing exposure records
1036 that conflict with the new ones instead of rejecting them. THIS IS
1037 AN ADVANCED OPTION THAT SHOULD ONLY BE USED TO FIX METADATA THAT IS
1038 KNOWN TO BE BAD. This should usually be combined with
1039 ``skip_existing_exposures=True``.
1041 Returns
1042 -------
1043 refs : `list` of `lsst.daf.butler.DatasetRef`
1044 Dataset references for ingested raws.
1046 Notes
1047 -----
1048 This method inserts all datasets for an exposure within a transaction,
1049 guaranteeing that partial exposures are never ingested. The exposure
1050 dimension record is inserted with `Registry.syncDimensionData` first
1051 (in its own transaction), which inserts only if a record with the same
1052 primary key does not already exist. This allows different files within
1053 the same exposure to be ingested in different runs.
1054 """
1056 refs = []
1057 bad_files = []
1058 n_exposures = 0
1059 n_exposures_failed = 0
1060 n_ingests_failed = 0
1061 if group_files:
1062 for group in ButlerURI.findFileResources(files, file_filter, group_files):
1063 new_refs, bad, n_exp, n_exp_fail, n_ingest_fail = self.ingestFiles(
1064 group,
1065 pool=pool,
1066 processes=processes,
1067 run=run,
1068 skip_existing_exposures=skip_existing_exposures,
1069 update_exposure_records=update_exposure_records,
1070 )
1071 refs.extend(new_refs)
1072 bad_files.extend(bad)
1073 n_exposures += n_exp
1074 n_exposures_failed += n_exp_fail
1075 n_ingests_failed += n_ingest_fail
1076 else:
1077 refs, bad_files, n_exposures, n_exposures_failed, n_ingests_failed = self.ingestFiles(
1078 ButlerURI.findFileResources(files, file_filter, group_files),
1079 pool=pool,
1080 processes=processes,
1081 run=run,
1082 skip_existing_exposures=skip_existing_exposures,
1083 update_exposure_records=update_exposure_records,
1084 )
1086 had_failure = False
1088 if bad_files:
1089 had_failure = True
1090 self.log.warning("Could not extract observation metadata from the following:")
1091 for f in bad_files:
1092 self.log.warning("- %s", f)
1094 self.log.info("Successfully processed data from %d exposure%s with %d failure%s from exposure"
1095 " registration and %d failure%s from file ingest.",
1096 *_log_msg_counter(n_exposures),
1097 *_log_msg_counter(n_exposures_failed),
1098 *_log_msg_counter(n_ingests_failed))
1099 if n_exposures_failed > 0 or n_ingests_failed > 0:
1100 had_failure = True
1101 self.log.info("Ingested %d distinct Butler dataset%s", *_log_msg_counter(refs))
1103 if had_failure:
1104 raise RuntimeError("Some failures encountered during ingestion")
1106 return refs
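# The guarded sketch below (never executed on import) shows one way to drive
# RawIngestTask end to end. The repository path, transfer mode, and raw data
# directory are illustrative assumptions, not values taken from this module.
if __name__ == "__main__":
    def _report_metadata_failure(uri, exc):
        # Callback signature documented on RawIngestTask: URI, then exception.
        print(f"Could not translate metadata for {uri}: {exc}")

    example_butler = Butler("/repo/example", writeable=True)
    example_config = RawIngestConfig()
    example_config.transfer = "symlink"
    example_task = RawIngestTask(config=example_config, butler=example_butler,
                                 on_metadata_failure=_report_metadata_failure)
    # Directories are searched for FITS files using the default file_filter,
    # and the run collection defaults to the instrument's raw ingest run.
    example_refs = example_task.run(["/data/raws/night1"])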