Coverage for python/lsst/obs/base/ingest.py : 16%

# This file is part of obs_base.
#
# Developed for the LSST Data Management System.
# This product includes software developed by the LSST Project
# (https://www.lsst.org).
# See the COPYRIGHT file at the top-level directory of this distribution
# for details of code ownership.
#
# This program is free software: you can redistribute it and/or modify
# it under the terms of the GNU General Public License as published by
# the Free Software Foundation, either version 3 of the License, or
# (at your option) any later version.
#
# This program is distributed in the hope that it will be useful,
# but WITHOUT ANY WARRANTY; without even the implied warranty of
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
# GNU General Public License for more details.
#
# You should have received a copy of the GNU General Public License
# along with this program. If not, see <http://www.gnu.org/licenses/>.

__all__ = ("RawIngestTask", "RawIngestConfig", "makeTransferChoiceField")

import json
import re
from dataclasses import dataclass, InitVar
from typing import Callable, List, Iterator, Iterable, Tuple, Type, Optional, Any, Union
from collections import defaultdict
from multiprocessing import Pool

from astro_metadata_translator import ObservationInfo, merge_headers
from astro_metadata_translator.indexing import process_sidecar_data, process_index_data
from lsst.afw.fits import readMetadata
from lsst.daf.butler import (
    Butler,
    ButlerURI,
    CollectionType,
    DataCoordinate,
    DatasetRef,
    DatasetType,
    DimensionRecord,
    DimensionUniverse,
    FileDataset,
    Formatter,
    Progress,
)
from lsst.pex.config import Config, ChoiceField, Field
from lsst.pipe.base import Task, timeMethod

from ._instrument import Instrument, makeExposureRecordFromObsInfo
from ._fitsRawFormatterBase import FitsRawFormatterBase


def _do_nothing(*args, **kwargs) -> None:
    """Do nothing.

    This is a function that accepts anything and does nothing.
    For use as a default in callback arguments.
    """
    pass


def _log_msg_counter(noun: Union[int, Iterable]) -> Tuple[int, str]:
    """Count the iterable and return the count and plural modifier.

    Parameters
    ----------
    noun : `Iterable` or `int`
        Thing to count. If given an integer it is assumed to be the count
        to use to calculate the modifier.

    Returns
    -------
    num : `int`
        Number of items found in ``noun``.
    modifier : `str`
        Character to add to the end of a string referring to these items
        to indicate whether it was a single item or not. Returns the empty
        string if there is one item, or "s" otherwise.

    Examples
    --------
    .. code-block:: python

        log.warning("Found %d file%s", *_log_msg_counter(nfiles))
    """
    if isinstance(noun, int):
        num = noun
    else:
        num = len(noun)
    return num, "" if num == 1 else "s"


@dataclass
class RawFileDatasetInfo:
    """Information about a single dataset within a raw file."""

    dataId: DataCoordinate
    """Data ID for this file (`lsst.daf.butler.DataCoordinate`)."""

    obsInfo: ObservationInfo
    """Standardized observation metadata extracted directly from the file
    headers (`astro_metadata_translator.ObservationInfo`).
    """


@dataclass
class RawFileData:
    """Information about a single raw file, used during ingest."""

    datasets: List[RawFileDatasetInfo]
    """The information describing each dataset within this raw file
    (`list` of `RawFileDatasetInfo`).
    """

    filename: ButlerURI
    """URI of the file this information was extracted from
    (`lsst.daf.butler.ButlerURI`).

    This is the path prior to ingest, not the path after ingest.
    """

    FormatterClass: Type[FitsRawFormatterBase]
    """Formatter class that should be used to ingest this file (`type`; a
    subclass of `FitsRawFormatterBase`).
    """

    instrumentClass: Optional[Type[Instrument]]
    """The `Instrument` class associated with this file. Can be `None`
    if ``datasets`` is an empty list.
    """


@dataclass
class RawExposureData:
    """Information about a complete raw exposure, used during ingest."""

    dataId: DataCoordinate
    """Data ID for this exposure (`lsst.daf.butler.DataCoordinate`).
    """

    files: List[RawFileData]
    """List of structures containing file-level information.
    """

    universe: InitVar[DimensionUniverse]
    """Set of all known dimensions.
    """

    record: Optional[DimensionRecord] = None
    """The exposure `DimensionRecord` that must be inserted into the
    `~lsst.daf.butler.Registry` prior to file-level ingest (`DimensionRecord`).
    """

    def __post_init__(self, universe: DimensionUniverse):
        # We don't care which file or dataset we read metadata from, because
        # we're assuming they'll all be the same; just use the first ones.
        self.record = makeExposureRecordFromObsInfo(self.files[0].datasets[0].obsInfo, universe)


def makeTransferChoiceField(doc="How to transfer files (None for no transfer).", default="auto"):
    """Create a Config field with options for transferring data between repos.

    The allowed options for the field are exactly those supported by
    `lsst.daf.butler.Datastore.ingest`.

    Parameters
    ----------
    doc : `str`
        Documentation for the configuration field.
    default : `str`, optional
        Default transfer mode for the created field.

    Returns
    -------
    field : `lsst.pex.config.ChoiceField`
        Configuration field.
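
    Examples
    --------
    A minimal sketch of how the returned field might be used in a config
    class; the class name and default shown here are illustrative only.

    .. code-block:: python

        from lsst.pex.config import Config

        class MyIngestConfig(Config):
            transfer = makeTransferChoiceField(default="symlink")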
    """
    return ChoiceField(
        doc=doc,
        dtype=str,
        allowed={"move": "move",
                 "copy": "copy",
                 "auto": "choice will depend on datastore",
                 "direct": "use URI to ingested file directly in datastore",
                 "link": "hard link falling back to symbolic link",
                 "hardlink": "hard link",
                 "symlink": "symbolic (soft) link",
                 "relsymlink": "relative symbolic link",
                 },
        optional=True,
        default=default
    )


class RawIngestConfig(Config):
    """Configuration class for RawIngestTask."""

    transfer = makeTransferChoiceField()
    failFast = Field(
        dtype=bool,
        default=False,
        doc="If True, stop ingest as soon as any problem is encountered with any file. "
            "Otherwise problem files will be skipped and logged and a report issued at completion.",
    )


class RawIngestTask(Task):
    """Driver Task for ingesting raw data into Gen3 Butler repositories.

    Parameters
    ----------
    config : `RawIngestConfig`
        Configuration for the task.
    butler : `~lsst.daf.butler.Butler`
        Writeable butler instance, with ``butler.run`` set to the appropriate
        `~lsst.daf.butler.CollectionType.RUN` collection for these raw
        datasets.
    on_success : `Callable`, optional
        A callback invoked when all of the raws associated with an exposure
        are ingested. Will be passed a list of `FileDataset` objects, each
        containing one or more resolved `DatasetRef` objects. If this callback
        raises it will interrupt the entire ingest process, even if
        `RawIngestConfig.failFast` is `False`.
    on_metadata_failure : `Callable`, optional
        A callback invoked when a failure occurs trying to translate the
        metadata for a file. Will be passed the URI and the exception, in
        that order, as positional arguments. Guaranteed to be called in an
        ``except`` block, allowing the callback to re-raise or replace (with
        ``raise ... from``) to override the task's usual error handling (before
        `RawIngestConfig.failFast` logic occurs).
    on_ingest_failure : `Callable`, optional
        A callback invoked when dimension record or dataset insertion into the
        database fails for an exposure. Will be passed a `RawExposureData`
        instance and the exception, in that order, as positional arguments.
        Guaranteed to be called in an ``except`` block, allowing the callback
        to re-raise or replace (with ``raise ... from``) to override the task's
        usual error handling (before `RawIngestConfig.failFast` logic occurs).
    **kwargs
        Additional keyword arguments are forwarded to the `lsst.pipe.base.Task`
        constructor.

    Notes
    -----
    Each instance of `RawIngestTask` writes to the same Butler. Each
    invocation of `RawIngestTask.run` ingests a list of files.
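
    Examples
    --------
    A minimal usage sketch; the repository path and the list of files to
    ingest are placeholders.

    .. code-block:: python

        from lsst.daf.butler import Butler

        butler = Butler("/path/to/repo", writeable=True)
        task = RawIngestTask(config=RawIngestConfig(), butler=butler)
        refs = task.run(["/path/to/raw/night1/"])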
    """

    ConfigClass = RawIngestConfig

    _DefaultName = "ingest"

    def getDatasetType(self):
        """Return the DatasetType of the datasets ingested by this Task."""
        return DatasetType("raw", ("instrument", "detector", "exposure"), "Exposure",
                           universe=self.butler.registry.dimensions)

    def __init__(self, config: Optional[RawIngestConfig] = None, *, butler: Butler,
                 on_success: Callable[[List[FileDataset]], Any] = _do_nothing,
                 on_metadata_failure: Callable[[str, Exception], Any] = _do_nothing,
                 on_ingest_failure: Callable[[RawExposureData, Exception], Any] = _do_nothing,
                 **kwargs: Any):
        config.validate()  # Not a CmdlineTask nor PipelineTask, so have to validate the config here.
        super().__init__(config, **kwargs)
        self.butler = butler
        self.universe = self.butler.registry.dimensions
        self.datasetType = self.getDatasetType()
        self._on_success = on_success
        self._on_metadata_failure = on_metadata_failure
        self._on_ingest_failure = on_ingest_failure
        self.progress = Progress("obs.base.RawIngestTask")

        # Import all the instrument classes so that we ensure that we
        # have all the relevant metadata translators loaded.
        Instrument.importAll(self.butler.registry)

    def _reduce_kwargs(self):
        # Add extra parameters to pickle.
        return dict(**super()._reduce_kwargs(), butler=self.butler, on_success=self._on_success,
                    on_metadata_failure=self._on_metadata_failure, on_ingest_failure=self._on_ingest_failure)

    def _determine_instrument_formatter(self, dataId, filename):
        """Determine the instrument and formatter class.

        Parameters
        ----------
        dataId : `lsst.daf.butler.DataCoordinate`
            The dataId associated with this dataset.
        filename : `ButlerURI`
            URI of file used for error reporting.

        Returns
        -------
        instrument : `Instrument` or `None`
            Instance of the `Instrument` associated with this dataset. `None`
            indicates that the instrument could not be determined.
        formatterClass : `type`
            Class to be used as the formatter for this dataset.
        """
        # The data model currently assumes that whilst multiple datasets
        # can be associated with a single file, they must all share the
        # same formatter.
        try:
            instrument = Instrument.fromName(dataId["instrument"], self.butler.registry)
        except LookupError as e:
            self._on_metadata_failure(filename, e)
            self.log.warning("Instrument %s for file %s not known to registry",
                             dataId["instrument"], filename)
            if self.config.failFast:
                raise RuntimeError(f"Instrument {dataId['instrument']} for"
                                   f" file {filename} not known to registry") from e
            FormatterClass = Formatter
            # Indicate that we could not work out the instrument.
            instrument = None
        else:
            FormatterClass = instrument.getRawFormatter(dataId)
        return instrument, FormatterClass

    def extractMetadata(self, filename: ButlerURI) -> RawFileData:
        """Extract and process metadata from a single raw file.

        Parameters
        ----------
        filename : `ButlerURI`
            URI to the file.

        Returns
        -------
        data : `RawFileData`
            A structure containing the metadata extracted from the file,
            as well as the original filename. All fields will be populated,
            but the `RawFileData.dataId` attribute will be a minimal
            (unexpanded) `~lsst.daf.butler.DataCoordinate` instance. The
            ``instrumentClass`` field will be `None` if there is a problem
            with metadata extraction.

        Notes
        -----
        Assumes that there is a single dataset associated with the given
        file. Instruments using a single file to store multiple datasets
        must implement their own version of this method.

        By default the method will catch all exceptions unless the ``failFast``
        configuration item is `True`. If an error is encountered the
        `_on_metadata_failure()` method will be called. If the exception is
        not propagated, the returned object will have a null instrument class
        and no datasets.

        This method supports sidecar JSON files, which can be used to
        extract metadata without having to read the data file itself.
        The sidecar file is always used if found.
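
        Examples
        --------
        A sketch assuming ``task`` is a constructed `RawIngestTask`; the file
        name is a placeholder. If a ``raw_0001.json`` sidecar exists alongside
        the FITS file it is read instead of the FITS headers.

        .. code-block:: python

            data = task.extractMetadata(ButlerURI("raw_0001.fits"))
            if not data.datasets:
                print("Metadata extraction failed for", data.filename)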
        """
        sidecar_fail_msg = ""  # Requires prepended space when set.
        try:
            sidecar_file = filename.updatedExtension(".json")
            if sidecar_file.exists():
                content = json.loads(sidecar_file.read())
                header = process_sidecar_data(content)
                sidecar_fail_msg = " (via sidecar)"
            else:
                # Read the metadata from the data file itself.
                # Manually merge the primary and "first data" headers here
                # because we do not know in general if an input file has
                # set INHERIT=T.
                # For remote files download the entire file to get the
                # header. This is very inefficient and it would be better
                # to have some way of knowing where in the file the headers
                # are and to only download those parts of the file.
                with filename.as_local() as local_file:
                    phdu = readMetadata(local_file.ospath, 0)
                    header = merge_headers([phdu, readMetadata(local_file.ospath)], mode="overwrite")
            datasets = [self._calculate_dataset_info(header, filename)]
        except Exception as e:
            self.log.debug("Problem extracting metadata from %s%s: %s", filename, sidecar_fail_msg, e)
            # Indicate to the caller that we failed to read.
            datasets = []
            formatterClass = Formatter
            instrument = None
            self._on_metadata_failure(filename, e)
            if self.config.failFast:
                raise RuntimeError("Problem extracting metadata for file "
                                   f"{filename}{sidecar_fail_msg}") from e
        else:
            self.log.debug("Extracted metadata for file %s%s", filename, sidecar_fail_msg)
            # The data model currently assumes that whilst multiple datasets
            # can be associated with a single file, they must all share the
            # same formatter.
            instrument, formatterClass = self._determine_instrument_formatter(datasets[0].dataId, filename)
            if instrument is None:
                datasets = []

        return RawFileData(datasets=datasets, filename=filename,
                           FormatterClass=formatterClass,
                           instrumentClass=instrument)

    def _calculate_dataset_info(self, header, filename):
        """Calculate a RawFileDatasetInfo from the supplied information.

        Parameters
        ----------
        header : Mapping or `astro_metadata_translator.ObservationInfo`
            Header from the dataset or previously-translated content.
        filename : `ButlerURI`
            Filename to use for error messages.

        Returns
        -------
        dataset : `RawFileDatasetInfo`
            The dataId and observation information associated with this
            dataset.
        """
        # To ensure we aren't slowed down for no reason, explicitly
        # list here the properties we need for the schema.
        # Use a dict with values a boolean where True indicates
        # that it is required that we calculate this property.
        ingest_subset = {
            "altaz_begin": False,
            "boresight_rotation_coord": False,
            "boresight_rotation_angle": False,
            "dark_time": False,
            "datetime_begin": True,
            "datetime_end": True,
            "detector_num": True,
            "exposure_group": False,
            "exposure_id": True,
            "exposure_time": True,
            "instrument": True,
            "tracking_radec": False,
            "object": False,
            "observation_counter": False,
            "observation_id": True,
            "observation_reason": False,
            "observation_type": True,
            "observing_day": False,
            "physical_filter": True,
            "science_program": False,
            "visit_id": False,
        }

        if isinstance(header, ObservationInfo):
            obsInfo = header
            missing = []
            # Need to check the required properties are present.
            for property, required in ingest_subset.items():
                if not required:
                    continue
                # getattr does not need to be protected because it is using
                # the defined list above containing properties that must exist.
                value = getattr(obsInfo, property)
                if value is None:
                    missing.append(property)
            if missing:
                raise ValueError(f"Requested required properties are missing from file {filename}:"
                                 f" {missing} (via JSON)")

        else:
            obsInfo = ObservationInfo(header, pedantic=False, filename=str(filename),
                                      required={k for k in ingest_subset if ingest_subset[k]},
                                      subset=set(ingest_subset))

        dataId = DataCoordinate.standardize(instrument=obsInfo.instrument,
                                            exposure=obsInfo.exposure_id,
                                            detector=obsInfo.detector_num,
                                            universe=self.universe)
        return RawFileDatasetInfo(obsInfo=obsInfo, dataId=dataId)

    def locateAndReadIndexFiles(self, files):
        """Given a list of files, look for index files and read them.

        Index files can either be explicitly in the list of files to
        ingest, or else located in the same directory as a file to ingest.
        Index entries are always used if present.

        Parameters
        ----------
        files : iterable over `ButlerURI`
            URIs to the files to be ingested.

        Returns
        -------
        index : `dict` [`ButlerURI`, Any]
            Merged contents of all relevant index files found. These can
            be explicitly specified index files or ones found in the
            directory alongside a data file to be ingested.
        updated_files : `list` of `ButlerURI`
            Updated list of the input files with entries removed that were
            found listed in an index file. Order is not guaranteed to
            match the order of the files given to this routine.
        good_index_files : `set` [`ButlerURI`]
            Index files that were successfully read and used.
        bad_index_files : `set` [`ButlerURI`]
            Files that looked like index files but failed to read properly.
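
        Examples
        --------
        A sketch assuming ``task`` is a constructed `RawIngestTask`; the
        paths are placeholders. Any ``_index.json`` sitting alongside the
        named file would be read automatically.

        .. code-block:: python

            uris = [ButlerURI("/data/night1/raw_0001.fits")]
            index, remaining, good, bad = task.locateAndReadIndexFiles(uris)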
        """
        # Convert the paths to absolute for easy comparison with index content.
        # Do not convert to real paths since we have to assume that index
        # files are in this location and not the location which it links to.
        files = tuple(f.abspath() for f in files)

        # Index files must be named this.
        index_root_file = "_index.json"

        # Group the files by directory.
        files_by_directory = defaultdict(set)

        for path in files:
            directory, file_in_dir = path.split()
            files_by_directory[directory].add(file_in_dir)

        # All the metadata read from index files with keys of full path.
        index_entries = {}

        # Index files we failed to read.
        bad_index_files = set()

        # Any good index files that were found and used.
        good_index_files = set()

        # Look for index files in those directories.
        for directory, files_in_directory in files_by_directory.items():
            possible_index_file = directory.join(index_root_file)
            if possible_index_file.exists():
                # If we are explicitly requesting an index file the
                # messages should be different.
                index_msg = "inferred"
                is_implied = True
                if index_root_file in files_in_directory:
                    index_msg = "explicit"
                    is_implied = False

                # Try to read the index file and catch and report any
                # problems.
                try:
                    content = json.loads(possible_index_file.read())
                    index = process_index_data(content, force_dict=True)
                except Exception as e:
                    # Only trigger the callback if the index file
                    # was asked for explicitly. Triggering on implied file
                    # might be surprising.
                    if not is_implied:
                        self._on_metadata_failure(possible_index_file, e)
                    if self.config.failFast:
                        raise RuntimeError(f"Problem reading index file from {index_msg} "
                                           f"location {possible_index_file}") from e
                    bad_index_files.add(possible_index_file)
                    continue

                self.log.debug("Extracted index metadata from %s file %s", index_msg, possible_index_file)
                good_index_files.add(possible_index_file)

                # Go through the index adding entries for files.
                # If we have non-index files in this directory marked for
                # ingest we should only get index information for those.
                # If the index file was explicit we use all entries.
                if is_implied:
                    files_to_ingest = files_in_directory
                else:
                    files_to_ingest = set(index)

                # Copy relevant metadata into a single dict for all index
                # entries.
                for file_in_dir in files_to_ingest:
                    # Skip an explicitly specified index file.
                    # This should never happen because an explicit index
                    # file will force ingest of all files in the index
                    # and not use the explicit file list. If somehow
                    # this is not true we continue. Raising an exception
                    # seems like the wrong thing to do since this is harmless.
                    if file_in_dir == index_root_file:
                        self.log.info("Logic error found scanning directory %s. Please file ticket.",
                                      directory)
                        continue
                    if file_in_dir in index:
                        file = directory.join(file_in_dir)
                        if file in index_entries:
                            # ObservationInfo overrides raw metadata.
                            if isinstance(index[file_in_dir], ObservationInfo) \
                                    and not isinstance(index_entries[file], ObservationInfo):
                                self.log.warning("File %s already specified in an index file but overriding"
                                                 " with ObservationInfo content from %s",
                                                 file, possible_index_file)
                            else:
                                self.log.warning("File %s already specified in an index file, "
                                                 "ignoring content from %s", file, possible_index_file)
                                # Do nothing in this case.
                                continue

                        index_entries[file] = index[file_in_dir]

        # Remove files from list that have index entries and also
        # any files that we determined to be explicit index files
        # or any index files that we failed to read.
        filtered = set(files) - set(index_entries) - good_index_files - bad_index_files

        # The filtered list loses the initial order. Retaining the order
        # is good for testing but does have a cost if there are many
        # files when copying the good values out. A dict would have faster
        # lookups (using the files as keys) but use more memory.
        ordered = [f for f in filtered if f in files]

        return index_entries, ordered, good_index_files, bad_index_files

    def processIndexEntries(self, index_entries):
        """Convert index entries to RawFileData.

        Parameters
        ----------
        index_entries : `dict` [`ButlerURI`, Any]
            Dict indexed by the name of the file to ingest, with values of
            either raw metadata or a translated
            `~astro_metadata_translator.ObservationInfo`.

        Returns
        -------
        data : `list` of `RawFileData`
            Structures containing the metadata extracted from each file,
            as well as the original filenames. All fields will be populated,
            but the `RawFileData.dataId` attributes will be minimal
            (unexpanded) `~lsst.daf.butler.DataCoordinate` instances.
        """
        fileData = []
        for filename, metadata in index_entries.items():
            try:
                datasets = [self._calculate_dataset_info(metadata, filename)]
            except Exception as e:
                self.log.debug("Problem extracting metadata for file %s found in index file: %s",
                               filename, e)
                datasets = []
                formatterClass = Formatter
                instrument = None
                self._on_metadata_failure(filename, e)
                if self.config.failFast:
                    raise RuntimeError(f"Problem extracting metadata for file {filename} "
                                       "found in index file") from e
            else:
                instrument, formatterClass = self._determine_instrument_formatter(datasets[0].dataId,
                                                                                  filename)
                if instrument is None:
                    datasets = []
            fileData.append(RawFileData(datasets=datasets, filename=filename,
                                        FormatterClass=formatterClass, instrumentClass=instrument))
        return fileData

    def groupByExposure(self, files: Iterable[RawFileData]) -> List[RawExposureData]:
        """Group an iterable of `RawFileData` by exposure.

        Parameters
        ----------
        files : iterable of `RawFileData`
            File-level information to group.

        Returns
        -------
        exposures : `list` of `RawExposureData`
            A list of structures that group the file-level information by
            exposure. All fields will be populated. The
            `RawExposureData.dataId` attributes will be minimal (unexpanded)
            `~lsst.daf.butler.DataCoordinate` instances.
        """
        exposureDimensions = self.universe["exposure"].graph
        byExposure = defaultdict(list)
        for f in files:
            # Assume that the first dataset is representative for the file.
            byExposure[f.datasets[0].dataId.subset(exposureDimensions)].append(f)

        return [RawExposureData(dataId=dataId, files=exposureFiles, universe=self.universe)
                for dataId, exposureFiles in byExposure.items()]

    def expandDataIds(self, data: RawExposureData) -> RawExposureData:
        """Expand the data IDs associated with a raw exposure.

        This adds the metadata records.

        Parameters
        ----------
        data : `RawExposureData`
            A structure containing information about the exposure to be
            ingested. Must have `RawExposureData.record` populated. Should
            be considered consumed upon return.

        Returns
        -------
        exposure : `RawExposureData`
            An updated version of the input structure, with
            `RawExposureData.dataId` and nested `RawFileData.dataId` attributes
            updated to data IDs for which
            `~lsst.daf.butler.DataCoordinate.hasRecords` returns `True`.
        """
        # We start by expanding the exposure-level data ID; we won't use that
        # directly in file ingest, but this lets us do some database lookups
        # once per exposure instead of once per file later.
        data.dataId = self.butler.registry.expandDataId(
            data.dataId,
            # We pass in the records we'll be inserting shortly so they aren't
            # looked up from the database. We do expect instrument and filter
            # records to be retrieved from the database here (though the
            # Registry may cache them so there isn't a lookup every time).
            records={
                self.butler.registry.dimensions["exposure"]: data.record,
            }
        )
        # Now we expand the per-file (exposure+detector) data IDs. This time
        # we pass in the records we just retrieved from the exposure data ID
        # expansion.
        for file in data.files:
            for dataset in file.datasets:
                dataset.dataId = self.butler.registry.expandDataId(
                    dataset.dataId,
                    records=dict(data.dataId.records)
                )
        return data

    def prep(self, files, *, pool: Optional[Pool] = None, processes: int = 1
             ) -> Tuple[Iterator[RawExposureData], List[str]]:
        """Perform all non-database-updating ingest preprocessing steps.

        Parameters
        ----------
        files : iterable over `ButlerURI`
            Paths to the files to be ingested. Will be made absolute
            if they are not already.
        pool : `multiprocessing.Pool`, optional
            If not `None`, a process pool with which to parallelize some
            operations.
        processes : `int`, optional
            The number of processes to use. Ignored if ``pool`` is not `None`.

        Returns
        -------
        exposures : `Iterator` [ `RawExposureData` ]
            Data structures containing dimension records, filenames, and data
            IDs to be ingested (one structure for each exposure).
        bad_files : `list` of `str`
            List of all the files that could not have metadata extracted.
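
        Examples
        --------
        A sketch assuming ``task`` is a constructed `RawIngestTask` and
        ``uris`` is a list of `ButlerURI` objects pointing at raw files.

        .. code-block:: python

            exposures, bad_files = task.prep(uris)
            for exposure in exposures:
                print(exposure.dataId, len(exposure.files))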
        """
        if pool is None and processes > 1:
            pool = Pool(processes)
        mapFunc = map if pool is None else pool.imap_unordered

        def _partition_good_bad(file_data: Iterable[RawFileData]) -> Tuple[List[RawFileData], List[str]]:
            """Filter out bad files and return good with list of bad."""
            good_files = []
            bad_files = []
            for fileDatum in self.progress.wrap(file_data, desc="Reading image metadata", total=len(files)):
                if not fileDatum.datasets:
                    bad_files.append(fileDatum.filename)
                else:
                    good_files.append(fileDatum)
            return good_files, bad_files

        # Look for index files and read them.
        # There should be far fewer index files than data files.
        index_entries, files, good_index_files, bad_index_files = self.locateAndReadIndexFiles(files)
        if bad_index_files:
            self.log.info("Failed to read the following explicitly requested index files:")
            for bad in sorted(bad_index_files):
                self.log.info("- %s", bad)

        # Now convert all the index file entries to standard form for ingest.
        bad_index_file_data = []
        indexFileData = self.processIndexEntries(index_entries)
        if indexFileData:
            indexFileData, bad_index_file_data = _partition_good_bad(indexFileData)
            self.log.info("Successfully extracted metadata for %d file%s found in %d index file%s"
                          " with %d failure%s",
                          *_log_msg_counter(indexFileData),
                          *_log_msg_counter(good_index_files),
                          *_log_msg_counter(bad_index_file_data))

        # Extract metadata and build per-detector regions.
        # This could run in a subprocess so collect all output
        # before looking at failures.
        fileData: Iterator[RawFileData] = mapFunc(self.extractMetadata, files)

        # Filter out all the failed reads and store them for later
        # reporting.
        fileData, bad_files = _partition_good_bad(fileData)
        self.log.info("Successfully extracted metadata from %d file%s with %d failure%s",
                      *_log_msg_counter(fileData),
                      *_log_msg_counter(bad_files))

        # Combine with data from index files.
        fileData.extend(indexFileData)
        bad_files.extend(bad_index_file_data)
        bad_files.extend(bad_index_files)

        # Use that metadata to group files (and extracted metadata) by
        # exposure. Never parallelized because it's intrinsically a gather
        # step.
        exposureData: List[RawExposureData] = self.groupByExposure(fileData)

        # The next operation operates on RawExposureData instances (one at
        # a time) in-place and then returns the modified instance. We call it
        # as a pass-through instead of relying on the arguments we pass in to
        # have been modified because in the parallel case those arguments are
        # going to be pickled and unpickled, and I'm not certain
        # multiprocessing is careful enough with that for output arguments to
        # work.

        # Expand the data IDs to include all dimension metadata; we need this
        # because we may need to generate path templates that rely on that
        # metadata.
        # This is the first step that involves actual database calls (but just
        # SELECTs), so if there's going to be a problem with connections vs.
        # multiple processes, or lock contention (in SQLite) slowing things
        # down, it'll happen here.
        return mapFunc(self.expandDataIds, exposureData), bad_files

    def ingestExposureDatasets(self, exposure: RawExposureData, *, run: Optional[str] = None
                               ) -> List[FileDataset]:
        """Ingest all raw files in one exposure.

        Parameters
        ----------
        exposure : `RawExposureData`
            A structure containing information about the exposure to be
            ingested. Must have `RawExposureData.record` populated and all
            data ID attributes expanded.
        run : `str`, optional
            Name of a RUN-type collection to write to, overriding
            ``self.butler.run``.

        Returns
        -------
        datasets : `list` of `lsst.daf.butler.FileDataset`
            Per-file structures identifying the files ingested and their
            dataset representation in the data repository.
        """
        datasets = [FileDataset(path=file.filename.abspath(),
                                refs=[DatasetRef(self.datasetType, d.dataId) for d in file.datasets],
                                formatter=file.FormatterClass)
                    for file in exposure.files]
        self.butler.ingest(*datasets, transfer=self.config.transfer, run=run)
        return datasets

    def ingestFiles(self, files, *, pool: Optional[Pool] = None, processes: int = 1,
                    run: Optional[str] = None):
        """Ingest files into a Butler data repository.

        This creates any new exposure or visit Dimension entries needed to
        identify the ingested files, creates new Dataset entries in the
        Registry and finally ingests the files themselves into the Datastore.
        Any needed instrument, detector, and physical_filter Dimension entries
        must exist in the Registry before `run` is called.

        Parameters
        ----------
        files : iterable over `ButlerURI`
            URIs to the files to be ingested.
        pool : `multiprocessing.Pool`, optional
            If not `None`, a process pool with which to parallelize some
            operations.
        processes : `int`, optional
            The number of processes to use. Ignored if ``pool`` is not `None`.
        run : `str`, optional
            Name of a RUN-type collection to write to, overriding
            the default derived from the instrument name.

        Returns
        -------
        refs : `list` of `lsst.daf.butler.DatasetRef`
            Dataset references for ingested raws.
        bad_files : `list` of `ButlerURI`
            Files that could not have metadata extracted.
        n_exposures : `int`
            Number of exposures ingested successfully.
        n_exposures_failed : `int`
            Number of exposures whose dimension record could not be registered.
        n_ingests_failed : `int`
            Number of exposures whose raw datasets could not be ingested.
        """
        exposureData, bad_files = self.prep(files, pool=pool, processes=processes)

        # Up to this point, we haven't modified the data repository at all.
        # Now we finally do that, with one transaction per exposure. This is
        # not parallelized at present because the performance of this step is
        # limited by the database server. That may or may not change in the
        # future once we increase our usage of bulk inserts and reduce our
        # usage of savepoints; we've tried to get everything but the database
        # operations done in advance to reduce the time spent inside
        # transactions.
        self.butler.registry.registerDatasetType(self.datasetType)

        refs = []
        runs = set()
        n_exposures = 0
        n_exposures_failed = 0
        n_ingests_failed = 0
        for exposure in self.progress.wrap(exposureData, desc="Ingesting raw exposures"):
            self.log.debug("Attempting to ingest %d file%s from exposure %s:%s",
                           *_log_msg_counter(exposure.files),
                           exposure.record.instrument, exposure.record.obs_id)

            try:
                self.butler.registry.syncDimensionData("exposure", exposure.record)
            except Exception as e:
                self._on_ingest_failure(exposure, e)
                n_exposures_failed += 1
                self.log.warning("Exposure %s:%s could not be registered: %s",
                                 exposure.record.instrument, exposure.record.obs_id, e)
                if self.config.failFast:
                    raise e
                continue

            # Use the default run for the instrument if none was specified
            # explicitly.
            if run is None:
                instrumentClass = exposure.files[0].instrumentClass
                this_run = instrumentClass.makeDefaultRawIngestRunName()
            else:
                this_run = run
            if this_run not in runs:
                self.butler.registry.registerCollection(this_run, type=CollectionType.RUN)
                runs.add(this_run)
            try:
                with self.butler.transaction():
                    datasets_for_exposure = self.ingestExposureDatasets(exposure, run=this_run)
            except Exception as e:
                self._on_ingest_failure(exposure, e)
                n_ingests_failed += 1
                self.log.warning("Failed to ingest the following for reason: %s", e)
                for f in exposure.files:
                    self.log.warning("- %s", f.filename)
                if self.config.failFast:
                    raise e
                continue
            else:
                self._on_success(datasets_for_exposure)
                for dataset in datasets_for_exposure:
                    refs.extend(dataset.refs)

            # Success for this exposure.
            n_exposures += 1
            self.log.info("Exposure %s:%s ingested successfully",
                          exposure.record.instrument, exposure.record.obs_id)

        return refs, bad_files, n_exposures, n_exposures_failed, n_ingests_failed

    @timeMethod
    def run(self, files, *, pool: Optional[Pool] = None, processes: int = 1, run: Optional[str] = None,
            file_filter: Union[str, re.Pattern] = r"\.fit[s]?\b", group_files: bool = True):
        """Ingest files into a Butler data repository.

        This creates any new exposure or visit Dimension entries needed to
        identify the ingested files, creates new Dataset entries in the
        Registry and finally ingests the files themselves into the Datastore.
        Any needed instrument, detector, and physical_filter Dimension entries
        must exist in the Registry before `run` is called.

        Parameters
        ----------
        files : iterable over `ButlerURI`, `str` or path-like objects
            Paths to the files to be ingested. Can refer to directories.
            Will be made absolute if they are not already.
        pool : `multiprocessing.Pool`, optional
            If not `None`, a process pool with which to parallelize some
            operations.
        processes : `int`, optional
            The number of processes to use. Ignored if ``pool`` is not `None`.
        run : `str`, optional
            Name of a RUN-type collection to write to, overriding
            the default derived from the instrument name.
        file_filter : `str` or `re.Pattern`, optional
            Pattern to use to discover files to ingest within directories.
            The default is to search for FITS files. The regex applies to
            files within the directory.
        group_files : `bool`, optional
            Group files by directory if they have been discovered in
            directories. Will not affect files explicitly provided.

        Returns
        -------
        refs : `list` of `lsst.daf.butler.DatasetRef`
            Dataset references for ingested raws.

        Notes
        -----
        This method inserts all datasets for an exposure within a transaction,
        guaranteeing that partial exposures are never ingested. The exposure
        dimension record is inserted with `Registry.syncDimensionData` first
        (in its own transaction), which inserts only if a record with the same
        primary key does not already exist. This allows different files within
        the same exposure to be ingested in different runs.
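
        Examples
        --------
        A sketch of a typical invocation; the paths are placeholders.

        .. code-block:: python

            refs = task.run(["/data/raw/night1/", "/data/raw/extra_file.fits"],
                            processes=4)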
        """
        refs = []
        bad_files = []
        n_exposures = 0
        n_exposures_failed = 0
        n_ingests_failed = 0
        if group_files:
            for group in ButlerURI.findFileResources(files, file_filter, group_files):
                new_refs, bad, n_exp, n_exp_fail, n_ingest_fail = self.ingestFiles(group, pool=pool,
                                                                                   processes=processes,
                                                                                   run=run)
                refs.extend(new_refs)
                bad_files.extend(bad)
                n_exposures += n_exp
                n_exposures_failed += n_exp_fail
                n_ingests_failed += n_ingest_fail
        else:
            refs, bad_files, n_exposures, n_exposures_failed, n_ingests_failed = self.ingestFiles(
                ButlerURI.findFileResources(files, file_filter, group_files),
                pool=pool,
                processes=processes,
                run=run,
            )

        had_failure = False

        if bad_files:
            had_failure = True
            self.log.warning("Could not extract observation metadata from the following:")
            for f in bad_files:
                self.log.warning("- %s", f)

        self.log.info("Successfully processed data from %d exposure%s with %d failure%s from exposure"
                      " registration and %d failure%s from file ingest.",
                      *_log_msg_counter(n_exposures),
                      *_log_msg_counter(n_exposures_failed),
                      *_log_msg_counter(n_ingests_failed))
        if n_exposures_failed > 0 or n_ingests_failed > 0:
            had_failure = True
        self.log.info("Ingested %d distinct Butler dataset%s", *_log_msg_counter(refs))

        if had_failure:
            raise RuntimeError("Some failures encountered during ingestion")

        return refs