Coverage for python/lsst/obs/base/ingest.py : 16%

# This file is part of obs_base.
#
# Developed for the LSST Data Management System.
# This product includes software developed by the LSST Project
# (https://www.lsst.org).
# See the COPYRIGHT file at the top-level directory of this distribution
# for details of code ownership.
#
# This program is free software: you can redistribute it and/or modify
# it under the terms of the GNU General Public License as published by
# the Free Software Foundation, either version 3 of the License, or
# (at your option) any later version.
#
# This program is distributed in the hope that it will be useful,
# but WITHOUT ANY WARRANTY; without even the implied warranty of
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
# GNU General Public License for more details.
#
# You should have received a copy of the GNU General Public License
# along with this program. If not, see <http://www.gnu.org/licenses/>.

__all__ = ("RawIngestTask", "RawIngestConfig", "makeTransferChoiceField")

import json
import re
from dataclasses import dataclass, InitVar
from typing import Callable, List, Iterator, Iterable, Tuple, Type, Optional, Any, Union
from collections import defaultdict
from multiprocessing import Pool

from astro_metadata_translator import ObservationInfo, merge_headers
from astro_metadata_translator.indexing import process_sidecar_data, process_index_data
from lsst.afw.fits import readMetadata
from lsst.daf.butler import (
    Butler,
    ButlerURI,
    CollectionType,
    DataCoordinate,
    DatasetRef,
    DatasetType,
    DimensionRecord,
    DimensionUniverse,
    FileDataset,
    Formatter,
)
from lsst.pex.config import Config, ChoiceField, Field
from lsst.pipe.base import Task, timeMethod

from ._instrument import Instrument, makeExposureRecordFromObsInfo
from ._fitsRawFormatterBase import FitsRawFormatterBase

def _do_nothing(*args, **kwargs) -> None:
    """Do nothing.

    This is a function that accepts anything and does nothing.
    For use as a default in callback arguments.
    """
    pass

def _log_msg_counter(noun: Union[int, Iterable]) -> Tuple[int, str]:
    """Count the iterable and return the count and plural modifier.

    Parameters
    ----------
    noun : Iterable or `int`
        Thing to count. If given an integer it is assumed to be the count
        to use to calculate the modifier.

    Returns
    -------
    num : `int`
        Number of items found in ``noun``.
    modifier : `str`
        Character to add to the end of a string referring to these items
        to indicate whether it was a single item or not. Returns empty
        string if there is one item or "s" otherwise.

    Examples
    --------
    .. code-block:: python

       log.warning("Found %d file%s", *_log_msg_counter(nfiles))
    """
    if isinstance(noun, int):
        num = noun
    else:
        num = len(noun)
    return num, "" if num == 1 else "s"

@dataclass
class RawFileDatasetInfo:
    """Information about a single dataset within a raw file."""

    dataId: DataCoordinate
    """Data ID for this file (`lsst.daf.butler.DataCoordinate`)."""

    obsInfo: ObservationInfo
    """Standardized observation metadata extracted directly from the file
    headers (`astro_metadata_translator.ObservationInfo`).
    """

@dataclass
class RawFileData:
    """Information about a single raw file, used during ingest."""

    datasets: List[RawFileDatasetInfo]
    """The information describing each dataset within this raw file
    (`list` of `RawFileDatasetInfo`).
    """

    filename: ButlerURI
    """URI of the file this information was extracted from (`ButlerURI`).

    This is the path prior to ingest, not the path after ingest.
    """

    FormatterClass: Type[FitsRawFormatterBase]
    """Formatter class that should be used to ingest this file (`type`; a
    subclass of `FitsRawFormatterBase`).
    """

    instrumentClass: Optional[Type[Instrument]]
    """The `Instrument` class associated with this file. Can be `None`
    if ``datasets`` is an empty list."""

@dataclass
class RawExposureData:
    """Information about a complete raw exposure, used during ingest."""

    dataId: DataCoordinate
    """Data ID for this exposure (`lsst.daf.butler.DataCoordinate`).
    """

    files: List[RawFileData]
    """List of structures containing file-level information.
    """

    universe: InitVar[DimensionUniverse]
    """Set of all known dimensions.
    """

    record: Optional[DimensionRecord] = None
    """The exposure `DimensionRecord` that must be inserted into the
    `~lsst.daf.butler.Registry` prior to file-level ingest (`DimensionRecord`).
    """

    def __post_init__(self, universe: DimensionUniverse):
        # We don't care which file or dataset we read metadata from, because
        # we're assuming they'll all be the same; just use the first ones.
        self.record = makeExposureRecordFromObsInfo(self.files[0].datasets[0].obsInfo, universe)

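
# Editor's note: the snippet below is an illustrative sketch added for this
# listing, not part of obs_base. It shows how the dataclasses above nest
# (RawExposureData -> RawFileData -> RawFileDatasetInfo) and that
# ``RawExposureData.record`` is filled automatically in ``__post_init__``.
# The helper name and the assumption that ``file_data`` comes from
# ``RawIngestTask.extractMetadata`` are hypothetical.
def _example_build_exposure_data(file_data: List[RawFileData],
                                 universe: DimensionUniverse) -> RawExposureData:
    exposure_data = RawExposureData(
        # Group on the exposure dimensions, as RawIngestTask.groupByExposure does.
        dataId=file_data[0].datasets[0].dataId.subset(universe["exposure"].graph),
        files=file_data,
        universe=universe,
    )
    # ``record`` was derived from the first dataset's ObservationInfo.
    assert exposure_data.record is not None
    return exposure_data
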
def makeTransferChoiceField(doc="How to transfer files (None for no transfer).", default="auto"):
    """Create a Config field with options for transferring data between repos.

    The allowed options for the field are exactly those supported by
    `lsst.daf.butler.Datastore.ingest`.

    Parameters
    ----------
    doc : `str`
        Documentation for the configuration field.
    default : `str`, optional
        Default transfer mode for the created field.

    Returns
    -------
    field : `lsst.pex.config.ChoiceField`
        Configuration field.
    """
    return ChoiceField(
        doc=doc,
        dtype=str,
        allowed={"move": "move",
                 "copy": "copy",
                 "auto": "choice will depend on datastore",
                 "direct": "use URI to ingested file directly in datastore",
                 "link": "hard link falling back to symbolic link",
                 "hardlink": "hard link",
                 "symlink": "symbolic (soft) link",
                 "relsymlink": "relative symbolic link",
                 },
        optional=True,
        default=default
    )

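
# Editor's note: illustrative sketch, not part of obs_base. Any Config can
# reuse makeTransferChoiceField to expose the same transfer options; the
# ``_ExampleTransferConfig`` class and its default are hypothetical.
class _ExampleTransferConfig(Config):
    transfer = makeTransferChoiceField(doc="Transfer mode used by the example ingest.",
                                       default="symlink")
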
class RawIngestConfig(Config):
    """Configuration class for RawIngestTask."""

    transfer = makeTransferChoiceField()
    failFast = Field(
        dtype=bool,
        default=False,
        doc="If True, stop ingest as soon as any problem is encountered with any file. "
            "Otherwise problem files will be skipped and logged and a report issued at completion.",
    )

class RawIngestTask(Task):
    """Driver Task for ingesting raw data into Gen3 Butler repositories.

    Parameters
    ----------
    config : `RawIngestConfig`
        Configuration for the task.
    butler : `~lsst.daf.butler.Butler`
        Writeable butler instance, with ``butler.run`` set to the appropriate
        `~lsst.daf.butler.CollectionType.RUN` collection for these raw
        datasets.
    on_success : `Callable`, optional
        A callback invoked when all of the raws associated with an exposure
        are ingested. Will be passed a list of `FileDataset` objects, each
        containing one or more resolved `DatasetRef` objects. If this callback
        raises it will interrupt the entire ingest process, even if
        `RawIngestConfig.failFast` is `False`.
    on_metadata_failure : `Callable`, optional
        A callback invoked when a failure occurs trying to translate the
        metadata for a file. Will be passed the URI and the exception, in
        that order, as positional arguments. Guaranteed to be called in an
        ``except`` block, allowing the callback to re-raise or replace (with
        ``raise ... from``) to override the task's usual error handling (before
        `RawIngestConfig.failFast` logic occurs).
    on_ingest_failure : `Callable`, optional
        A callback invoked when dimension record or dataset insertion into the
        database fails for an exposure. Will be passed a `RawExposureData`
        instance and the exception, in that order, as positional arguments.
        Guaranteed to be called in an ``except`` block, allowing the callback
        to re-raise or replace (with ``raise ... from``) to override the task's
        usual error handling (before `RawIngestConfig.failFast` logic occurs).
    **kwargs
        Additional keyword arguments are forwarded to the `lsst.pipe.base.Task`
        constructor.

    Notes
    -----
    Each instance of `RawIngestTask` writes to the same Butler. Each
    invocation of `RawIngestTask.run` ingests a list of files.
    """

    ConfigClass = RawIngestConfig

    _DefaultName = "ingest"

    def getDatasetType(self):
        """Return the DatasetType of the datasets ingested by this Task."""
        return DatasetType("raw", ("instrument", "detector", "exposure"), "Exposure",
                           universe=self.butler.registry.dimensions)

    def __init__(self, config: Optional[RawIngestConfig] = None, *, butler: Butler,
                 on_success: Callable[[List[FileDataset]], Any] = _do_nothing,
                 on_metadata_failure: Callable[[str, Exception], Any] = _do_nothing,
                 on_ingest_failure: Callable[[RawExposureData, Exception], Any] = _do_nothing,
                 **kwargs: Any):
        config.validate()  # Not a CmdlineTask nor PipelineTask, so have to validate the config here.
        super().__init__(config, **kwargs)
        self.butler = butler
        self.universe = self.butler.registry.dimensions
        self.datasetType = self.getDatasetType()
        self._on_success = on_success
        self._on_metadata_failure = on_metadata_failure
        self._on_ingest_failure = on_ingest_failure

        # Import all the instrument classes so that we ensure that we
        # have all the relevant metadata translators loaded.
        Instrument.importAll(self.butler.registry)

    def _reduce_kwargs(self):
        # Add extra parameters to pickle.
        return dict(**super()._reduce_kwargs(), butler=self.butler, on_success=self._on_success,
                    on_metadata_failure=self._on_metadata_failure,
                    on_ingest_failure=self._on_ingest_failure)

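
    # Editor's note: illustrative sketch, not part of obs_base. The callbacks
    # accepted by the constructor can be used to collect failures instead of
    # only logging them; ``failed`` is a hypothetical local list.
    @classmethod
    def _example_with_callbacks(cls, butler: Butler) -> "RawIngestTask":
        failed = []
        return cls(config=RawIngestConfig(), butler=butler,
                   on_metadata_failure=lambda uri, exc: failed.append((uri, exc)),
                   on_ingest_failure=lambda exposure, exc: failed.append((exposure, exc)))
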
    def _determine_instrument_formatter(self, dataId, filename):
        """Determine the instrument and formatter class.

        Parameters
        ----------
        dataId : `lsst.daf.butler.DataCoordinate`
            The dataId associated with this dataset.
        filename : `ButlerURI`
            URI of file used for error reporting.

        Returns
        -------
        instrument : `Instrument` or `None`
            Instance of the `Instrument` associated with this dataset. `None`
            indicates that the instrument could not be determined.
        formatterClass : `type`
            Class to be used as the formatter for this dataset.
        """
        # The data model currently assumes that whilst multiple datasets
        # can be associated with a single file, they must all share the
        # same formatter.
        try:
            instrument = Instrument.fromName(dataId["instrument"], self.butler.registry)
        except LookupError as e:
            self._on_metadata_failure(filename, e)
            self.log.warning("Instrument %s for file %s not known to registry",
                             dataId["instrument"], filename)
            if self.config.failFast:
                raise RuntimeError(f"Instrument {dataId['instrument']} for"
                                   f" file {filename} not known to registry") from e
            FormatterClass = Formatter
            # Indicate that we could not work out the instrument.
            instrument = None
        else:
            FormatterClass = instrument.getRawFormatter(dataId)
        return instrument, FormatterClass

    def extractMetadata(self, filename: ButlerURI) -> RawFileData:
        """Extract and process metadata from a single raw file.

        Parameters
        ----------
        filename : `ButlerURI`
            URI to the file.

        Returns
        -------
        data : `RawFileData`
            A structure containing the metadata extracted from the file,
            as well as the original filename. All fields will be populated,
            but the `RawFileData.dataId` attribute will be a minimal
            (unexpanded) `~lsst.daf.butler.DataCoordinate` instance. The
            ``instrumentClass`` field will be `None` if there is a problem
            with metadata extraction.

        Notes
        -----
        Assumes that there is a single dataset associated with the given
        file. Instruments using a single file to store multiple datasets
        must implement their own version of this method.

        By default the method will catch all exceptions unless the ``failFast``
        configuration item is `True`. If an error is encountered the
        `_on_metadata_failure()` method will be called. If that callback does
        not re-raise, the returned object will have a `None` instrument class
        and no datasets.

        This method supports sidecar JSON files which can be used to
        extract metadata without having to read the data file itself.
        The sidecar file is always used if found.
        """
        sidecar_fail_msg = ""  # Requires prepended space when set.
        try:
            sidecar_file = filename.updatedExtension(".json")
            if sidecar_file.exists():
                content = json.loads(sidecar_file.read())
                header = process_sidecar_data(content)
                sidecar_fail_msg = " (via sidecar)"
            else:
                # Read the metadata from the data file itself.
                # Manually merge the primary and "first data" headers here
                # because we do not know in general if an input file has
                # set INHERIT=T.
                # For remote files download the entire file to get the
                # header. This is very inefficient and it would be better
                # to have some way of knowing where in the file the headers
                # are and to only download those parts of the file.
                with filename.as_local() as local_file:
                    phdu = readMetadata(local_file.ospath, 0)
                    header = merge_headers([phdu, readMetadata(local_file.ospath)], mode="overwrite")
            datasets = [self._calculate_dataset_info(header, filename)]
        except Exception as e:
            self.log.debug("Problem extracting metadata from %s%s: %s", filename, sidecar_fail_msg, e)
            # Indicate to the caller that we failed to read.
            datasets = []
            formatterClass = Formatter
            instrument = None
            self._on_metadata_failure(filename, e)
            if self.config.failFast:
                raise RuntimeError("Problem extracting metadata for file "
                                   f"{filename}{sidecar_fail_msg}") from e
        else:
            self.log.debug("Extracted metadata for file %s%s", filename, sidecar_fail_msg)
            # The data model currently assumes that whilst multiple datasets
            # can be associated with a single file, they must all share the
            # same formatter.
            instrument, formatterClass = self._determine_instrument_formatter(datasets[0].dataId, filename)
            if instrument is None:
                datasets = []

        return RawFileData(datasets=datasets, filename=filename,
                           FormatterClass=formatterClass,
                           instrumentClass=instrument)

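    # Editor's note: illustrative sketch, not part of obs_base; the path is
    # hypothetical. If a JSON sidecar ``raw_0001.json`` sits next to
    # ``raw_0001.fits``, extractMetadata uses the sidecar and never reads the
    # FITS headers themselves.
    def _example_extract_with_sidecar(self) -> RawFileData:
        uri = ButlerURI("/data/raw_0001.fits")
        # Internally this looks for /data/raw_0001.json first (see above).
        return self.extractMetadata(uri)
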
    def _calculate_dataset_info(self, header, filename):
        """Calculate a RawFileDatasetInfo from the supplied information.

        Parameters
        ----------
        header : Mapping or `astro_metadata_translator.ObservationInfo`
            Header from the dataset or previously-translated content.
        filename : `ButlerURI`
            Filename to use for error messages.

        Returns
        -------
        dataset : `RawFileDatasetInfo`
            The data ID and observation information associated with this
            dataset.
        """
        # To ensure we aren't slowed down for no reason, explicitly
        # list here the properties we need for the schema.
        # Use a dict with boolean values, where True indicates that the
        # property is required to be present.
        ingest_subset = {
            "altaz_begin": False,
            "boresight_rotation_coord": False,
            "boresight_rotation_angle": False,
            "dark_time": False,
            "datetime_begin": True,
            "datetime_end": True,
            "detector_num": True,
            "exposure_group": False,
            "exposure_id": True,
            "exposure_time": True,
            "instrument": True,
            "tracking_radec": False,
            "object": False,
            "observation_counter": False,
            "observation_id": True,
            "observation_reason": False,
            "observation_type": True,
            "observing_day": False,
            "physical_filter": True,
            "science_program": False,
            "visit_id": False,
        }

        if isinstance(header, ObservationInfo):
            obsInfo = header
            missing = []
            # Need to check the required properties are present.
            for property, required in ingest_subset.items():
                if not required:
                    continue
                # getattr does not need to be protected because it is using
                # the defined list above containing properties that must exist.
                value = getattr(obsInfo, property)
                if value is None:
                    missing.append(property)
            if missing:
                raise ValueError(f"Requested required properties are missing from file {filename}:"
                                 f" {missing} (via JSON)")
        else:
            obsInfo = ObservationInfo(header, pedantic=False, filename=str(filename),
                                      required={k for k in ingest_subset if ingest_subset[k]},
                                      subset=set(ingest_subset))

        dataId = DataCoordinate.standardize(instrument=obsInfo.instrument,
                                            exposure=obsInfo.exposure_id,
                                            detector=obsInfo.detector_num,
                                            universe=self.universe)
        return RawFileDatasetInfo(obsInfo=obsInfo, dataId=dataId)

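    # Editor's note: illustrative sketch, not part of obs_base. The
    # ``required``/``subset`` arguments passed to ObservationInfo above are
    # derived from the ``ingest_subset`` dict like this: values flag which
    # properties are mandatory, keys list everything to translate.
    @staticmethod
    def _example_ingest_subset_split(ingest_subset: dict) -> Tuple[set, set]:
        required = {k for k, v in ingest_subset.items() if v}
        subset = set(ingest_subset)
        return required, subset
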
    def locateAndReadIndexFiles(self, files):
        """Given a list of files, look for index files and read them.

        Index files can either be explicitly in the list of files to
        ingest, or else located in the same directory as a file to ingest.
        Index entries are always used if present.

        Parameters
        ----------
        files : iterable over `ButlerURI`
            URIs to the files to be ingested.

        Returns
        -------
        index : `dict` [`str`, Any]
            Merged contents of all relevant index files found. These can
            be explicitly specified index files or ones found in the
            directory alongside a data file to be ingested.
        updated_files : iterable of `ButlerURI`
            Updated list of the input files with entries removed that were
            found listed in an index file. Order is not guaranteed to
            match the order of the files given to this routine.
        good_index_files : `set` [`ButlerURI`]
            Index files that were found, read successfully, and used.
        bad_index_files : `set` [`ButlerURI`]
            Files that looked like index files but failed to read properly.
        """
        # Convert the paths to absolute for easy comparison with index content.
        # Do not convert to real paths since we have to assume that index
        # files are in this location and not the location which it links to.
        files = tuple(f.abspath() for f in files)

        # Index files must be named this.
        index_root_file = "_index.json"

        # Group the files by directory.
        files_by_directory = defaultdict(set)

        for path in files:
            directory, file_in_dir = path.split()
            files_by_directory[directory].add(file_in_dir)

        # All the metadata read from index files with keys of full path.
        index_entries = {}

        # Index files we failed to read.
        bad_index_files = set()

        # Any good index files that were found and used.
        good_index_files = set()

        # Look for index files in those directories.
        for directory, files_in_directory in files_by_directory.items():
            possible_index_file = directory.join(index_root_file)
            if possible_index_file.exists():
                # If we are explicitly requesting an index file the
                # messages should be different.
                index_msg = "inferred"
                is_implied = True
                if index_root_file in files_in_directory:
                    index_msg = "explicit"
                    is_implied = False

                # Try to read the index file and catch and report any
                # problems.
                try:
                    content = json.loads(possible_index_file.read())
                    index = process_index_data(content, force_dict=True)
                except Exception as e:
                    # Only trigger the callback if the index file
                    # was asked for explicitly. Triggering on an implied file
                    # might be surprising.
                    if not is_implied:
                        self._on_metadata_failure(possible_index_file, e)
                    if self.config.failFast:
                        raise RuntimeError(f"Problem reading index file from {index_msg} "
                                           f"location {possible_index_file}") from e
                    bad_index_files.add(possible_index_file)
                    continue

                self.log.debug("Extracted index metadata from %s file %s", index_msg, possible_index_file)
                good_index_files.add(possible_index_file)

                # Go through the index adding entries for files.
                # If we have non-index files in this directory marked for
                # ingest we should only get index information for those.
                # If the index file was explicit we use all entries.
                if is_implied:
                    files_to_ingest = files_in_directory
                else:
                    files_to_ingest = set(index)

                # Copy relevant metadata into a single dict for all index
                # entries.
                for file_in_dir in files_to_ingest:
                    # Skip an explicitly specified index file.
                    # This should never happen because an explicit index
                    # file will force ingest of all files in the index
                    # and not use the explicit file list. If somehow
                    # this is not true we continue. Raising an exception
                    # seems like the wrong thing to do since this is harmless.
                    if file_in_dir == index_root_file:
                        self.log.info("Logic error found scanning directory %s. Please file ticket.",
                                      directory)
                        continue
                    if file_in_dir in index:
                        file = directory.join(file_in_dir)
                        if file in index_entries:
                            # An ObservationInfo entry overrides raw metadata.
                            if isinstance(index[file_in_dir], ObservationInfo) \
                                    and not isinstance(index_entries[file], ObservationInfo):
                                self.log.warning("File %s already specified in an index file but overriding"
                                                 " with ObservationInfo content from %s",
                                                 file, possible_index_file)
                            else:
                                self.log.warning("File %s already specified in an index file, "
                                                 "ignoring content from %s", file, possible_index_file)
                                # Do nothing in this case.
                                continue

                        index_entries[file] = index[file_in_dir]

        # Remove files from the list that have index entries and also
        # any files that we determined to be explicit index files
        # or any index files that we failed to read.
        filtered = set(files) - set(index_entries) - good_index_files - bad_index_files

        # The filtered set loses the initial order, so iterate over the
        # original tuple to retain it. Retaining the order is good for
        # testing but does have a cost if there are many files when copying
        # the good values out. A dict would have faster lookups (using the
        # files as keys) but use more memory.
        ordered = [f for f in files if f in filtered]

        return index_entries, ordered, good_index_files, bad_index_files

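    # Editor's note: illustrative sketch, not part of obs_base; the directory
    # layout is hypothetical. With /data/a.fits, /data/b.fits and an index
    # file /data/_index.json describing a.fits, only b.fits remains in the
    # returned file list and a.fits is served from the index.
    def _example_use_index_files(self):
        files = [ButlerURI("/data/a.fits"), ButlerURI("/data/b.fits")]
        index_entries, remaining, good_index, bad_index = self.locateAndReadIndexFiles(files)
        # a.fits is keyed in index_entries; b.fits still needs its metadata
        # read directly by extractMetadata.
        return self.processIndexEntries(index_entries), list(remaining)
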
    def processIndexEntries(self, index_entries):
        """Convert index entries to RawFileData.

        Parameters
        ----------
        index_entries : `dict` [`str`, Any]
            Dict indexed by name of file to ingest, with values that are
            either raw metadata or translated
            `~astro_metadata_translator.ObservationInfo`.

        Returns
        -------
        data : `list` [`RawFileData`]
            Structures containing the metadata extracted from each file,
            as well as the original filename. All fields will be populated,
            but the `RawFileData.dataId` attributes will be minimal
            (unexpanded) `~lsst.daf.butler.DataCoordinate` instances.
        """
        fileData = []
        for filename, metadata in index_entries.items():
            try:
                datasets = [self._calculate_dataset_info(metadata, filename)]
            except Exception as e:
                self.log.debug("Problem extracting metadata for file %s found in index file: %s",
                               filename, e)
                datasets = []
                formatterClass = Formatter
                instrument = None
                self._on_metadata_failure(filename, e)
                if self.config.failFast:
                    raise RuntimeError(f"Problem extracting metadata for file {filename} "
                                       "found in index file") from e
            else:
                instrument, formatterClass = self._determine_instrument_formatter(datasets[0].dataId,
                                                                                  filename)
                if instrument is None:
                    datasets = []
            fileData.append(RawFileData(datasets=datasets, filename=filename,
                                        FormatterClass=formatterClass, instrumentClass=instrument))
        return fileData

    def groupByExposure(self, files: Iterable[RawFileData]) -> List[RawExposureData]:
        """Group an iterable of `RawFileData` by exposure.

        Parameters
        ----------
        files : iterable of `RawFileData`
            File-level information to group.

        Returns
        -------
        exposures : `list` of `RawExposureData`
            A list of structures that group the file-level information by
            exposure. All fields will be populated. The
            `RawExposureData.dataId` attributes will be minimal (unexpanded)
            `~lsst.daf.butler.DataCoordinate` instances.
        """
        exposureDimensions = self.universe["exposure"].graph
        byExposure = defaultdict(list)
        for f in files:
            # Assume that the first dataset is representative for the file.
            byExposure[f.datasets[0].dataId.subset(exposureDimensions)].append(f)

        return [RawExposureData(dataId=dataId, files=exposureFiles, universe=self.universe)
                for dataId, exposureFiles in byExposure.items()]

    def expandDataIds(self, data: RawExposureData) -> RawExposureData:
        """Expand the data IDs associated with a raw exposure.

        This adds the metadata records.

        Parameters
        ----------
        data : `RawExposureData`
            A structure containing information about the exposure to be
            ingested. Must have `RawExposureData.record` populated. Should
            be considered consumed upon return.

        Returns
        -------
        exposure : `RawExposureData`
            An updated version of the input structure, with
            `RawExposureData.dataId` and nested `RawFileData.dataId` attributes
            updated to data IDs for which
            `~lsst.daf.butler.DataCoordinate.hasRecords` returns `True`.
        """
        # We start by expanding the exposure-level data ID; we won't use that
        # directly in file ingest, but this lets us do some database lookups
        # once per exposure instead of once per file later.
        data.dataId = self.butler.registry.expandDataId(
            data.dataId,
            # We pass in the records we'll be inserting shortly so they aren't
            # looked up from the database. We do expect instrument and filter
            # records to be retrieved from the database here (though the
            # Registry may cache them so there isn't a lookup every time).
            records={
                self.butler.registry.dimensions["exposure"]: data.record,
            }
        )
        # Now we expand the per-file (exposure+detector) data IDs. This time
        # we pass in the records we just retrieved from the exposure data ID
        # expansion.
        for file in data.files:
            for dataset in file.datasets:
                dataset.dataId = self.butler.registry.expandDataId(
                    dataset.dataId,
                    records=dict(data.dataId.records)
                )
        return data

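    # Editor's note: illustrative sketch, not part of obs_base. Expansion is
    # done once per exposure and reused for every file in it, so a typical
    # pattern combines groupByExposure with expandDataIds; ``file_data`` is
    # assumed to come from extractMetadata.
    def _example_group_and_expand(self, file_data: List[RawFileData]) -> List[RawExposureData]:
        return [self.expandDataIds(exposure) for exposure in self.groupByExposure(file_data)]
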
    def prep(self, files, *, pool: Optional[Pool] = None, processes: int = 1
             ) -> Tuple[Iterator[RawExposureData], List[str]]:
        """Perform all non-database-updating ingest preprocessing steps.

        Parameters
        ----------
        files : iterable over `str` or path-like objects
            Paths to the files to be ingested. Will be made absolute
            if they are not already.
        pool : `multiprocessing.Pool`, optional
            If not `None`, a process pool with which to parallelize some
            operations.
        processes : `int`, optional
            The number of processes to use. Ignored if ``pool`` is not `None`.

        Returns
        -------
        exposures : `Iterator` [ `RawExposureData` ]
            Data structures containing dimension records, filenames, and data
            IDs to be ingested (one structure for each exposure).
        bad_files : `list` of `str`
            List of all the files that could not have metadata extracted.
        """
        if pool is None and processes > 1:
            pool = Pool(processes)
        mapFunc = map if pool is None else pool.imap_unordered

        def _partition_good_bad(file_data: Iterable[RawFileData]) -> Tuple[List[RawFileData], List[str]]:
            """Filter out bad files and return good with list of bad."""
            good_files = []
            bad_files = []
            for fileDatum in file_data:
                if not fileDatum.datasets:
                    bad_files.append(fileDatum.filename)
                else:
                    good_files.append(fileDatum)
            return good_files, bad_files

        # Look for index files and read them.
        # There should be far fewer index files than data files.
        index_entries, files, good_index_files, bad_index_files = self.locateAndReadIndexFiles(files)
        if bad_index_files:
            self.log.info("Failed to read the following explicitly requested index files:")
            for bad in sorted(bad_index_files):
                self.log.info("- %s", bad)

        # Now convert all the index file entries to standard form for ingest.
        bad_index_file_data = []
        indexFileData = self.processIndexEntries(index_entries)
        if indexFileData:
            indexFileData, bad_index_file_data = _partition_good_bad(indexFileData)
            self.log.info("Successfully extracted metadata for %d file%s found in %d index file%s"
                          " with %d failure%s",
                          *_log_msg_counter(indexFileData),
                          *_log_msg_counter(good_index_files),
                          *_log_msg_counter(bad_index_file_data))

        # Extract metadata and build per-detector regions.
        # This could run in a subprocess so collect all output
        # before looking at failures.
        fileData: Iterator[RawFileData] = mapFunc(self.extractMetadata, files)

        # Filter out all the failed reads and store them for later
        # reporting.
        fileData, bad_files = _partition_good_bad(fileData)
        self.log.info("Successfully extracted metadata from %d file%s with %d failure%s",
                      *_log_msg_counter(fileData),
                      *_log_msg_counter(bad_files))

        # Combine with data from index files.
        fileData.extend(indexFileData)
        bad_files.extend(bad_index_file_data)
        bad_files.extend(bad_index_files)

        # Use that metadata to group files (and extracted metadata) by
        # exposure. Never parallelized because it's intrinsically a gather
        # step.
        exposureData: List[RawExposureData] = self.groupByExposure(fileData)

        # The next operation operates on RawExposureData instances (one at
        # a time) in-place and then returns the modified instance. We call it
        # as a pass-through instead of relying on the arguments we pass in to
        # have been modified because in the parallel case those arguments are
        # going to be pickled and unpickled, and I'm not certain
        # multiprocessing is careful enough with that for output arguments to
        # work.

        # Expand the data IDs to include all dimension metadata; we need this
        # because we may need to generate path templates that rely on that
        # metadata.
        # This is the first step that involves actual database calls (but just
        # SELECTs), so if there's going to be a problem with connections vs.
        # multiple processes, or lock contention (in SQLite) slowing things
        # down, it'll happen here.
        return mapFunc(self.expandDataIds, exposureData), bad_files

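    # Editor's note: illustrative sketch, not part of obs_base. Because prep()
    # never modifies the data repository, it can be used on its own as a
    # dry run to validate a set of raws; the process count is arbitrary.
    def _example_dry_run(self, files):
        exposures, bad_files = self.prep(files, processes=4)
        # Force the lazy iterator so every data ID is actually expanded.
        n_exposures = len(list(exposures))
        self.log.info("Would ingest %d exposure%s", *_log_msg_counter(n_exposures))
        return bad_files
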
    def ingestExposureDatasets(self, exposure: RawExposureData, *, run: Optional[str] = None
                               ) -> List[FileDataset]:
        """Ingest all raw files in one exposure.

        Parameters
        ----------
        exposure : `RawExposureData`
            A structure containing information about the exposure to be
            ingested. Must have `RawExposureData.record` populated and all
            data ID attributes expanded.
        run : `str`, optional
            Name of a RUN-type collection to write to, overriding
            ``self.butler.run``.

        Returns
        -------
        datasets : `list` of `lsst.daf.butler.FileDataset`
            Per-file structures identifying the files ingested and their
            dataset representation in the data repository.
        """
        datasets = [FileDataset(path=file.filename.abspath(),
                                refs=[DatasetRef(self.datasetType, d.dataId) for d in file.datasets],
                                formatter=file.FormatterClass)
                    for file in exposure.files]
        self.butler.ingest(*datasets, transfer=self.config.transfer, run=run)
        return datasets

    def ingestFiles(self, files, *, pool: Optional[Pool] = None, processes: int = 1,
                    run: Optional[str] = None):
        """Ingest files into a Butler data repository.

        This creates any new exposure or visit Dimension entries needed to
        identify the ingested files, creates new Dataset entries in the
        Registry and finally ingests the files themselves into the Datastore.
        Any needed instrument, detector, and physical_filter Dimension entries
        must exist in the Registry before this method is called.

        Parameters
        ----------
        files : iterable over `ButlerURI`
            URIs to the files to be ingested.
        pool : `multiprocessing.Pool`, optional
            If not `None`, a process pool with which to parallelize some
            operations.
        processes : `int`, optional
            The number of processes to use. Ignored if ``pool`` is not `None`.
        run : `str`, optional
            Name of a RUN-type collection to write to, overriding
            the default derived from the instrument name.

        Returns
        -------
        refs : `list` of `lsst.daf.butler.DatasetRef`
            Dataset references for ingested raws.
        bad_files : `list` of `ButlerURI`
            Files that could not be ingested because metadata extraction
            failed.
        n_exposures : `int`
            Number of exposures successfully ingested.
        n_exposures_failed : `int`
            Number of exposures whose dimension records could not be
            registered.
        n_ingests_failed : `int`
            Number of exposures for which the file-level ingest failed.
        """
        exposureData, bad_files = self.prep(files, pool=pool, processes=processes)

        # Up to this point, we haven't modified the data repository at all.
        # Now we finally do that, with one transaction per exposure. This is
        # not parallelized at present because the performance of this step is
        # limited by the database server. That may or may not change in the
        # future once we increase our usage of bulk inserts and reduce our
        # usage of savepoints; we've tried to get everything but the database
        # operations done in advance to reduce the time spent inside
        # transactions.
        self.butler.registry.registerDatasetType(self.datasetType)

        refs = []
        runs = set()
        n_exposures = 0
        n_exposures_failed = 0
        n_ingests_failed = 0
        for exposure in exposureData:
            self.log.debug("Attempting to ingest %d file%s from exposure %s:%s",
                           *_log_msg_counter(exposure.files),
                           exposure.record.instrument, exposure.record.obs_id)

            try:
                self.butler.registry.syncDimensionData("exposure", exposure.record)
            except Exception as e:
                self._on_ingest_failure(exposure, e)
                n_exposures_failed += 1
                self.log.warning("Exposure %s:%s could not be registered: %s",
                                 exposure.record.instrument, exposure.record.obs_id, e)
                if self.config.failFast:
                    raise e
                continue

            # Use the instrument's default run if none was specified explicitly.
            if run is None:
                instrumentClass = exposure.files[0].instrumentClass
                this_run = instrumentClass.makeDefaultRawIngestRunName()
            else:
                this_run = run
            if this_run not in runs:
                self.butler.registry.registerCollection(this_run, type=CollectionType.RUN)
                runs.add(this_run)
            try:
                with self.butler.transaction():
                    datasets_for_exposure = self.ingestExposureDatasets(exposure, run=this_run)
            except Exception as e:
                self._on_ingest_failure(exposure, e)
                n_ingests_failed += 1
                self.log.warning("Failed to ingest the following for reason: %s", e)
                for f in exposure.files:
                    self.log.warning("- %s", f.filename)
                if self.config.failFast:
                    raise e
                continue
            else:
                self._on_success(datasets_for_exposure)
                for dataset in datasets_for_exposure:
                    refs.extend(dataset.refs)

            # Success for this exposure.
            n_exposures += 1
            self.log.info("Exposure %s:%s ingested successfully",
                          exposure.record.instrument, exposure.record.obs_id)

        return refs, bad_files, n_exposures, n_exposures_failed, n_ingests_failed

    @timeMethod
    def run(self, files, *, pool: Optional[Pool] = None, processes: int = 1, run: Optional[str] = None,
            file_filter: Union[str, re.Pattern] = r"\.fit[s]?\b", group_files: bool = True):
        """Ingest files into a Butler data repository.

        This creates any new exposure or visit Dimension entries needed to
        identify the ingested files, creates new Dataset entries in the
        Registry and finally ingests the files themselves into the Datastore.
        Any needed instrument, detector, and physical_filter Dimension entries
        must exist in the Registry before `run` is called.

        Parameters
        ----------
        files : iterable over `ButlerURI`, `str` or path-like objects
            Paths to the files to be ingested. Can refer to directories.
            Will be made absolute if they are not already.
        pool : `multiprocessing.Pool`, optional
            If not `None`, a process pool with which to parallelize some
            operations.
        processes : `int`, optional
            The number of processes to use. Ignored if ``pool`` is not `None`.
        run : `str`, optional
            Name of a RUN-type collection to write to, overriding
            the default derived from the instrument name.
        file_filter : `str` or `re.Pattern`, optional
            Pattern to use to discover files to ingest within directories.
            The default is to search for FITS files. The regex applies to
            files within the directory.
        group_files : `bool`, optional
            Group files by directory if they have been discovered in
            directories. Will not affect files explicitly provided.

        Returns
        -------
        refs : `list` of `lsst.daf.butler.DatasetRef`
            Dataset references for ingested raws.

        Notes
        -----
        This method inserts all datasets for an exposure within a transaction,
        guaranteeing that partial exposures are never ingested. The exposure
        dimension record is inserted with `Registry.syncDimensionData` first
        (in its own transaction), which inserts only if a record with the same
        primary key does not already exist. This allows different files within
        the same exposure to be ingested in different runs.
        """
        refs = []
        bad_files = []
        n_exposures = 0
        n_exposures_failed = 0
        n_ingests_failed = 0
        if group_files:
            for group in ButlerURI.findFileResources(files, file_filter, group_files):
                new_refs, bad, n_exp, n_exp_fail, n_ingest_fail = self.ingestFiles(group, pool=pool,
                                                                                   processes=processes,
                                                                                   run=run)
                refs.extend(new_refs)
                bad_files.extend(bad)
                n_exposures += n_exp
                n_exposures_failed += n_exp_fail
                n_ingests_failed += n_ingest_fail
        else:
            refs, bad_files, n_exposures, n_exposures_failed, n_ingests_failed = self.ingestFiles(
                ButlerURI.findFileResources(files, file_filter, group_files),
                pool=pool,
                processes=processes,
                run=run,
            )

        had_failure = False

        if bad_files:
            had_failure = True
            self.log.warning("Could not extract observation metadata from the following:")
            for f in bad_files:
                self.log.warning("- %s", f)

        self.log.info("Successfully processed data from %d exposure%s with %d failure%s from exposure"
                      " registration and %d failure%s from file ingest.",
                      *_log_msg_counter(n_exposures),
                      *_log_msg_counter(n_exposures_failed),
                      *_log_msg_counter(n_ingests_failed))
        if n_exposures_failed > 0 or n_ingests_failed > 0:
            had_failure = True
        self.log.info("Ingested %d distinct Butler dataset%s", *_log_msg_counter(refs))

        if had_failure:
            raise RuntimeError("Some failures encountered during ingestion")

        return refs
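
# Editor's note: illustrative sketch, not part of obs_base; the repository
# path and data directory are hypothetical. Typical interactive use of the
# task mirrors what the ``butler ingest-raws`` command does.
def _example_ingest_raws():
    butler = Butler("/repo/example", writeable=True)
    config = RawIngestConfig()
    config.transfer = "symlink"
    task = RawIngestTask(config=config, butler=butler)
    # Directories are searched for FITS files; the RUN collection defaults to
    # the instrument's raw run unless ``run=`` is passed.
    return task.run(["/data/raws/night1/"])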