Coverage for python/lsst/obs/base/ingest.py : 15%

# This file is part of obs_base.
#
# Developed for the LSST Data Management System.
# This product includes software developed by the LSST Project
# (https://www.lsst.org).
# See the COPYRIGHT file at the top-level directory of this distribution
# for details of code ownership.
#
# This program is free software: you can redistribute it and/or modify
# it under the terms of the GNU General Public License as published by
# the Free Software Foundation, either version 3 of the License, or
# (at your option) any later version.
#
# This program is distributed in the hope that it will be useful,
# but WITHOUT ANY WARRANTY; without even the implied warranty of
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
# GNU General Public License for more details.
#
# You should have received a copy of the GNU General Public License
# along with this program.  If not, see <http://www.gnu.org/licenses/>.

__all__ = ("RawIngestTask", "RawIngestConfig", "makeTransferChoiceField")

import json
import re
from dataclasses import dataclass, InitVar
from typing import Callable, List, Iterator, Iterable, Tuple, Type, Optional, Any, Union
from collections import defaultdict
from multiprocessing import Pool

from astro_metadata_translator import ObservationInfo, merge_headers, MetadataTranslator
from astro_metadata_translator.indexing import process_sidecar_data, process_index_data
from lsst.afw.fits import readMetadata
from lsst.daf.butler import (
    Butler,
    ButlerURI,
    CollectionType,
    DataCoordinate,
    DatasetRef,
    DatasetType,
    DimensionRecord,
    DimensionUniverse,
    FileDataset,
    Formatter,
    Progress,
)
from lsst.pex.config import Config, ChoiceField, Field
from lsst.pipe.base import Task, timeMethod

from ._instrument import Instrument, makeExposureRecordFromObsInfo
from ._fitsRawFormatterBase import FitsRawFormatterBase


def _do_nothing(*args, **kwargs) -> None:
    """Do nothing.

    This is a function that accepts anything and does nothing.
    For use as a default in callback arguments.
    """
    pass


def _log_msg_counter(noun: Union[int, Iterable]) -> Tuple[int, str]:
    """Count the iterable and return the count and plural modifier.

    Parameters
    ----------
    noun : Iterable or `int`
        Thing to count. If given an integer it is assumed to be the count
        to use to calculate the modifier.

    Returns
    -------
    num : `int`
        Number of items found in ``noun``.
    modifier : `str`
        Character to add to the end of a string referring to these items
        to indicate whether it was a single item or not. Returns an empty
        string if there is one item or "s" otherwise.

    Examples
    --------
    .. code-block:: python

        log.warning("Found %d file%s", *_log_msg_counter(nfiles))
    """
    if isinstance(noun, int):
        num = noun
    else:
        num = len(noun)
    return num, "" if num == 1 else "s"


@dataclass
class RawFileDatasetInfo:
    """Information about a single dataset within a raw file."""

    dataId: DataCoordinate
    """Data ID for this file (`lsst.daf.butler.DataCoordinate`)."""

    obsInfo: ObservationInfo
    """Standardized observation metadata extracted directly from the file
    headers (`astro_metadata_translator.ObservationInfo`).
    """


@dataclass
class RawFileData:
    """Information about a single raw file, used during ingest."""

    datasets: List[RawFileDatasetInfo]
    """The information describing each dataset within this raw file
    (`list` of `RawFileDatasetInfo`).
    """

    filename: ButlerURI
    """URI of the file this information was extracted from
    (`lsst.daf.butler.ButlerURI`).

    This is the path prior to ingest, not the path after ingest.
    """

    FormatterClass: Type[FitsRawFormatterBase]
    """Formatter class that should be used to ingest this file (`type`; as
    subclass of `FitsRawFormatterBase`).
    """

    instrument: Optional[Instrument]
    """The `Instrument` instance associated with this file. Can be `None`
    if ``datasets`` is an empty list.
    """


@dataclass
class RawExposureData:
    """Information about a complete raw exposure, used during ingest."""

    dataId: DataCoordinate
    """Data ID for this exposure (`lsst.daf.butler.DataCoordinate`).
    """

    files: List[RawFileData]
    """List of structures containing file-level information.
    """

    universe: InitVar[DimensionUniverse]
    """Set of all known dimensions.
    """

    record: Optional[DimensionRecord] = None
    """The exposure `DimensionRecord` that must be inserted into the
    `~lsst.daf.butler.Registry` prior to file-level ingest (`DimensionRecord`).
    """

    def __post_init__(self, universe: DimensionUniverse):
        # We don't care which file or dataset we read metadata from, because
        # we're assuming they'll all be the same; just use the first ones.
        self.record = makeExposureRecordFromObsInfo(self.files[0].datasets[0].obsInfo, universe)


def makeTransferChoiceField(doc="How to transfer files (None for no transfer).", default="auto"):
    """Create a Config field with options for transferring data between repos.

    The allowed options for the field are exactly those supported by
    `lsst.daf.butler.Datastore.ingest`.

    Parameters
    ----------
    doc : `str`
        Documentation for the configuration field.
    default : `str`, optional
        Default transfer mode for the field.

    Returns
    -------
    field : `lsst.pex.config.ChoiceField`
        Configuration field.
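
    Examples
    --------
    A minimal sketch of how this field is typically used inside a
    `lsst.pex.config.Config` subclass; the ``MyIngestConfig`` name is
    purely illustrative:

    .. code-block:: python

        from lsst.pex.config import Config

        class MyIngestConfig(Config):
            # Field accepting the transfer modes listed in ``allowed``.
            transfer = makeTransferChoiceField(default="symlink")

        config = MyIngestConfig()
        config.transfer = "copy"  # any key from the allowed dict, or None
        config.validate()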
    """
    return ChoiceField(
        doc=doc,
        dtype=str,
        allowed={"move": "move",
                 "copy": "copy",
                 "auto": "choice will depend on datastore",
                 "direct": "use URI to ingested file directly in datastore",
                 "link": "hard link falling back to symbolic link",
                 "hardlink": "hard link",
                 "symlink": "symbolic (soft) link",
                 "relsymlink": "relative symbolic link",
                 },
        optional=True,
        default=default
    )


class RawIngestConfig(Config):
    """Configuration class for RawIngestTask."""

    transfer = makeTransferChoiceField()
    failFast = Field(
        dtype=bool,
        default=False,
        doc="If True, stop ingest as soon as any problem is encountered with any file. "
            "Otherwise problem files will be skipped and logged, and a report issued at completion.",
    )


class RawIngestTask(Task):
    """Driver Task for ingesting raw data into Gen3 Butler repositories.

    Parameters
    ----------
    config : `RawIngestConfig`
        Configuration for the task.
    butler : `~lsst.daf.butler.Butler`
        Writeable butler instance, with ``butler.run`` set to the appropriate
        `~lsst.daf.butler.CollectionType.RUN` collection for these raw
        datasets.
    on_success : `Callable`, optional
        A callback invoked when all of the raws associated with an exposure
        are ingested. Will be passed a list of `FileDataset` objects, each
        containing one or more resolved `DatasetRef` objects. If this callback
        raises it will interrupt the entire ingest process, even if
        `RawIngestConfig.failFast` is `False`.
    on_metadata_failure : `Callable`, optional
        A callback invoked when a failure occurs trying to translate the
        metadata for a file. Will be passed the URI and the exception, in
        that order, as positional arguments. Guaranteed to be called in an
        ``except`` block, allowing the callback to re-raise or replace (with
        ``raise ... from``) to override the task's usual error handling (before
        `RawIngestConfig.failFast` logic occurs).
    on_ingest_failure : `Callable`, optional
        A callback invoked when dimension record or dataset insertion into the
        database fails for an exposure. Will be passed a `RawExposureData`
        instance and the exception, in that order, as positional arguments.
        Guaranteed to be called in an ``except`` block, allowing the callback
        to re-raise or replace (with ``raise ... from``) to override the task's
        usual error handling (before `RawIngestConfig.failFast` logic occurs).
    **kwargs
        Additional keyword arguments are forwarded to the `lsst.pipe.base.Task`
        constructor.

    Notes
    -----
    Each instance of `RawIngestTask` writes to the same Butler. Each
    invocation of `RawIngestTask.run` ingests a list of files.
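
    Examples
    --------
    A minimal sketch of constructing and running the task, assuming an
    existing writeable repository at the illustrative path ``/repo`` and a
    hypothetical ``on_bad_file`` callback:

    .. code-block:: python

        from lsst.daf.butler import Butler
        from lsst.obs.base import RawIngestConfig, RawIngestTask

        def on_bad_file(uri, exc):
            # Called inside an ``except`` block; re-raise to abort ingest.
            print(f"Could not translate metadata for {uri}: {exc}")

        butler = Butler("/repo", writeable=True)
        config = RawIngestConfig()
        config.transfer = "symlink"
        task = RawIngestTask(config=config, butler=butler,
                             on_metadata_failure=on_bad_file)
        refs = task.run(["/data/raw/night1/"])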
    """

    ConfigClass = RawIngestConfig

    _DefaultName = "ingest"

    def getDatasetType(self):
        """Return the DatasetType of the datasets ingested by this Task."""
        return DatasetType("raw", ("instrument", "detector", "exposure"), "Exposure",
                           universe=self.butler.registry.dimensions)

    def __init__(self, config: Optional[RawIngestConfig] = None, *, butler: Butler,
                 on_success: Callable[[List[FileDataset]], Any] = _do_nothing,
                 on_metadata_failure: Callable[[str, Exception], Any] = _do_nothing,
                 on_ingest_failure: Callable[[RawExposureData, Exception], Any] = _do_nothing,
                 **kwargs: Any):
        config.validate()  # Not a CmdlineTask nor PipelineTask, so have to validate the config here.
        super().__init__(config, **kwargs)
        self.butler = butler
        self.universe = self.butler.registry.dimensions
        self.datasetType = self.getDatasetType()
        self._on_success = on_success
        self._on_metadata_failure = on_metadata_failure
        self._on_ingest_failure = on_ingest_failure
        self.progress = Progress("obs.base.RawIngestTask")

        # Import all the instrument classes so that we ensure that we
        # have all the relevant metadata translators loaded.
        Instrument.importAll(self.butler.registry)

    def _reduce_kwargs(self):
        # Add extra parameters to pickle.
        return dict(**super()._reduce_kwargs(), butler=self.butler, on_success=self._on_success,
                    on_metadata_failure=self._on_metadata_failure,
                    on_ingest_failure=self._on_ingest_failure)

    def _determine_instrument_formatter(self, dataId, filename):
        """Determine the instrument and formatter class.

        Parameters
        ----------
        dataId : `lsst.daf.butler.DataCoordinate`
            The dataId associated with this dataset.
        filename : `ButlerURI`
            URI of file used for error reporting.

        Returns
        -------
        instrument : `Instrument` or `None`
            Instance of the `Instrument` associated with this dataset. `None`
            indicates that the instrument could not be determined.
        formatterClass : `type`
            Class to be used as the formatter for this dataset.
        """
        # The data model currently assumes that whilst multiple datasets
        # can be associated with a single file, they must all share the
        # same formatter.
        try:
            instrument = Instrument.fromName(dataId["instrument"], self.butler.registry)
        except LookupError as e:
            self._on_metadata_failure(filename, e)
            self.log.warning("Instrument %s for file %s not known to registry",
                             dataId["instrument"], filename)
            if self.config.failFast:
                raise RuntimeError(f"Instrument {dataId['instrument']} for"
                                   f" file {filename} not known to registry") from e
            FormatterClass = Formatter
            # Indicate that we could not work out the instrument.
            instrument = None
        else:
            FormatterClass = instrument.getRawFormatter(dataId)
        return instrument, FormatterClass

    def extractMetadata(self, filename: ButlerURI) -> RawFileData:
        """Extract and process metadata from a single raw file.

        Parameters
        ----------
        filename : `ButlerURI`
            URI to the file.

        Returns
        -------
        data : `RawFileData`
            A structure containing the metadata extracted from the file,
            as well as the original filename. All fields will be populated,
            but the `RawFileData.dataId` attribute will be a minimal
            (unexpanded) `~lsst.daf.butler.DataCoordinate` instance. The
            ``instrument`` field will be `None` if there is a problem
            with metadata extraction.

        Notes
        -----
        Assumes that there is a single dataset associated with the given
        file. Instruments using a single file to store multiple datasets
        must implement their own version of this method.

        By default the method will catch all exceptions unless the ``failFast``
        configuration item is `True`. If an error is encountered the
        `_on_metadata_failure()` method will be called. If the error is
        caught rather than re-raised, the returned object will have a `None`
        instrument and an empty list of datasets.

        This method supports sidecar JSON files, which can be used to
        extract metadata without having to read the data file itself.
        The sidecar file is always used if found.
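
        Examples
        --------
        A minimal sketch of calling this method directly on one file, where
        ``task`` is an already-constructed `RawIngestTask` and the path is
        purely illustrative:

        .. code-block:: python

            from lsst.daf.butler import ButlerURI

            uri = ButlerURI("/data/raw/night1/exposure_0001.fits")
            data = task.extractMetadata(uri)
            if data.datasets:
                print(data.datasets[0].dataId)
            else:
                print(f"Metadata extraction failed for {uri}")

        If ``/data/raw/night1/exposure_0001.json`` existed it would be read
        instead of the FITS headers.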
        """
        sidecar_fail_msg = ""  # Requires prepended space when set.
        try:
            sidecar_file = filename.updatedExtension(".json")
            if sidecar_file.exists():
                content = json.loads(sidecar_file.read())
                headers = [process_sidecar_data(content)]
                sidecar_fail_msg = " (via sidecar)"
            else:
                # Read the metadata from the data file itself.

                # For remote files download the entire file to get the
                # header. This is very inefficient and it would be better
                # to have some way of knowing where in the file the headers
                # are and to only download those parts of the file.
                with filename.as_local() as local_file:
                    # Read the primary. This might be sufficient.
                    header = readMetadata(local_file.ospath, 0)

                    try:
                        # Try to work out a translator class early.
                        translator_class = MetadataTranslator.determine_translator(
                            header, filename=filename)
                    except ValueError:
                        # Primary header was not sufficient (maybe this file
                        # has been compressed or is a MEF with minimal
                        # primary). Read second header and merge with primary.
                        header = merge_headers([header, readMetadata(local_file.ospath, 1)],
                                               mode="overwrite")

                        # Try again to work out a translator class, letting this
                        # fail.
                        translator_class = MetadataTranslator.determine_translator(
                            header, filename=filename)

                # Request the headers to use for ingest
                headers = translator_class.determine_translatable_headers(filename.ospath, header)

            # Add each header to the dataset list
            datasets = [self._calculate_dataset_info(h, filename) for h in headers]

        except Exception as e:
            self.log.debug("Problem extracting metadata from %s%s: %s", filename, sidecar_fail_msg, e)
            # Indicate to the caller that we failed to read.
            datasets = []
            formatterClass = Formatter
            instrument = None
            self._on_metadata_failure(filename, e)
            if self.config.failFast:
                raise RuntimeError("Problem extracting metadata for file "
                                   f"{filename}{sidecar_fail_msg}") from e
        else:
            self.log.debug("Extracted metadata for file %s%s", filename, sidecar_fail_msg)
            # The data model currently assumes that whilst multiple datasets
            # can be associated with a single file, they must all share the
            # same formatter.
            instrument, formatterClass = self._determine_instrument_formatter(datasets[0].dataId, filename)
            if instrument is None:
                datasets = []

        return RawFileData(datasets=datasets, filename=filename,
                           FormatterClass=formatterClass,
                           instrument=instrument)

    def _calculate_dataset_info(self, header, filename):
        """Calculate a RawFileDatasetInfo from the supplied information.

        Parameters
        ----------
        header : Mapping or `astro_metadata_translator.ObservationInfo`
            Header from the dataset or previously-translated content.
        filename : `ButlerURI`
            Filename to use for error messages.

        Returns
        -------
        dataset : `RawFileDatasetInfo`
            The data ID and observation information associated with this
            dataset.
        """
        # To ensure we aren't slowed down for no reason, explicitly
        # list here the properties we need for the schema.
        # Use a dict with values a boolean where True indicates
        # that it is required that we calculate this property.
        ingest_subset = {
            "altaz_begin": False,
            "boresight_rotation_coord": False,
            "boresight_rotation_angle": False,
            "dark_time": False,
            "datetime_begin": True,
            "datetime_end": True,
            "detector_num": True,
            "exposure_group": False,
            "exposure_id": True,
            "exposure_time": True,
            "instrument": True,
            "tracking_radec": False,
            "object": False,
            "observation_counter": False,
            "observation_id": True,
            "observation_reason": False,
            "observation_type": True,
            "observing_day": False,
            "physical_filter": True,
            "science_program": False,
            "visit_id": False,
        }

        if isinstance(header, ObservationInfo):
            obsInfo = header
            missing = []
            # Need to check the required properties are present.
            for property, required in ingest_subset.items():
                if not required:
                    continue
                # getattr does not need to be protected because it is using
                # the defined list above containing properties that must exist.
                value = getattr(obsInfo, property)
                if value is None:
                    missing.append(property)
            if missing:
                raise ValueError(f"Requested required properties are missing from file {filename}:"
                                 f" {missing} (via JSON)")
        else:
            obsInfo = ObservationInfo(header, pedantic=False, filename=str(filename),
                                      required={k for k in ingest_subset if ingest_subset[k]},
                                      subset=set(ingest_subset))

        dataId = DataCoordinate.standardize(instrument=obsInfo.instrument,
                                            exposure=obsInfo.exposure_id,
                                            detector=obsInfo.detector_num,
                                            universe=self.universe)
        return RawFileDatasetInfo(obsInfo=obsInfo, dataId=dataId)

    def locateAndReadIndexFiles(self, files):
        """Given a list of files, look for index files and read them.

        Index files can either be explicitly in the list of files to
        ingest, or else located in the same directory as a file to ingest.
        Index entries are always used if present.

        Parameters
        ----------
        files : iterable over `ButlerURI`
            URIs to the files to be ingested.

        Returns
        -------
        index : `dict` [`ButlerURI`, Any]
            Merged contents of all relevant index files found, keyed by the
            full path of each data file. These can be explicitly specified
            index files or ones found in the directory alongside a data file
            to be ingested.
        updated_files : `list` of `ButlerURI`
            Updated list of the input files with entries removed that were
            found listed in an index file. Order is not guaranteed to
            match the order of the files given to this routine.
        good_index_files : `set` [`ButlerURI`]
            Index files that were successfully read and used.
        bad_index_files : `set` [`ButlerURI`]
            Files that looked like index files but failed to read properly.
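
        Examples
        --------
        A minimal sketch of unpacking the four return values, where ``task``
        is an already-constructed `RawIngestTask` and the paths are purely
        illustrative:

        .. code-block:: python

            from lsst.daf.butler import ButlerURI

            uris = [ButlerURI("/data/raw/night1/exposure_0001.fits"),
                    ButlerURI("/data/raw/night1/_index.json")]
            index, remaining, good, bad = task.locateAndReadIndexFiles(uris)
            # ``index`` maps each file covered by an index to its metadata,
            # ``remaining`` holds files that still need their headers read,
            # and ``good``/``bad`` record which index files were usable.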
        """
        # Convert the paths to absolute for easy comparison with index content.
        # Do not convert to real paths since we have to assume that index
        # files are in this location and not the location which it links to.
        files = tuple(f.abspath() for f in files)

        # Index files must be named this.
        index_root_file = "_index.json"

        # Group the files by directory.
        files_by_directory = defaultdict(set)

        for path in files:
            directory, file_in_dir = path.split()
            files_by_directory[directory].add(file_in_dir)

        # All the metadata read from index files with keys of full path.
        index_entries = {}

        # Index files we failed to read.
        bad_index_files = set()

        # Any good index files that were found and used.
        good_index_files = set()

        # Look for index files in those directories.
        for directory, files_in_directory in files_by_directory.items():
            possible_index_file = directory.join(index_root_file)
            if possible_index_file.exists():
                # If we are explicitly requesting an index file the
                # messages should be different.
                index_msg = "inferred"
                is_implied = True
                if index_root_file in files_in_directory:
                    index_msg = "explicit"
                    is_implied = False

                # Try to read the index file and catch and report any
                # problems.
                try:
                    content = json.loads(possible_index_file.read())
                    index = process_index_data(content, force_dict=True)
                except Exception as e:
                    # Only trigger the callback if the index file
                    # was asked for explicitly. Triggering on implied file
                    # might be surprising.
                    if not is_implied:
                        self._on_metadata_failure(possible_index_file, e)
                    if self.config.failFast:
                        raise RuntimeError(f"Problem reading index file from {index_msg} "
                                           f"location {possible_index_file}") from e
                    bad_index_files.add(possible_index_file)
                    continue

                self.log.debug("Extracted index metadata from %s file %s", index_msg, possible_index_file)
                good_index_files.add(possible_index_file)

                # Go through the index adding entries for files.
                # If we have non-index files in this directory marked for
                # ingest we should only get index information for those.
                # If the index file was explicit we use all entries.
                if is_implied:
                    files_to_ingest = files_in_directory
                else:
                    files_to_ingest = set(index)

                # Copy relevant metadata into a single dict for all index
                # entries.
                for file_in_dir in files_to_ingest:
                    # Skip an explicitly specified index file.
                    # This should never happen because an explicit index
                    # file will force ingest of all files in the index
                    # and not use the explicit file list. If somehow
                    # this is not true we continue. Raising an exception
                    # seems like the wrong thing to do since this is harmless.
                    if file_in_dir == index_root_file:
                        self.log.info("Logic error found scanning directory %s. Please file ticket.",
                                      directory)
                        continue
                    if file_in_dir in index:
                        file = directory.join(file_in_dir)
                        if file in index_entries:
                            # ObservationInfo overrides raw metadata
                            if isinstance(index[file_in_dir], ObservationInfo) \
                                    and not isinstance(index_entries[file], ObservationInfo):
                                self.log.warning("File %s already specified in an index file but overriding"
                                                 " with ObservationInfo content from %s",
                                                 file, possible_index_file)
                            else:
                                self.log.warning("File %s already specified in an index file, "
                                                 "ignoring content from %s", file, possible_index_file)
                                # Do nothing in this case
                                continue

                        index_entries[file] = index[file_in_dir]

        # Remove files from list that have index entries and also
        # any files that we determined to be explicit index files
        # or any index files that we failed to read.
        filtered = set(files) - set(index_entries) - good_index_files - bad_index_files

        # The filtered list loses the initial order. Retaining the order
        # is good for testing but does have a cost if there are many
        # files when copying the good values out. A dict would have faster
        # lookups (using the files as keys) but use more memory.
        ordered = [f for f in filtered if f in files]

        return index_entries, ordered, good_index_files, bad_index_files

    def processIndexEntries(self, index_entries):
        """Convert index entries to RawFileData.

        Parameters
        ----------
        index_entries : `dict` [`str`, Any]
            Dict indexed by name of file to ingest and with values either
            raw metadata or translated
            `~astro_metadata_translator.ObservationInfo`.

        Returns
        -------
        data : `list` of `RawFileData`
            Structures containing the metadata extracted from each file,
            as well as the original filename. All fields will be populated,
            but the `RawFileData.dataId` attributes will be minimal
            (unexpanded) `~lsst.daf.butler.DataCoordinate` instances.
        """
        fileData = []
        for filename, metadata in index_entries.items():
            try:
                datasets = [self._calculate_dataset_info(metadata, filename)]
            except Exception as e:
                self.log.debug("Problem extracting metadata for file %s found in index file: %s",
                               filename, e)
                datasets = []
                formatterClass = Formatter
                instrument = None
                self._on_metadata_failure(filename, e)
                if self.config.failFast:
                    raise RuntimeError(f"Problem extracting metadata for file {filename} "
                                       "found in index file") from e
            else:
                instrument, formatterClass = self._determine_instrument_formatter(
                    datasets[0].dataId, filename)
                if instrument is None:
                    datasets = []
            fileData.append(RawFileData(datasets=datasets, filename=filename,
                                        FormatterClass=formatterClass, instrument=instrument))
        return fileData

    def groupByExposure(self, files: Iterable[RawFileData]) -> List[RawExposureData]:
        """Group an iterable of `RawFileData` by exposure.

        Parameters
        ----------
        files : iterable of `RawFileData`
            File-level information to group.

        Returns
        -------
        exposures : `list` of `RawExposureData`
            A list of structures that group the file-level information by
            exposure. All fields will be populated. The
            `RawExposureData.dataId` attributes will be minimal (unexpanded)
            `~lsst.daf.butler.DataCoordinate` instances.
        """
        exposureDimensions = self.universe["exposure"].graph
        byExposure = defaultdict(list)
        for f in files:
            # Assume that the first dataset is representative for the file.
            byExposure[f.datasets[0].dataId.subset(exposureDimensions)].append(f)

        return [RawExposureData(dataId=dataId, files=exposureFiles, universe=self.universe)
                for dataId, exposureFiles in byExposure.items()]

    def expandDataIds(self, data: RawExposureData) -> RawExposureData:
        """Expand the data IDs associated with a raw exposure.

        This adds the metadata records.

        Parameters
        ----------
        data : `RawExposureData`
            A structure containing information about the exposure to be
            ingested. Must have `RawExposureData.record` populated. Should
            be considered consumed upon return.

        Returns
        -------
        exposure : `RawExposureData`
            An updated version of the input structure, with
            `RawExposureData.dataId` and nested `RawFileData.dataId` attributes
            updated to data IDs for which
            `~lsst.daf.butler.DataCoordinate.hasRecords` returns `True`.
        """
        # We start by expanding the exposure-level data ID; we won't use that
        # directly in file ingest, but this lets us do some database lookups
        # once per exposure instead of once per file later.
        data.dataId = self.butler.registry.expandDataId(
            data.dataId,
            # We pass in the records we'll be inserting shortly so they aren't
            # looked up from the database. We do expect instrument and filter
            # records to be retrieved from the database here (though the
            # Registry may cache them so there isn't a lookup every time).
            records={
                self.butler.registry.dimensions["exposure"]: data.record,
            }
        )
        # Now we expand the per-file (exposure+detector) data IDs. This time
        # we pass in the records we just retrieved from the exposure data ID
        # expansion.
        for file in data.files:
            for dataset in file.datasets:
                dataset.dataId = self.butler.registry.expandDataId(
                    dataset.dataId,
                    records=dict(data.dataId.records)
                )
        return data

    def prep(self, files, *, pool: Optional[Pool] = None, processes: int = 1
             ) -> Tuple[Iterator[RawExposureData], List[str]]:
        """Perform all non-database-updating ingest preprocessing steps.

        Parameters
        ----------
        files : iterable over `ButlerURI`
            URIs to the files to be ingested. Will be made absolute
            if they are not already.
        pool : `multiprocessing.Pool`, optional
            If not `None`, a process pool with which to parallelize some
            operations.
        processes : `int`, optional
            The number of processes to use. Ignored if ``pool`` is not `None`.

        Returns
        -------
        exposures : `Iterator` [ `RawExposureData` ]
            Data structures containing dimension records, filenames, and data
            IDs to be ingested (one structure for each exposure).
        bad_files : `list` of `str`
            List of all the files that could not have metadata extracted.
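
        Examples
        --------
        A minimal sketch of calling this step on its own with an optional
        process pool, where ``task`` is an already-constructed
        `RawIngestTask` and the paths are purely illustrative:

        .. code-block:: python

            from multiprocessing import Pool

            from lsst.daf.butler import ButlerURI

            uris = [ButlerURI("/data/raw/night1/exposure_0001.fits"),
                    ButlerURI("/data/raw/night1/exposure_0002.fits")]
            with Pool(4) as pool:
                # The first return value is a lazy iterator, so consume it
                # while the pool is still alive.
                exposures, bad_files = task.prep(uris, pool=pool)
                for exposure in exposures:
                    print(exposure.dataId, len(exposure.files))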
        """
        if pool is None and processes > 1:
            pool = Pool(processes)
        mapFunc = map if pool is None else pool.imap_unordered

        def _partition_good_bad(file_data: Iterable[RawFileData]) -> Tuple[List[RawFileData], List[str]]:
            """Filter out bad files and return good with list of bad."""
            good_files = []
            bad_files = []
            for fileDatum in self.progress.wrap(file_data, desc="Reading image metadata", total=len(files)):
                if not fileDatum.datasets:
                    bad_files.append(fileDatum.filename)
                else:
                    good_files.append(fileDatum)
            return good_files, bad_files

        # Look for index files and read them.
        # There should be far fewer index files than data files.
        index_entries, files, good_index_files, bad_index_files = self.locateAndReadIndexFiles(files)
        if bad_index_files:
            self.log.info("Failed to read the following explicitly requested index files:")
            for bad in sorted(bad_index_files):
                self.log.info("- %s", bad)

        # Now convert all the index file entries to standard form for ingest.
        bad_index_file_data = []
        indexFileData = self.processIndexEntries(index_entries)
        if indexFileData:
            indexFileData, bad_index_file_data = _partition_good_bad(indexFileData)
            self.log.info("Successfully extracted metadata for %d file%s found in %d index file%s"
                          " with %d failure%s",
                          *_log_msg_counter(indexFileData),
                          *_log_msg_counter(good_index_files),
                          *_log_msg_counter(bad_index_file_data))

        # Extract metadata and build per-detector regions.
        # This could run in a subprocess so collect all output
        # before looking at failures.
        fileData: Iterator[RawFileData] = mapFunc(self.extractMetadata, files)

        # Filter out all the failed reads and store them for later
        # reporting.
        fileData, bad_files = _partition_good_bad(fileData)
        self.log.info("Successfully extracted metadata from %d file%s with %d failure%s",
                      *_log_msg_counter(fileData),
                      *_log_msg_counter(bad_files))

        # Combine with data from index files.
        fileData.extend(indexFileData)
        bad_files.extend(bad_index_file_data)
        bad_files.extend(bad_index_files)

        # Use that metadata to group files (and extracted metadata) by
        # exposure. Never parallelized because it's intrinsically a gather
        # step.
        exposureData: List[RawExposureData] = self.groupByExposure(fileData)

        # The next operation operates on RawExposureData instances (one at
        # a time) in-place and then returns the modified instance. We call it
        # as a pass-through instead of relying on the arguments we pass in to
        # have been modified because in the parallel case those arguments are
        # going to be pickled and unpickled, and I'm not certain
        # multiprocessing is careful enough with that for output arguments to
        # work.

        # Expand the data IDs to include all dimension metadata; we need this
        # because we may need to generate path templates that rely on that
        # metadata.
        # This is the first step that involves actual database calls (but just
        # SELECTs), so if there's going to be a problem with connections vs.
        # multiple processes, or lock contention (in SQLite) slowing things
        # down, it'll happen here.
        return mapFunc(self.expandDataIds, exposureData), bad_files

    def ingestExposureDatasets(self, exposure: RawExposureData, *, run: Optional[str] = None
                               ) -> List[FileDataset]:
        """Ingest all raw files in one exposure.

        Parameters
        ----------
        exposure : `RawExposureData`
            A structure containing information about the exposure to be
            ingested. Must have `RawExposureData.record` populated and all
            data ID attributes expanded.
        run : `str`, optional
            Name of a RUN-type collection to write to, overriding
            ``self.butler.run``.

        Returns
        -------
        datasets : `list` of `lsst.daf.butler.FileDataset`
            Per-file structures identifying the files ingested and their
            dataset representation in the data repository.
        """
        datasets = [FileDataset(path=file.filename.abspath(),
                                refs=[DatasetRef(self.datasetType, d.dataId) for d in file.datasets],
                                formatter=file.FormatterClass)
                    for file in exposure.files]
        self.butler.ingest(*datasets, transfer=self.config.transfer, run=run)
        return datasets

    def ingestFiles(self, files, *, pool: Optional[Pool] = None, processes: int = 1,
                    run: Optional[str] = None):
        """Ingest files into a Butler data repository.

        This creates any new exposure or visit Dimension entries needed to
        identify the ingested files, creates new Dataset entries in the
        Registry and finally ingests the files themselves into the Datastore.
        Any needed instrument, detector, and physical_filter Dimension entries
        must exist in the Registry before this method is called.

        Parameters
        ----------
        files : iterable over `ButlerURI`
            URIs to the files to be ingested.
        pool : `multiprocessing.Pool`, optional
            If not `None`, a process pool with which to parallelize some
            operations.
        processes : `int`, optional
            The number of processes to use. Ignored if ``pool`` is not `None`.
        run : `str`, optional
            Name of a RUN-type collection to write to, overriding
            the default derived from the instrument name.

        Returns
        -------
        refs : `list` of `lsst.daf.butler.DatasetRef`
            Dataset references for ingested raws.
        bad_files : `list` of `ButlerURI`
            Files that could not be ingested.
        n_exposures : `int`
            Number of exposures ingested successfully.
        n_exposures_failed : `int`
            Number of exposures for which the exposure dimension record could
            not be registered.
        n_ingests_failed : `int`
            Number of exposures for which raw file ingest failed.
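
        Examples
        --------
        A minimal sketch of ingesting a pre-assembled list of files and
        unpacking the returned tuple, where ``task`` is an
        already-constructed `RawIngestTask`; the paths and the
        ``HSC/raw/test`` run name are purely illustrative:

        .. code-block:: python

            from lsst.daf.butler import ButlerURI

            uris = [ButlerURI("/data/raw/night1/exposure_0001.fits")]
            refs, bad, n_ok, n_exp_fail, n_ingest_fail = task.ingestFiles(
                uris, run="HSC/raw/test")
            print(f"Ingested {len(refs)} datasets; {len(bad)} bad files")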
        """
        exposureData, bad_files = self.prep(files, pool=pool, processes=processes)

        # Up to this point, we haven't modified the data repository at all.
        # Now we finally do that, with one transaction per exposure. This is
        # not parallelized at present because the performance of this step is
        # limited by the database server. That may or may not change in the
        # future once we increase our usage of bulk inserts and reduce our
        # usage of savepoints; we've tried to get everything but the database
        # operations done in advance to reduce the time spent inside
        # transactions.
        self.butler.registry.registerDatasetType(self.datasetType)

        refs = []
        runs = set()
        n_exposures = 0
        n_exposures_failed = 0
        n_ingests_failed = 0
        for exposure in self.progress.wrap(exposureData, desc="Ingesting raw exposures"):

            self.log.debug("Attempting to ingest %d file%s from exposure %s:%s",
                           *_log_msg_counter(exposure.files),
                           exposure.record.instrument, exposure.record.obs_id)

            try:
                self.butler.registry.syncDimensionData("exposure", exposure.record)
            except Exception as e:
                self._on_ingest_failure(exposure, e)
                n_exposures_failed += 1
                self.log.warning("Exposure %s:%s could not be registered: %s",
                                 exposure.record.instrument, exposure.record.obs_id, e)
                if self.config.failFast:
                    raise e
                continue

            # Override default run if nothing specified explicitly.
            if run is None:
                instrument = exposure.files[0].instrument
                this_run = instrument.makeDefaultRawIngestRunName()
            else:
                this_run = run
            if this_run not in runs:
                self.butler.registry.registerCollection(this_run, type=CollectionType.RUN)
                runs.add(this_run)
            try:
                with self.butler.transaction():
                    datasets_for_exposure = self.ingestExposureDatasets(exposure, run=this_run)
            except Exception as e:
                self._on_ingest_failure(exposure, e)
                n_ingests_failed += 1
                self.log.warning("Failed to ingest the following for reason: %s", e)
                for f in exposure.files:
                    self.log.warning("- %s", f.filename)
                if self.config.failFast:
                    raise e
                continue
            else:
                self._on_success(datasets_for_exposure)
                for dataset in datasets_for_exposure:
                    refs.extend(dataset.refs)

            # Success for this exposure.
            n_exposures += 1
            self.log.info("Exposure %s:%s ingested successfully",
                          exposure.record.instrument, exposure.record.obs_id)

        return refs, bad_files, n_exposures, n_exposures_failed, n_ingests_failed

    @timeMethod
    def run(self, files, *, pool: Optional[Pool] = None, processes: int = 1, run: Optional[str] = None,
            file_filter: Union[str, re.Pattern] = r"\.fit[s]?\b", group_files: bool = True):
        """Ingest files into a Butler data repository.

        This creates any new exposure or visit Dimension entries needed to
        identify the ingested files, creates new Dataset entries in the
        Registry and finally ingests the files themselves into the Datastore.
        Any needed instrument, detector, and physical_filter Dimension entries
        must exist in the Registry before `run` is called.

        Parameters
        ----------
        files : iterable over `ButlerURI`, `str` or path-like objects
            Paths to the files to be ingested. Can refer to directories.
            Will be made absolute if they are not already.
        pool : `multiprocessing.Pool`, optional
            If not `None`, a process pool with which to parallelize some
            operations.
        processes : `int`, optional
            The number of processes to use. Ignored if ``pool`` is not `None`.
        run : `str`, optional
            Name of a RUN-type collection to write to, overriding
            the default derived from the instrument name.
        file_filter : `str` or `re.Pattern`, optional
            Pattern to use to discover files to ingest within directories.
            The default is to search for FITS files. The regex applies to
            files within the directory.
        group_files : `bool`, optional
            Group files by directory if they have been discovered in
            directories. Will not affect files explicitly provided.

        Returns
        -------
        refs : `list` of `lsst.daf.butler.DatasetRef`
            Dataset references for ingested raws.

        Notes
        -----
        This method inserts all datasets for an exposure within a transaction,
        guaranteeing that partial exposures are never ingested. The exposure
        dimension record is inserted with `Registry.syncDimensionData` first
        (in its own transaction), which inserts only if a record with the same
        primary key does not already exist. This allows different files within
        the same exposure to be ingested in different runs.
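
        Examples
        --------
        A minimal sketch of the most common call, pointing the task at a
        directory of raw FITS files, where ``task`` is an
        already-constructed `RawIngestTask` and the path is purely
        illustrative:

        .. code-block:: python

            refs = task.run(["/data/raw/night1/"])
            print(f"Ingested {len(refs)} raw datasets")

        A `RuntimeError` is raised at the end if any file failed, even when
        ``failFast`` is `False`.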
        """
        refs = []
        bad_files = []
        n_exposures = 0
        n_exposures_failed = 0
        n_ingests_failed = 0
        if group_files:
            for group in ButlerURI.findFileResources(files, file_filter, group_files):
                new_refs, bad, n_exp, n_exp_fail, n_ingest_fail = self.ingestFiles(group, pool=pool,
                                                                                   processes=processes,
                                                                                   run=run)
                refs.extend(new_refs)
                bad_files.extend(bad)
                n_exposures += n_exp
                n_exposures_failed += n_exp_fail
                n_ingests_failed += n_ingest_fail
        else:
            refs, bad_files, n_exposures, n_exposures_failed, n_ingests_failed = self.ingestFiles(
                ButlerURI.findFileResources(files, file_filter, group_files),
                pool=pool,
                processes=processes,
                run=run,
            )

        had_failure = False

        if bad_files:
            had_failure = True
            self.log.warning("Could not extract observation metadata from the following:")
            for f in bad_files:
                self.log.warning("- %s", f)

        self.log.info("Successfully processed data from %d exposure%s with %d failure%s from exposure"
                      " registration and %d failure%s from file ingest.",
                      *_log_msg_counter(n_exposures),
                      *_log_msg_counter(n_exposures_failed),
                      *_log_msg_counter(n_ingests_failed))
        if n_exposures_failed > 0 or n_ingests_failed > 0:
            had_failure = True
        self.log.info("Ingested %d distinct Butler dataset%s", *_log_msg_counter(refs))

        if had_failure:
            raise RuntimeError("Some failures encountered during ingestion")

        return refs
1034 return refs