Coverage for python/lsst/obs/base/ingest.py: 16%
1# This file is part of obs_base.
2#
3# Developed for the LSST Data Management System.
4# This product includes software developed by the LSST Project
5# (https://www.lsst.org).
6# See the COPYRIGHT file at the top-level directory of this distribution
7# for details of code ownership.
8#
9# This program is free software: you can redistribute it and/or modify
10# it under the terms of the GNU General Public License as published by
11# the Free Software Foundation, either version 3 of the License, or
12# (at your option) any later version.
13#
14# This program is distributed in the hope that it will be useful,
15# but WITHOUT ANY WARRANTY; without even the implied warranty of
16# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
17# GNU General Public License for more details.
18#
19# You should have received a copy of the GNU General Public License
20# along with this program. If not, see <http://www.gnu.org/licenses/>.
23__all__ = ("RawIngestTask", "RawIngestConfig", "makeTransferChoiceField")
25import json
26import re
27from collections import defaultdict
28from dataclasses import InitVar, dataclass
29from multiprocessing import Pool
30from typing import Any, Callable, Iterable, Iterator, List, Optional, Tuple, Type, Union
32from astro_metadata_translator import MetadataTranslator, ObservationInfo, merge_headers
33from astro_metadata_translator.indexing import process_index_data, process_sidecar_data
34from lsst.afw.fits import readMetadata
35from lsst.daf.butler import (
36 Butler,
37 CollectionType,
38 DataCoordinate,
39 DatasetIdGenEnum,
40 DatasetRef,
41 DatasetType,
42 DimensionRecord,
43 DimensionUniverse,
44 FileDataset,
45 Formatter,
46 Progress,
47)
48from lsst.pex.config import ChoiceField, Config, Field
49from lsst.pipe.base import Task
50from lsst.resources import ResourcePath
51from lsst.utils.timer import timeMethod
53from ._fitsRawFormatterBase import FitsRawFormatterBase
54from ._instrument import Instrument, makeExposureRecordFromObsInfo
57def _do_nothing(*args, **kwargs) -> None:
58 """Do nothing.
60 This is a function that accepts anything and does nothing.
61 For use as a default in callback arguments.
62 """
63 pass
66def _log_msg_counter(noun: Union[int, Iterable]) -> Tuple[int, str]:
67 """Count the iterable and return the count and plural modifier.
69 Parameters
70 ----------
71 noun : Iterable or `int`
72 Thing to count. If given an integer it is assumed to be the count
73 to use to calculate modifier.
75 Returns
76 -------
77 num : `int`
78 Number of items found in ``noun``.
79 modifier : `str`
80 Character to add to the end of a string referring to these items
81 to indicate whether it was a single item or not. Returns empty
82 string if there is one item or "s" otherwise.
84 Examples
85 --------
87 .. code-block:: python
89 log.warning("Found %d file%s", *_log_msg_counter(nfiles))
90 """
91 if isinstance(noun, int):
92 num = noun
93 else:
94 num = len(noun)
95 return num, "" if num == 1 else "s"
98@dataclass
99class RawFileDatasetInfo:
100 """Information about a single dataset within a raw file."""
102 dataId: DataCoordinate
103 """Data ID for this file (`lsst.daf.butler.DataCoordinate`)."""
105 obsInfo: ObservationInfo
106 """Standardized observation metadata extracted directly from the file
107 headers (`astro_metadata_translator.ObservationInfo`).
108 """
111@dataclass
112class RawFileData:
113 """Information about a single raw file, used during ingest."""
115 datasets: List[RawFileDatasetInfo]
116 """The information describing each dataset within this raw file.
117 (`list` of `RawFileDatasetInfo`)
118 """
120 filename: ResourcePath
121 """URI of the file this information was extracted from (`str`).
123 This is the path prior to ingest, not the path after ingest.
124 """
126 FormatterClass: Type[FitsRawFormatterBase]
127 """Formatter class that should be used to ingest this file (`type`; as
128 subclass of `FitsRawFormatterBase`).
129 """
131 instrument: Optional[Instrument]
132 """The `Instrument` instance associated with this file. Can be `None`
133 if ``datasets`` is an empty list."""
136@dataclass
137class RawExposureData:
138 """Information about a complete raw exposure, used during ingest."""
140 dataId: DataCoordinate
141 """Data ID for this exposure (`lsst.daf.butler.DataCoordinate`).
142 """
144 files: List[RawFileData]
145 """List of structures containing file-level information.
146 """
148 universe: InitVar[DimensionUniverse]
149 """Set of all known dimensions.
150 """
152 record: Optional[DimensionRecord] = None
153 """The exposure `DimensionRecord` that must be inserted into the
154 `~lsst.daf.butler.Registry` prior to file-level ingest (`DimensionRecord`).
155 """
157 def __post_init__(self, universe: DimensionUniverse):
158 # We don't care which file or dataset we read metadata from, because
159 # we're assuming they'll all be the same; just use the first ones.
160 self.record = makeExposureRecordFromObsInfo(self.files[0].datasets[0].obsInfo, universe)
163def makeTransferChoiceField(doc="How to transfer files (None for no transfer).", default="auto"):
164 """Create a Config field with options for transferring data between repos.
166 The allowed options for the field are exactly those supported by
167 `lsst.daf.butler.Datastore.ingest`.
169 Parameters
170 ----------
171 doc : `str`
172 Documentation for the configuration field.
default : `str`, optional
Default transfer mode for the field.
174 Returns
175 -------
176 field : `lsst.pex.config.ChoiceField`
177 Configuration field.
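
    Examples
    --------
    A minimal sketch of using this helper inside a `lsst.pex.config.Config`
    subclass; the ``ExampleConfig`` class is illustrative and not part of
    this module.

    .. code-block:: python

        from lsst.pex.config import Config

        from lsst.obs.base.ingest import makeTransferChoiceField


        class ExampleConfig(Config):
            transfer = makeTransferChoiceField(default="symlink")


        config = ExampleConfig()
        assert config.transfer == "symlink"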
178 """
179 return ChoiceField(
180 doc=doc,
181 dtype=str,
182 allowed={
183 "move": "move",
184 "copy": "copy",
185 "auto": "choice will depend on datastore",
186 "direct": "use URI to ingested file directly in datastore",
187 "link": "hard link falling back to symbolic link",
188 "hardlink": "hard link",
189 "symlink": "symbolic (soft) link",
190 "relsymlink": "relative symbolic link",
191 },
192 optional=True,
193 default=default,
194 )
197class RawIngestConfig(Config):
198 """Configuration class for RawIngestTask."""
200 transfer = makeTransferChoiceField()
201 failFast = Field(
202 dtype=bool,
203 default=False,
204 doc="If True, stop ingest as soon as any problem is encountered with any file. "
205 "Otherwise problems files will be skipped and logged and a report issued at completion.",
206 )
209class RawIngestTask(Task):
210 """Driver Task for ingesting raw data into Gen3 Butler repositories.
212 Parameters
213 ----------
214 config : `RawIngestConfig`
215 Configuration for the task.
216 butler : `~lsst.daf.butler.Butler`
217 Writeable butler instance, with ``butler.run`` set to the appropriate
218 `~lsst.daf.butler.CollectionType.RUN` collection for these raw
219 datasets.
220 on_success : `Callable`, optional
221 A callback invoked when all of the raws associated with an exposure
222 are ingested. Will be passed a list of `FileDataset` objects, each
223 containing one or more resolved `DatasetRef` objects. If this callback
224 raises it will interrupt the entire ingest process, even if
225 `RawIngestConfig.failFast` is `False`.
226 on_metadata_failure : `Callable`, optional
227 A callback invoked when a failure occurs trying to translate the
228 metadata for a file. Will be passed the URI and the exception, in
229 that order, as positional arguments. Guaranteed to be called in an
230 ``except`` block, allowing the callback to re-raise or replace (with
231 ``raise ... from``) to override the task's usual error handling (before
232 `RawIngestConfig.failFast` logic occurs).
233 on_ingest_failure : `Callable`, optional
234 A callback invoked when dimension record or dataset insertion into the
235 database fails for an exposure. Will be passed a `RawExposureData`
236 instance and the exception, in that order, as positional arguments.
237 Guaranteed to be called in an ``except`` block, allowing the callback
238 to re-raise or replace (with ``raise ... from``) to override the task's
239 usual error handling (before `RawIngestConfig.failFast` logic occurs).
240 **kwargs
241 Additional keyword arguments are forwarded to the `lsst.pipe.base.Task`
242 constructor.
244 Notes
245 -----
246 Each instance of `RawIngestTask` writes to the same Butler. Each
247 invocation of `RawIngestTask.run` ingests a list of files.
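
    Examples
    --------
    A minimal sketch of constructing and running the task; the repository
    path, data directory, and run collection below are illustrative only.

    .. code-block:: python

        from lsst.daf.butler import Butler
        from lsst.obs.base import RawIngestConfig, RawIngestTask

        butler = Butler("/path/to/repo", writeable=True)
        config = RawIngestConfig()
        config.transfer = "direct"
        task = RawIngestTask(config=config, butler=butler)
        refs = task.run(["/path/to/raw/data"], run="DummyCam/raw/all")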
248 """
250 ConfigClass = RawIngestConfig
252 _DefaultName = "ingest"
254 def getDatasetType(self):
255 """Return the DatasetType of the datasets ingested by this Task."""
256 return DatasetType(
257 "raw",
258 ("instrument", "detector", "exposure"),
259 "Exposure",
260 universe=self.butler.registry.dimensions,
261 )
263 def __init__(
264 self,
265 config: Optional[RawIngestConfig] = None,
266 *,
267 butler: Butler,
268 on_success: Callable[[List[FileDataset]], Any] = _do_nothing,
269 on_metadata_failure: Callable[[str, Exception], Any] = _do_nothing,
270 on_ingest_failure: Callable[[RawExposureData, Exception], Any] = _do_nothing,
271 **kwargs: Any,
272 ):
273 config.validate() # Not a CmdlineTask nor PipelineTask, so have to validate the config here.
274 super().__init__(config, **kwargs)
275 self.butler = butler
276 self.universe = self.butler.registry.dimensions
277 self.datasetType = self.getDatasetType()
278 self._on_success = on_success
279 self._on_metadata_failure = on_metadata_failure
280 self._on_ingest_failure = on_ingest_failure
281 self.progress = Progress("obs.base.RawIngestTask")
283 # Import all the instrument classes so that we ensure that we
284 # have all the relevant metadata translators loaded.
285 Instrument.importAll(self.butler.registry)
287 def _reduce_kwargs(self):
288 # Add extra parameters to pickle.
289 return dict(
290 **super()._reduce_kwargs(),
291 butler=self.butler,
292 on_success=self._on_success,
293 on_metadata_failure=self._on_metadata_failure,
294 on_ingest_failure=self._on_ingest_failure,
295 )
297 def _determine_instrument_formatter(self, dataId, filename):
298 """Determine the instrument and formatter class.
300 Parameters
301 ----------
302 dataId : `lsst.daf.butler.DataCoordinate`
303 The dataId associated with this dataset.
304 filename : `lsst.resources.ResourcePath`
305 URI of file used for error reporting.
307 Returns
308 -------
309 instrument : `Instrument` or `None`
310 Instance of the `Instrument` associated with this dataset. `None`
311 indicates that the instrument could not be determined.
312 formatterClass : `type`
313 Class to be used as the formatter for this dataset.
314 """
315 # The data model currently assumes that whilst multiple datasets
316 # can be associated with a single file, they must all share the
317 # same formatter.
318 try:
319 instrument = Instrument.fromName(dataId["instrument"], self.butler.registry)
320 except LookupError as e:
321 self._on_metadata_failure(filename, e)
322 self.log.warning(
323 "Instrument %s for file %s not known to registry", dataId["instrument"], filename
324 )
325 if self.config.failFast:
326 raise RuntimeError(
327 f"Instrument {dataId['instrument']} for file {filename} not known to registry"
328 ) from e
329 FormatterClass = Formatter
330 # Indicate that we could not work out the instrument.
331 instrument = None
332 else:
333 FormatterClass = instrument.getRawFormatter(dataId)
334 return instrument, FormatterClass
336 def extractMetadata(self, filename: ResourcePath) -> RawFileData:
337 """Extract and process metadata from a single raw file.
339 Parameters
340 ----------
341 filename : `lsst.resources.ResourcePath`
342 URI to the file.
344 Returns
345 -------
346 data : `RawFileData`
347 A structure containing the metadata extracted from the file,
348 as well as the original filename. All fields will be populated,
349 but the `RawFileData.dataId` attribute will be a minimal
350 (unexpanded) `~lsst.daf.butler.DataCoordinate` instance. The
351 ``instrument`` field will be `None` if there is a problem
352 with metadata extraction.
354 Notes
355 -----
356 Assumes that there is a single dataset associated with the given
357 file. Instruments using a single file to store multiple datasets
358 must implement their own version of this method.
360 By default the method will catch all exceptions unless the ``failFast``
361 configuration item is `True`. If an error is encountered the
362 `_on_metadata_failure()` method will be called. If that callback does
363 not raise, the returned object will have a `None` instrument and no
364 datasets.
366 This method supports sidecar JSON files which can be used to
367 extract metadata without having to read the data file itself.
368 The sidecar file is always used if found.
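
        Examples
        --------
        A sketch of checking a single file, assuming ``task`` is an already
        constructed `RawIngestTask`; the URI is illustrative.

        .. code-block:: python

            from lsst.resources import ResourcePath

            data = task.extractMetadata(ResourcePath("file:///data/raw/exp_001.fits"))
            if not data.datasets:
                print(f"Metadata extraction failed for {data.filename}")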
369 """
370 sidecar_fail_msg = "" # Requires prepended space when set.
371 try:
372 sidecar_file = filename.updatedExtension(".json")
373 if sidecar_file.exists():
374 content = json.loads(sidecar_file.read())
375 headers = [process_sidecar_data(content)]
376 sidecar_fail_msg = " (via sidecar)"
377 else:
378 # Read the metadata from the data file itself.
380 # For remote files download the entire file to get the
381 # header. This is very inefficient and it would be better
382 # to have some way of knowing where in the file the headers
383 # are and to only download those parts of the file.
384 with filename.as_local() as local_file:
385 # Read the primary. This might be sufficient.
386 header = readMetadata(local_file.ospath, 0)
388 try:
389 # Try to work out a translator class early.
390 translator_class = MetadataTranslator.determine_translator(header, filename=filename)
391 except ValueError:
392 # Primary header was not sufficient (maybe this file
393 # has been compressed or is a MEF with minimal
394 # primary). Read second header and merge with primary.
395 header = merge_headers([header, readMetadata(local_file.ospath, 1)], mode="overwrite")
397 # Try again to work out a translator class, letting this
398 # fail.
399 translator_class = MetadataTranslator.determine_translator(header, filename=filename)
401 # Request the headers to use for ingest
402 headers = translator_class.determine_translatable_headers(filename.ospath, header)
404 # Add each header to the dataset list
405 datasets = [self._calculate_dataset_info(h, filename) for h in headers]
407 except Exception as e:
408 self.log.debug("Problem extracting metadata from %s%s: %s", filename, sidecar_fail_msg, e)
409 # Indicate to the caller that we failed to read.
410 datasets = []
411 formatterClass = Formatter
412 instrument = None
413 self._on_metadata_failure(filename, e)
414 if self.config.failFast:
415 raise RuntimeError(
416 f"Problem extracting metadata for file {filename}{sidecar_fail_msg}"
417 ) from e
418 else:
419 self.log.debug("Extracted metadata for file %s%s", filename, sidecar_fail_msg)
420 # The data model currently assumes that whilst multiple datasets
421 # can be associated with a single file, they must all share the
422 # same formatter.
423 instrument, formatterClass = self._determine_instrument_formatter(datasets[0].dataId, filename)
424 if instrument is None:
425 datasets = []
427 return RawFileData(
428 datasets=datasets, filename=filename, FormatterClass=formatterClass, instrument=instrument
429 )
431 def _calculate_dataset_info(self, header, filename):
432 """Calculate a RawFileDatasetInfo from the supplied information.
434 Parameters
435 ----------
436 header : Mapping or `astro_metadata_translator.ObservationInfo`
437 Header from the dataset or previously-translated content.
438 filename : `lsst.resources.ResourcePath`
439 Filename to use for error messages.
441 Returns
442 -------
443 dataset : `RawFileDatasetInfo`
444 The dataId, and observation information associated with this
445 dataset.
446 """
447 # To ensure we aren't slowed down for no reason, explicitly
448 # list here the properties we need for the schema.
449 # Use a dict with boolean values, where `True` indicates
450 # that the property is required to be present.
451 ingest_subset = {
452 "altaz_begin": False,
453 "boresight_rotation_coord": False,
454 "boresight_rotation_angle": False,
455 "dark_time": False,
456 "datetime_begin": True,
457 "datetime_end": True,
458 "detector_num": True,
459 "exposure_group": False,
460 "exposure_id": True,
461 "exposure_time": True,
462 "instrument": True,
463 "tracking_radec": False,
464 "object": False,
465 "observation_counter": False,
466 "observation_id": True,
467 "observation_reason": False,
468 "observation_type": True,
469 "observing_day": False,
470 "physical_filter": True,
471 "science_program": False,
472 "visit_id": False,
473 }
475 if isinstance(header, ObservationInfo):
476 obsInfo = header
477 missing = []
478 # Need to check the required properties are present.
479 for property, required in ingest_subset.items():
480 if not required:
481 continue
482 # getattr does not need to be protected because it is using
483 # the defined list above containing properties that must exist.
484 value = getattr(obsInfo, property)
485 if value is None:
486 missing.append(property)
487 if missing:
488 raise ValueError(
489 f"Requested required properties are missing from file {filename}:"
490 f" {missing} (via JSON)"
491 )
493 else:
494 obsInfo = ObservationInfo(
495 header,
496 pedantic=False,
497 filename=str(filename),
498 required={k for k in ingest_subset if ingest_subset[k]},
499 subset=set(ingest_subset),
500 )
502 dataId = DataCoordinate.standardize(
503 instrument=obsInfo.instrument,
504 exposure=obsInfo.exposure_id,
505 detector=obsInfo.detector_num,
506 universe=self.universe,
507 )
508 return RawFileDatasetInfo(obsInfo=obsInfo, dataId=dataId)
510 def locateAndReadIndexFiles(self, files):
511 """Given a list of files, look for index files and read them.
513 Index files can either be explicitly in the list of files to
514 ingest, or else located in the same directory as a file to ingest.
515 Index entries are always used if present.
517 Parameters
518 ----------
519 files : iterable over `lsst.resources.ResourcePath`
520 URIs to the files to be ingested.
522 Returns
523 -------
524 index : `dict` [`str`, Any]
525 Merged contents of all relevant index files found. These can
526 be explicitly specified index files or ones found in the
527 directory alongside a data file to be ingested.
528 updated_files : iterable of `lsst.resources.ResourcePath`
529 Updated list of the input files with entries removed that were
530 found listed in an index file. Order is not guaranteed to
531 match the order of the files given to this routine.
good_index_files : `set` of `lsst.resources.ResourcePath`
Index files that were successfully read and whose entries were used.
532 bad_index_files : `set` of `lsst.resources.ResourcePath`
533 Files that looked like index files but failed to read properly.
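
        Examples
        --------
        A sketch of the expected calling pattern, assuming ``task`` is an
        already constructed `RawIngestTask` and ``files`` is a list of
        `lsst.resources.ResourcePath`; see `prep` for the real usage.

        .. code-block:: python

            index_entries, files, good_index, bad_index = task.locateAndReadIndexFiles(files)
            if bad_index:
                print(f"Failed to read {len(bad_index)} index file(s)")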
534 """
535 # Convert the paths to absolute for easy comparison with index content.
536 # Do not convert to real paths since we have to assume that index
537 # files are in this location and not the location which it links to.
538 files = tuple(f.abspath() for f in files)
540 # Index files must be named this.
541 index_root_file = "_index.json"
543 # Group the files by directory.
544 files_by_directory = defaultdict(set)
546 for path in files:
547 directory, file_in_dir = path.split()
548 files_by_directory[directory].add(file_in_dir)
550 # All the metadata read from index files with keys of full path.
551 index_entries = {}
553 # Index files we failed to read.
554 bad_index_files = set()
556 # Any good index files that were found and used.
557 good_index_files = set()
559 # Look for index files in those directories.
560 for directory, files_in_directory in files_by_directory.items():
561 possible_index_file = directory.join(index_root_file)
562 if possible_index_file.exists():
563 # If we are explicitly requesting an index file the
564 # messages should be different.
565 index_msg = "inferred"
566 is_implied = True
567 if index_root_file in files_in_directory:
568 index_msg = "explicit"
569 is_implied = False
571 # Try to read the index file and catch and report any
572 # problems.
573 try:
574 content = json.loads(possible_index_file.read())
575 index = process_index_data(content, force_dict=True)
576 except Exception as e:
577 # Only trigger the callback if the index file
578 # was asked for explicitly. Triggering on implied file
579 # might be surprising.
580 if not is_implied:
581 self._on_metadata_failure(possible_index_file, e)
582 if self.config.failFast:
583 raise RuntimeError(
584 f"Problem reading index file from {index_msg} location {possible_index_file}"
585 ) from e
586 bad_index_files.add(possible_index_file)
587 continue
589 self.log.debug("Extracted index metadata from %s file %s", index_msg, possible_index_file)
590 good_index_files.add(possible_index_file)
592 # Go through the index adding entries for files.
593 # If we have non-index files in this directory marked for
594 # ingest we should only get index information for those.
595 # If the index file was explicit we use all entries.
596 if is_implied:
597 files_to_ingest = files_in_directory
598 else:
599 files_to_ingest = set(index)
601 # Copy relevant metadata into a single dict for all index
602 # entries.
603 for file_in_dir in files_to_ingest:
604 # Skip an explicitly specified index file.
605 # This should never happen because an explicit index
606 # file will force ingest of all files in the index
607 # and not use the explicit file list. If somehow
608 # this is not true we continue. Raising an exception
609 # seems like the wrong thing to do since this is harmless.
610 if file_in_dir == index_root_file:
611 self.log.info(
612 "Logic error found scanning directory %s. Please file ticket.", directory
613 )
614 continue
615 if file_in_dir in index:
616 file = directory.join(file_in_dir)
617 if file in index_entries:
618 # ObservationInfo overrides raw metadata
619 if isinstance(index[file_in_dir], ObservationInfo) and not isinstance(
620 index_entries[file], ObservationInfo
621 ):
622 self.log.warning(
623 "File %s already specified in an index file but overriding"
624 " with ObservationInfo content from %s",
625 file,
626 possible_index_file,
627 )
628 else:
629 self.log.warning(
630 "File %s already specified in an index file, ignoring content from %s",
631 file,
632 possible_index_file,
633 )
634 # Do nothing in this case
635 continue
637 index_entries[file] = index[file_in_dir]
639 # Remove files from list that have index entries and also
640 # any files that we determined to be explicit index files
641 # or any index files that we failed to read.
642 filtered = set(files) - set(index_entries) - good_index_files - bad_index_files
644 # The filtered list loses the initial order. Retaining the order
645 # is good for testing but does have a cost if there are many
646 # files when copying the good values out. A dict would have faster
647 # lookups (using the files as keys) but use more memory.
648 ordered = [f for f in filtered if f in files]
650 return index_entries, ordered, good_index_files, bad_index_files
652 def processIndexEntries(self, index_entries):
653 """Convert index entries to RawFileData.
655 Parameters
656 ----------
657 index_entries : `dict` [`str`, Any]
658 Dict indexed by name of file to ingest and with values either
659 raw metadata or translated
660 `~astro_metadata_translator.ObservationInfo`.
662 Returns
663 -------
664 data : `list` of `RawFileData`
665 Structures containing the metadata extracted from each file,
666 as well as the original filenames. All fields will be populated,
667 but the `RawFileData.dataId` attributes will be minimal
668 (unexpanded) `~lsst.daf.butler.DataCoordinate` instances.
669 """
670 fileData = []
671 for filename, metadata in index_entries.items():
672 try:
673 datasets = [self._calculate_dataset_info(metadata, filename)]
674 except Exception as e:
675 self.log.debug("Problem extracting metadata for file %s found in index file: %s", filename, e)
676 datasets = []
677 formatterClass = Formatter
678 instrument = None
679 self._on_metadata_failure(filename, e)
680 if self.config.failFast:
681 raise RuntimeError(
682 f"Problem extracting metadata for file {filename} found in index file"
683 ) from e
684 else:
685 instrument, formatterClass = self._determine_instrument_formatter(
686 datasets[0].dataId, filename
687 )
688 if instrument is None:
689 datasets = []
690 fileData.append(
691 RawFileData(
692 datasets=datasets, filename=filename, FormatterClass=formatterClass, instrument=instrument
693 )
694 )
695 return fileData
697 def groupByExposure(self, files: Iterable[RawFileData]) -> List[RawExposureData]:
698 """Group an iterable of `RawFileData` by exposure.
700 Parameters
701 ----------
702 files : iterable of `RawFileData`
703 File-level information to group.
705 Returns
706 -------
707 exposures : `list` of `RawExposureData`
708 A list of structures that group the file-level information by
709 exposure. All fields will be populated. The
710 `RawExposureData.dataId` attributes will be minimal (unexpanded)
711 `~lsst.daf.butler.DataCoordinate` instances.
712 """
713 exposureDimensions = self.universe["exposure"].graph
714 byExposure = defaultdict(list)
715 for f in files:
716 # Assume that the first dataset is representative for the file.
717 byExposure[f.datasets[0].dataId.subset(exposureDimensions)].append(f)
719 return [
720 RawExposureData(dataId=dataId, files=exposureFiles, universe=self.universe)
721 for dataId, exposureFiles in byExposure.items()
722 ]
724 def expandDataIds(self, data: RawExposureData) -> RawExposureData:
725 """Expand the data IDs associated with a raw exposure.
727 This adds the metadata records.
729 Parameters
730 ----------
731 data : `RawExposureData`
732 A structure containing information about the exposure to be
733 ingested. Must have `RawExposureData.record` populated. Should
734 be considered consumed upon return.
736 Returns
737 -------
738 exposure : `RawExposureData`
739 An updated version of the input structure, with
740 `RawExposureData.dataId` and nested `RawFileData.dataId` attributes
741 updated to data IDs for which
742 `~lsst.daf.butler.DataCoordinate.hasRecords` returns `True`.
743 """
744 # We start by expanding the exposure-level data ID; we won't use that
745 # directly in file ingest, but this lets us do some database lookups
746 # once per exposure instead of once per file later.
747 data.dataId = self.butler.registry.expandDataId(
748 data.dataId,
749 # We pass in the records we'll be inserting shortly so they aren't
750 # looked up from the database. We do expect instrument and filter
751 # records to be retrieved from the database here (though the
752 # Registry may cache them so there isn't a lookup every time).
753 records={
754 self.butler.registry.dimensions["exposure"]: data.record,
755 },
756 )
757 # Now we expand the per-file (exposure+detector) data IDs. This time
758 # we pass in the records we just retrieved from the exposure data ID
759 # expansion.
760 for file in data.files:
761 for dataset in file.datasets:
762 dataset.dataId = self.butler.registry.expandDataId(
763 dataset.dataId, records=dict(data.dataId.records)
764 )
765 return data
767 def prep(
768 self, files, *, pool: Optional[Pool] = None, processes: int = 1
769 ) -> Tuple[Iterator[RawExposureData], List[str]]:
770 """Perform all non-database-updating ingest preprocessing steps.
772 Parameters
773 ----------
774 files : iterable over `str` or path-like objects
775 Paths to the files to be ingested. Will be made absolute
776 if they are not already.
777 pool : `multiprocessing.Pool`, optional
778 If not `None`, a process pool with which to parallelize some
779 operations.
780 processes : `int`, optional
781 The number of processes to use. Ignored if ``pool`` is not `None`.
783 Returns
784 -------
785 exposures : `Iterator` [ `RawExposureData` ]
786 Data structures containing dimension records, filenames, and data
787 IDs to be ingested (one structure for each exposure).
788 bad_files : `list` of `str`
789 List of all the files that could not have metadata extracted.
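
        Examples
        --------
        A sketch of consuming the result, assuming ``task`` is an already
        constructed `RawIngestTask` and ``files`` is a list of
        `lsst.resources.ResourcePath`; the returned ``exposures`` is an
        iterator, so it is consumed as it is looped over.

        .. code-block:: python

            exposures, bad_files = task.prep(files)
            for exposure in exposures:
                # Each item carries an expanded data ID and the exposure record.
                print(exposure.record.obs_id, len(exposure.files))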
790 """
791 if pool is None and processes > 1:
792 pool = Pool(processes)
793 mapFunc = map if pool is None else pool.imap_unordered
795 def _partition_good_bad(file_data: Iterable[RawFileData]) -> Tuple[List[RawFileData], List[str]]:
796 """Filter out bad files and return good with list of bad."""
797 good_files = []
798 bad_files = []
799 for fileDatum in self.progress.wrap(file_data, desc="Reading image metadata", total=len(files)):
800 if not fileDatum.datasets:
801 bad_files.append(fileDatum.filename)
802 else:
803 good_files.append(fileDatum)
804 return good_files, bad_files
806 # Look for index files and read them.
807 # There should be far fewer index files than data files.
808 index_entries, files, good_index_files, bad_index_files = self.locateAndReadIndexFiles(files)
809 if bad_index_files:
810 self.log.info("Failed to read the following explicitly requested index files:"),
811 for bad in sorted(bad_index_files):
812 self.log.info("- %s", bad)
814 # Now convert all the index file entries to standard form for ingest.
815 bad_index_file_data = []
816 indexFileData = self.processIndexEntries(index_entries)
817 if indexFileData:
818 indexFileData, bad_index_file_data = _partition_good_bad(indexFileData)
819 self.log.info(
820 "Successfully extracted metadata for %d file%s found in %d index file%s with %d failure%s",
821 *_log_msg_counter(indexFileData),
822 *_log_msg_counter(good_index_files),
823 *_log_msg_counter(bad_index_file_data),
824 )
826 # Extract metadata and build per-detector regions.
827 # This could run in a subprocess so collect all output
828 # before looking at failures.
829 fileData: Iterator[RawFileData] = mapFunc(self.extractMetadata, files)
831 # Filter out all the failed reads and store them for later
832 # reporting.
833 fileData, bad_files = _partition_good_bad(fileData)
834 self.log.info(
835 "Successfully extracted metadata from %d file%s with %d failure%s",
836 *_log_msg_counter(fileData),
837 *_log_msg_counter(bad_files),
838 )
840 # Combine with data from index files.
841 fileData.extend(indexFileData)
842 bad_files.extend(bad_index_file_data)
843 bad_files.extend(bad_index_files)
845 # Use that metadata to group files (and extracted metadata) by
846 # exposure. Never parallelized because it's intrinsically a gather
847 # step.
848 exposureData: List[RawExposureData] = self.groupByExposure(fileData)
850 # The next operation operates on RawExposureData instances (one at
851 # a time) in-place and then returns the modified instance. We call it
852 # as a pass-through instead of relying on the arguments we pass in to
853 # have been modified because in the parallel case those arguments are
854 # going to be pickled and unpickled, and I'm not certain
855 # multiprocessing is careful enough with that for output arguments to
856 # work.
858 # Expand the data IDs to include all dimension metadata; we need this
859 # because we may need to generate path templates that rely on that
860 # metadata.
861 # This is the first step that involves actual database calls (but just
862 # SELECTs), so if there's going to be a problem with connections vs.
863 # multiple processes, or lock contention (in SQLite) slowing things
864 # down, it'll happen here.
865 return mapFunc(self.expandDataIds, exposureData), bad_files
867 def ingestExposureDatasets(
868 self,
869 exposure: RawExposureData,
870 *,
871 run: Optional[str] = None,
872 skip_existing_exposures: bool = False,
873 ) -> List[FileDataset]:
874 """Ingest all raw files in one exposure.
876 Parameters
877 ----------
878 exposure : `RawExposureData`
879 A structure containing information about the exposure to be
880 ingested. Must have `RawExposureData.record` populated and all
881 data ID attributes expanded.
882 run : `str`, optional
883 Name of a RUN-type collection to write to, overriding
884 ``self.butler.run``.
885 skip_existing_exposures : `bool`, optional
886 If `True` (`False` is default), skip raws that have already been
887 ingested (i.e. raws for which we already have a dataset with the
888 same data ID in the target collection, even if from another file).
889 Note that this is much slower than just not passing
890 already-ingested files as inputs, because we still need to read and
891 process metadata to identify which exposures to search for. It
892 also will not work reliably if multiple processes are attempting to
893 ingest raws from the same exposure concurrently, in that different
894 processes may still attempt to ingest the same raw and conflict,
895 causing a failure that prevents other raws from the same exposure
896 from being ingested.
898 Returns
899 -------
900 datasets : `list` of `lsst.daf.butler.FileDataset`
901 Per-file structures identifying the files ingested and their
902 dataset representation in the data repository.
903 """
904 if skip_existing_exposures:
905 existing = {
906 ref.dataId
907 for ref in self.butler.registry.queryDatasets(
908 self.datasetType,
909 collections=[run],
910 dataId=exposure.dataId,
911 )
912 }
913 else:
914 existing = set()
915 datasets = []
916 for file in exposure.files:
917 refs = [DatasetRef(self.datasetType, d.dataId) for d in file.datasets if d.dataId not in existing]
918 if refs:
919 datasets.append(
920 FileDataset(path=file.filename.abspath(), refs=refs, formatter=file.FormatterClass)
921 )
923 # Raw files are preferentially ingested using a UUID derived from
924 # the collection name and dataId.
925 if self.butler.registry.supportsIdGenerationMode(DatasetIdGenEnum.DATAID_TYPE_RUN):
926 mode = DatasetIdGenEnum.DATAID_TYPE_RUN
927 else:
928 mode = DatasetIdGenEnum.UNIQUE
929 self.butler.ingest(*datasets, transfer=self.config.transfer, run=run, idGenerationMode=mode)
930 return datasets
932 def ingestFiles(
933 self,
934 files,
935 *,
936 pool: Optional[Pool] = None,
937 processes: int = 1,
938 run: Optional[str] = None,
939 skip_existing_exposures: bool = False,
940 update_exposure_records: bool = False,
941 ):
942 """Ingest files into a Butler data repository.
944 This creates any new exposure or visit Dimension entries needed to
945 identify the ingested files, creates new Dataset entries in the
946 Registry and finally ingests the files themselves into the Datastore.
947 Any needed instrument, detector, and physical_filter Dimension entries
948 must exist in the Registry before `run` is called.
950 Parameters
951 ----------
952 files : iterable over `lsst.resources.ResourcePath`
953 URIs to the files to be ingested.
954 pool : `multiprocessing.Pool`, optional
955 If not `None`, a process pool with which to parallelize some
956 operations.
957 processes : `int`, optional
958 The number of processes to use. Ignored if ``pool`` is not `None`.
959 run : `str`, optional
960 Name of a RUN-type collection to write to, overriding
961 the default derived from the instrument name.
962 skip_existing_exposures : `bool`, optional
963 If `True` (`False` is default), skip raws that have already been
964 ingested (i.e. raws for which we already have a dataset with the
965 same data ID in the target collection, even if from another file).
966 Note that this is much slower than just not passing
967 already-ingested files as inputs, because we still need to read and
968 process metadata to identify which exposures to search for. It
969 also will not work reliably if multiple processes are attempting to
970 ingest raws from the same exposure concurrently, in that different
971 processes may still attempt to ingest the same raw and conflict,
972 causing a failure that prevents other raws from the same exposure
973 from being ingested.
974 update_exposure_records : `bool`, optional
975 If `True` (`False` is default), update existing exposure records
976 that conflict with the new ones instead of rejecting them. THIS IS
977 AN ADVANCED OPTION THAT SHOULD ONLY BE USED TO FIX METADATA THAT IS
978 KNOWN TO BE BAD. This should usually be combined with
979 ``skip_existing_exposures=True``.
981 Returns
982 -------
983 refs : `list` of `lsst.daf.butler.DatasetRef`
984 Dataset references for ingested raws.
bad_files : `list` of `lsst.resources.ResourcePath`
Given files that could not be ingested.
n_exposures : `int`
Number of exposures successfully ingested.
n_exposures_failed : `int`
Number of exposures that failed when inserting dimension data.
n_ingests_failed : `int`
Number of exposures that failed when ingesting raw datasets.
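
        Examples
        --------
        A sketch of unpacking the return value, assuming ``task`` is an
        already constructed `RawIngestTask` and ``files`` is a list of
        `lsst.resources.ResourcePath`; the run collection is illustrative.

        .. code-block:: python

            refs, bad_files, n_ok, n_exp_failed, n_ingest_failed = task.ingestFiles(
                files, run="DummyCam/raw/all"
            )
            print(f"Ingested {len(refs)} datasets; {len(bad_files)} bad files")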
985 """
987 exposureData, bad_files = self.prep(files, pool=pool, processes=processes)
989 # Up to this point, we haven't modified the data repository at all.
990 # Now we finally do that, with one transaction per exposure. This is
991 # not parallelized at present because the performance of this step is
992 # limited by the database server. That may or may not change in the
993 # future once we increase our usage of bulk inserts and reduce our
994 # usage of savepoints; we've tried to get everything but the database
995 # operations done in advance to reduce the time spent inside
996 # transactions.
997 self.butler.registry.registerDatasetType(self.datasetType)
999 refs = []
1000 runs = set()
1001 n_exposures = 0
1002 n_exposures_failed = 0
1003 n_ingests_failed = 0
1004 for exposure in self.progress.wrap(exposureData, desc="Ingesting raw exposures"):
1006 self.log.debug(
1007 "Attempting to ingest %d file%s from exposure %s:%s",
1008 *_log_msg_counter(exposure.files),
1009 exposure.record.instrument,
1010 exposure.record.obs_id,
1011 )
1013 try:
1014 inserted_or_updated = self.butler.registry.syncDimensionData(
1015 "exposure",
1016 exposure.record,
1017 update=update_exposure_records,
1018 )
1019 except Exception as e:
1020 self._on_ingest_failure(exposure, e)
1021 n_exposures_failed += 1
1022 self.log.warning(
1023 "Exposure %s:%s could not be registered: %s",
1024 exposure.record.instrument,
1025 exposure.record.obs_id,
1026 e,
1027 )
1028 if self.config.failFast:
1029 raise e
1030 continue
1032 if isinstance(inserted_or_updated, dict):
1033 # Exposure is in the registry and we updated it, so
1034 # syncDimensionData returned a dict.
1035 self.log.info(
1036 "Exposure %s:%s was already present, but columns %s were updated.",
1037 exposure.record.instrument,
1038 exposure.record.obs_id,
1039 str(list(inserted_or_updated.keys())),
1040 )
1042 # Override default run if nothing specified explicitly.
1043 if run is None:
1044 instrument = exposure.files[0].instrument
1045 this_run = instrument.makeDefaultRawIngestRunName()
1046 else:
1047 this_run = run
1048 if this_run not in runs:
1049 self.butler.registry.registerCollection(this_run, type=CollectionType.RUN)
1050 runs.add(this_run)
1051 try:
1052 datasets_for_exposure = self.ingestExposureDatasets(
1053 exposure,
1054 run=this_run,
1055 skip_existing_exposures=skip_existing_exposures,
1056 )
1057 except Exception as e:
1058 self._on_ingest_failure(exposure, e)
1059 n_ingests_failed += 1
1060 self.log.warning("Failed to ingest the following for reason: %s", e)
1061 for f in exposure.files:
1062 self.log.warning("- %s", f.filename)
1063 if self.config.failFast:
1064 raise e
1065 continue
1066 else:
1067 self._on_success(datasets_for_exposure)
1068 for dataset in datasets_for_exposure:
1069 refs.extend(dataset.refs)
1071 # Success for this exposure.
1072 n_exposures += 1
1073 self.log.info(
1074 "Exposure %s:%s ingested successfully", exposure.record.instrument, exposure.record.obs_id
1075 )
1077 return refs, bad_files, n_exposures, n_exposures_failed, n_ingests_failed
1079 @timeMethod
1080 def run(
1081 self,
1082 files,
1083 *,
1084 pool: Optional[Pool] = None,
1085 processes: int = 1,
1086 run: Optional[str] = None,
1087 file_filter: Union[str, re.Pattern] = r"\.fit[s]?\b",
1088 group_files: bool = True,
1089 skip_existing_exposures: bool = False,
1090 update_exposure_records: bool = False,
1091 ):
1092 """Ingest files into a Butler data repository.
1094 This creates any new exposure or visit Dimension entries needed to
1095 identify the ingested files, creates new Dataset entries in the
1096 Registry and finally ingests the files themselves into the Datastore.
1097 Any needed instrument, detector, and physical_filter Dimension entries
1098 must exist in the Registry before `run` is called.
1100 Parameters
1101 ----------
1102 files : iterable of `lsst.resources.ResourcePath`, `str` or path-like
1103 Paths to the files to be ingested. Can refer to directories.
1104 Will be made absolute if they are not already.
1105 pool : `multiprocessing.Pool`, optional
1106 If not `None`, a process pool with which to parallelize some
1107 operations.
1108 processes : `int`, optional
1109 The number of processes to use. Ignored if ``pool`` is not `None`.
1110 run : `str`, optional
1111 Name of a RUN-type collection to write to, overriding
1112 the default derived from the instrument name.
1113 file_filter : `str` or `re.Pattern`, optional
1114 Pattern to use to discover files to ingest within directories.
1115 The default is to search for FITS files. The regex applies to
1116 files within the directory.
1117 group_files : `bool`, optional
1118 Group files by directory if they have been discovered in
1119 directories. Will not affect files explicitly provided.
1120 skip_existing_exposures : `bool`, optional
1121 If `True` (`False` is default), skip raws that have already been
1122 ingested (i.e. raws for which we already have a dataset with the
1123 same data ID in the target collection, even if from another file).
1124 Note that this is much slower than just not passing
1125 already-ingested files as inputs, because we still need to read and
1126 process metadata to identify which exposures to search for. It
1127 also will not work reliably if multiple processes are attempting to
1128 ingest raws from the same exposure concurrently, in that different
1129 processes may still attempt to ingest the same raw and conflict,
1130 causing a failure that prevents other raws from the same exposure
1131 from being ingested.
1132 update_exposure_records : `bool`, optional
1133 If `True` (`False` is default), update existing exposure records
1134 that conflict with the new ones instead of rejecting them. THIS IS
1135 AN ADVANCED OPTION THAT SHOULD ONLY BE USED TO FIX METADATA THAT IS
1136 KNOWN TO BE BAD. This should usually be combined with
1137 ``skip_existing_exposures=True``.
1139 Returns
1140 -------
1141 refs : `list` of `lsst.daf.butler.DatasetRef`
1142 Dataset references for ingested raws.
1144 Notes
1145 -----
1146 This method inserts all datasets for an exposure within a transaction,
1147 guaranteeing that partial exposures are never ingested. The exposure
1148 dimension record is inserted with `Registry.syncDimensionData` first
1149 (in its own transaction), which inserts only if a record with the same
1150 primary key does not already exist. This allows different files within
1151 the same exposure to be ingested in different runs.
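
        Examples
        --------
        A sketch of a typical call, assuming ``task`` is an already
        constructed `RawIngestTask`; the directory path and run collection
        name are illustrative.

        .. code-block:: python

            refs = task.run(
                ["/data/raws/20240101/"],
                run="DummyCam/raw/all",
                file_filter=r"\.fits$",
                skip_existing_exposures=True,
            )
            print(f"Ingested {len(refs)} raw datasets")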
1152 """
1154 refs = []
1155 bad_files = []
1156 n_exposures = 0
1157 n_exposures_failed = 0
1158 n_ingests_failed = 0
1159 if group_files:
1160 for group in ResourcePath.findFileResources(files, file_filter, group_files):
1161 new_refs, bad, n_exp, n_exp_fail, n_ingest_fail = self.ingestFiles(
1162 group,
1163 pool=pool,
1164 processes=processes,
1165 run=run,
1166 skip_existing_exposures=skip_existing_exposures,
1167 update_exposure_records=update_exposure_records,
1168 )
1169 refs.extend(new_refs)
1170 bad_files.extend(bad)
1171 n_exposures += n_exp
1172 n_exposures_failed += n_exp_fail
1173 n_ingests_failed += n_ingest_fail
1174 else:
1175 refs, bad_files, n_exposures, n_exposures_failed, n_ingests_failed = self.ingestFiles(
1176 ResourcePath.findFileResources(files, file_filter, group_files),
1177 pool=pool,
1178 processes=processes,
1179 run=run,
1180 skip_existing_exposures=skip_existing_exposures,
1181 update_exposure_records=update_exposure_records,
1182 )
1184 had_failure = False
1186 if bad_files:
1187 had_failure = True
1188 self.log.warning("Could not extract observation metadata from the following:")
1189 for f in bad_files:
1190 self.log.warning("- %s", f)
1192 self.log.info(
1193 "Successfully processed data from %d exposure%s with %d failure%s from exposure"
1194 " registration and %d failure%s from file ingest.",
1195 *_log_msg_counter(n_exposures),
1196 *_log_msg_counter(n_exposures_failed),
1197 *_log_msg_counter(n_ingests_failed),
1198 )
1199 if n_exposures_failed > 0 or n_ingests_failed > 0:
1200 had_failure = True
1201 self.log.info("Ingested %d distinct Butler dataset%s", *_log_msg_counter(refs))
1203 if had_failure:
1204 raise RuntimeError("Some failures encountered during ingestion")
1206 return refs