Coverage for python/lsst/obs/base/ingest.py : 31%

# This file is part of obs_base.
#
# Developed for the LSST Data Management System.
# This product includes software developed by the LSST Project
# (https://www.lsst.org).
# See the COPYRIGHT file at the top-level directory of this distribution
# for details of code ownership.
#
# This program is free software: you can redistribute it and/or modify
# it under the terms of the GNU General Public License as published by
# the Free Software Foundation, either version 3 of the License, or
# (at your option) any later version.
#
# This program is distributed in the hope that it will be useful,
# but WITHOUT ANY WARRANTY; without even the implied warranty of
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
# GNU General Public License for more details.
#
# You should have received a copy of the GNU General Public License
# along with this program. If not, see <http://www.gnu.org/licenses/>.

__all__ = ("RawIngestTask", "RawIngestConfig", "makeTransferChoiceField")

import os.path
import itertools
from dataclasses import dataclass
from typing import List, Dict, Iterator, Iterable, Type, Optional, Any, Mapping
from collections import defaultdict
from multiprocessing import Pool

from astro_metadata_translator import ObservationInfo, fix_header, merge_headers
from lsst.utils import doImport
from lsst.afw.fits import readMetadata
from lsst.daf.butler import (
    Butler,
    DataCoordinate,
    DatasetRef,
    DatasetType,
    DimensionRecord,
    FileDataset,
)
from lsst.geom import Box2D
from lsst.pex.config import Config, Field, ChoiceField
from lsst.pipe.base import Task
from lsst.sphgeom import ConvexPolygon

from .instrument import makeExposureRecordFromObsInfo, makeVisitRecordFromObsInfo
from .fitsRawFormatterBase import FitsRawFormatterBase


@dataclass
class RawFileDatasetInfo:
    """Structure that holds information about a single dataset within a
    raw file.
    """

    dataId: DataCoordinate
    """Data ID for this file (`lsst.daf.butler.DataCoordinate`).

    This may be a minimal `~lsst.daf.butler.DataCoordinate` base instance, or
    a complete `~lsst.daf.butler.ExpandedDataCoordinate`.
    """

    obsInfo: ObservationInfo
    """Standardized observation metadata extracted directly from the file
    headers (`astro_metadata_translator.ObservationInfo`).
    """

    region: ConvexPolygon
    """Region on the sky covered by this file, possibly with padding
    (`lsst.sphgeom.ConvexPolygon`).
    """


@dataclass
class RawFileData:
    """Structure that holds information about a single raw file, used during
    ingest.
    """

    datasets: List[RawFileDatasetInfo]
    """The information describing each dataset within this raw file
    (`list` of `RawFileDatasetInfo`).
    """

    filename: str
    """Name of the file this information was extracted from (`str`).

    This is the path prior to ingest, not the path after ingest.
    """

    FormatterClass: Type[FitsRawFormatterBase]
    """Formatter class that should be used to ingest this file and compute
    a spatial region for it (`type`; a subclass of `FitsRawFormatterBase`).
    """


@dataclass
class RawExposureData:
    """Structure that holds information about a complete raw exposure, used
    during ingest.
    """

    dataId: DataCoordinate
    """Data ID for this exposure (`lsst.daf.butler.DataCoordinate`).

    This may be a minimal `~lsst.daf.butler.DataCoordinate` base instance, or
    a complete `~lsst.daf.butler.ExpandedDataCoordinate`.
    """

    files: List[RawFileData]
    """List of structures containing file-level information.
    """

    records: Optional[Dict[str, List[DimensionRecord]]] = None
    """Dictionary containing `DimensionRecord` instances that must be inserted
    into the `~lsst.daf.butler.Registry` prior to file-level ingest (`dict`).

    Keys are the names of dimension elements ("exposure" and optionally
    "visit" and "visit_detector_region"), while values are lists of
    `DimensionRecord`.

    May be `None` during some ingest steps.
    """


def makeTransferChoiceField(doc="How to transfer files (None for no transfer).", default=None):
    """Create a Config field with options for how to transfer files between
    data repositories.

    The allowed options for the field are exactly those supported by
    `lsst.daf.butler.Datastore.ingest`.

    Parameters
    ----------
    doc : `str`
        Documentation for the configuration field.
    default : `str`, optional
        Default transfer mode for the field; `None` means no transfer.

    Returns
    -------
    field : `lsst.pex.config.ChoiceField`
        Configuration field.
    """
    return ChoiceField(
        doc=doc,
        dtype=str,
        allowed={"move": "move",
                 "copy": "copy",
                 "auto": "choice will depend on datastore",
                 "link": "hard link falling back to symbolic link",
                 "hardlink": "hard link",
                 "symlink": "symbolic (soft) link"},
        optional=True,
        default=default,
    )


class RawIngestConfig(Config):
    transfer = makeTransferChoiceField()
    padRegionAmount = Field(
        dtype=int,
        default=0,
        doc="Pad an image with specified number of pixels before calculating region",
    )
    instrument = Field(
        doc=("Fully-qualified Python name of the `Instrument` subclass to "
             "associate with all raws."),
        dtype=str,
        optional=False,
        default=None,
    )
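

# A minimal configuration sketch, not part of the original module: how the
# fields above might be set before running ingest. The transfer mode and pad
# amount are just illustrative choices, and the instrument name is a
# hypothetical placeholder for a fully-qualified `Instrument` subclass from
# an obs package.
def _exampleIngestConfig() -> RawIngestConfig:
    """Illustrative helper showing typical `RawIngestConfig` settings."""
    config = RawIngestConfig()
    # Symbolically link files into the datastore instead of copying them.
    config.transfer = "symlink"
    # Pad each detector bounding box by 50 pixels before computing its
    # sky region.
    config.padRegionAmount = 50
    # Hypothetical fully-qualified Instrument subclass name.
    config.instrument = "lsst.obs.example.ExampleInstrument"
    return config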


class RawIngestTask(Task):
    """Driver Task for ingesting raw data into Gen3 Butler repositories.

    This Task is intended to be runnable from the command-line, but it doesn't
    meet the other requirements of CmdLineTask or PipelineTask, and wouldn't
    gain much from being one. It also wouldn't really be appropriate as a
    subtask of a CmdLineTask or PipelineTask; it's a Task essentially just to
    leverage the logging and configurability functionality that Task provides.

    Each instance of `RawIngestTask` writes to the same Butler. Each
    invocation of `RawIngestTask.run` ingests a list of files.

    Parameters
    ----------
    config : `RawIngestConfig`
        Configuration for the task.
    butler : `~lsst.daf.butler.Butler`
        Butler instance. Ingested Datasets will be created as part of
        ``butler.run`` and associated with its Collection.
    kwds
        Additional keyword arguments are forwarded to the
        `lsst.pipe.base.Task` constructor.
    """

    ConfigClass = RawIngestConfig

    _DefaultName = "ingest"

    def getDatasetType(self):
        """Return the DatasetType of the Datasets ingested by this Task.
        """
        return DatasetType("raw", ("instrument", "detector", "exposure"), "Exposure",
                           universe=self.butler.registry.dimensions)

    def __init__(self, config: Optional[RawIngestConfig] = None, *, butler: Butler, **kwds: Any):
        config.validate()  # Not a CmdlineTask nor PipelineTask, so have to validate the config here.
        super().__init__(config, **kwds)
        self.butler = butler
        self.universe = self.butler.registry.dimensions
        self.instrument = doImport(self.config.instrument)()
        # For now, we get a nominal Camera from the Instrument.
        # In the future, we may want to load one from a Butler calibration
        # collection that's appropriate for the observation timestamp of
        # the exposure.
        self.camera = self.instrument.getCamera()
        self.datasetType = self.getDatasetType()

    def extractMetadata(self, filename: str) -> RawFileData:
        """Extract and process metadata from a single raw file.

        Parameters
        ----------
        filename : `str`
            Path to the file.

        Returns
        -------
        data : `RawFileData`
            A structure containing the metadata extracted from the file,
            as well as the original filename. All fields will be populated,
            but the `RawFileDatasetInfo.dataId` attributes will be minimal
            (unexpanded) `DataCoordinate` instances.

        Notes
        -----
        Assumes that there is a single dataset associated with the given
        file. Instruments using a single file to store multiple datasets
        must implement their own version of this method.
        """
        # Manually merge the primary and "first data" headers here because we
        # do not know in general if an input file has set INHERIT=T.
        phdu = readMetadata(filename, 0)
        header = merge_headers([phdu, readMetadata(filename)], mode="overwrite")
        fix_header(header)
        datasets = [self._calculate_dataset_info(header, filename)]

        # The data model currently assumes that whilst multiple datasets
        # can be associated with a single file, they must all share the
        # same formatter.
        FormatterClass = self.instrument.getRawFormatter(datasets[0].dataId)

        return RawFileData(datasets=datasets, filename=filename,
                           FormatterClass=FormatterClass)
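
    # Illustrative sketch, not part of the original Task: an instrument whose
    # raw files hold one dataset per FITS extension might override
    # extractMetadata along these lines. The number of extensions and the
    # header layout assumed here are hypothetical.
    def _exampleExtractMetadataMultiExtension(self, filename: str,
                                              numExtensions: int = 2) -> RawFileData:
        """Sketch of a multi-dataset variant of `extractMetadata`."""
        phdu = readMetadata(filename, 0)
        datasets = []
        for hdu in range(1, numExtensions + 1):
            # Merge the primary header with each extension header so that
            # shared keywords are visible to the metadata translator.
            header = merge_headers([phdu, readMetadata(filename, hdu)], mode="overwrite")
            fix_header(header)
            datasets.append(self._calculate_dataset_info(header, filename))
        # All datasets in one file must share a formatter, so use the first
        # dataset's data ID to look it up.
        FormatterClass = self.instrument.getRawFormatter(datasets[0].dataId)
        return RawFileData(datasets=datasets, filename=filename,
                           FormatterClass=FormatterClass)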

    def _calculate_dataset_info(self, header, filename):
        """Calculate a RawFileDatasetInfo from the supplied information.

        Parameters
        ----------
        header : `Mapping`
            Header from the dataset.
        filename : `str`
            Filename to use for error messages.

        Returns
        -------
        dataset : `RawFileDatasetInfo`
            The region, dataId, and observation information associated with
            this dataset.
        """
        obsInfo = ObservationInfo(header)
        dataId = DataCoordinate.standardize(instrument=obsInfo.instrument,
                                            exposure=obsInfo.exposure_id,
                                            detector=obsInfo.detector_num,
                                            universe=self.universe)
        if obsInfo.instrument != self.instrument.getName():
            raise ValueError(f"Incorrect instrument (expected {self.instrument.getName()}, "
                             f"got {obsInfo.instrument}) for file {filename}.")

        FormatterClass = self.instrument.getRawFormatter(dataId)
        region = self._calculate_region_from_dataset_metadata(obsInfo, header, FormatterClass)
        return RawFileDatasetInfo(obsInfo=obsInfo, region=region, dataId=dataId)

    def _calculate_region_from_dataset_metadata(self, obsInfo, header, FormatterClass):
        """Calculate the sky region covered by the supplied observation
        information.

        Parameters
        ----------
        obsInfo : `~astro_metadata_translator.ObservationInfo`
            Summary information of this dataset.
        header : `Mapping`
            Header from the dataset.
        FormatterClass : `type` as subclass of `FitsRawFormatterBase`
            Formatter class that should be used to compute the spatial region.

        Returns
        -------
        region : `lsst.sphgeom.ConvexPolygon`
            Region of sky covered by this observation.
        """
        if obsInfo.visit_id is not None and obsInfo.tracking_radec is not None:
            formatter = FormatterClass.fromMetadata(metadata=header, obsInfo=obsInfo)
            visitInfo = formatter.makeVisitInfo()
            detector = self.camera[obsInfo.detector_num]
            wcs = formatter.makeWcs(visitInfo, detector)
            pixBox = Box2D(detector.getBBox())
            if self.config.padRegionAmount > 0:
                pixBox.grow(self.config.padRegionAmount)
            pixCorners = pixBox.getCorners()
            sphCorners = [wcs.pixelToSky(point).getVector() for point in pixCorners]
            region = ConvexPolygon(sphCorners)
        else:
            region = None
        return region
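
    # Illustrative sketch, not part of the original Task: the effect of
    # ``padRegionAmount`` on the pixel bounding box used above, isolated from
    # the camera and WCS machinery. The 2048x2048 detector size is a
    # hypothetical placeholder.
    @staticmethod
    def _examplePadPixelBox(padRegionAmount: int = 50) -> Box2D:
        """Sketch of padding a detector bounding box before its corners are
        converted to sky coordinates."""
        from lsst.geom import Box2I, Point2I, Extent2I
        bbox = Box2I(Point2I(0, 0), Extent2I(2048, 2048))
        pixBox = Box2D(bbox)
        if padRegionAmount > 0:
            # grow() expands the box by the given amount on every side, so
            # the padded box is (2048 + 2*padRegionAmount) pixels on a side.
            pixBox.grow(padRegionAmount)
        # The four corners of this padded box are what the method above maps
        # through the WCS and feeds to ConvexPolygon.
        return pixBox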

    def groupByExposure(self, files: Iterable[RawFileData]) -> List[RawExposureData]:
        """Group an iterable of `RawFileData` by exposure.

        Parameters
        ----------
        files : iterable of `RawFileData`
            File-level information to group.

        Returns
        -------
        exposures : `list` of `RawExposureData`
            A list of structures that group the file-level information by
            exposure. The `RawExposureData.records` attributes of elements
            will be `None`, but all other fields will be populated. The
            `RawExposureData.dataId` attributes will be minimal (unexpanded)
            `DataCoordinate` instances.
        """
        exposureDimensions = self.universe["exposure"].graph
        byExposure = defaultdict(list)
        for f in files:
            # Assume that the first dataset is representative for the file.
            byExposure[f.datasets[0].dataId.subset(exposureDimensions)].append(f)

        return [RawExposureData(dataId=dataId, files=exposureFiles)
                for dataId, exposureFiles in byExposure.items()]

    def collectDimensionRecords(self, exposure: RawExposureData) -> RawExposureData:
        """Collect the `DimensionRecord` instances that must be inserted into
        the `~lsst.daf.butler.Registry` before an exposure's raw files may be
        ingested.

        Parameters
        ----------
        exposure : `RawExposureData`
            A structure containing information about the exposure to be
            ingested. Should be considered consumed upon return.

        Returns
        -------
        exposure : `RawExposureData`
            An updated version of the input structure, with
            `RawExposureData.records` populated.
        """
        firstFile = exposure.files[0]
        firstDataset = firstFile.datasets[0]
        VisitDetectorRegionRecordClass = self.universe["visit_detector_region"].RecordClass
        exposure.records = {
            "exposure": [makeExposureRecordFromObsInfo(firstDataset.obsInfo, self.universe)],
        }
        if firstDataset.obsInfo.visit_id is not None:
            exposure.records["visit_detector_region"] = []
            visitVertices = []
            for file in exposure.files:
                for dataset in file.datasets:
                    if dataset.obsInfo.visit_id != firstDataset.obsInfo.visit_id:
                        raise ValueError(f"Inconsistent visit/exposure relationship for "
                                         f"exposure {firstDataset.obsInfo.exposure_id} between "
                                         f"{file.filename} and {firstFile.filename}: "
                                         f"{dataset.obsInfo.visit_id} != {firstDataset.obsInfo.visit_id}.")
                    if dataset.region is None:
                        self.log.warn("No region found for visit=%s, detector=%s.",
                                      dataset.obsInfo.visit_id, dataset.obsInfo.detector_num)
                        continue
                    visitVertices.extend(dataset.region.getVertices())
                    exposure.records["visit_detector_region"].append(
                        VisitDetectorRegionRecordClass.fromDict({
                            "instrument": dataset.obsInfo.instrument,
                            "visit": dataset.obsInfo.visit_id,
                            "detector": dataset.obsInfo.detector_num,
                            "region": dataset.region,
                        })
                    )
            if visitVertices:
                visitRegion = ConvexPolygon(visitVertices)
            else:
                self.log.warn("No region found for visit=%s.", firstDataset.obsInfo.visit_id)
                visitRegion = None
            exposure.records["visit"] = [
                makeVisitRecordFromObsInfo(firstDataset.obsInfo, self.universe, region=visitRegion)
            ]
        return exposure

    def expandDataIds(self, data: RawExposureData) -> RawExposureData:
        """Expand the data IDs associated with a raw exposure to include
        additional metadata records.

        Parameters
        ----------
        data : `RawExposureData`
            A structure containing information about the exposure to be
            ingested. Must have `RawExposureData.records` populated. Should
            be considered consumed upon return.

        Returns
        -------
        exposure : `RawExposureData`
            An updated version of the input structure, with
            `RawExposureData.dataId` and nested `RawFileDatasetInfo.dataId`
            attributes containing `~lsst.daf.butler.ExpandedDataCoordinate`
            instances.
        """
        hasVisit = "visit" in data.records
        # We start by expanding the exposure-level data ID; we won't use that
        # directly in file ingest, but this lets us do some database lookups
        # once per exposure instead of once per file later.
        data.dataId = self.butler.registry.expandDataId(
            data.dataId,
            # We pass in the records we'll be inserting shortly so they aren't
            # looked up from the database. We do expect instrument and filter
            # records to be retrieved from the database here (though the
            # Registry may cache them so there isn't a lookup every time).
            records={
                "exposure": data.records["exposure"][0],
                "visit": data.records["visit"][0] if hasVisit else None,
            }
        )
        # Now we expand the per-file (exposure+detector) data IDs. This time
        # we pass in the records we just retrieved from the exposure data ID
        # expansion as well as the visit_detector_region record, if there is
        # one.
        vdrRecords = data.records["visit_detector_region"] if hasVisit else itertools.repeat(None)
        for file, vdrRecord in zip(data.files, vdrRecords):
            for dataset in file.datasets:
                dataset.dataId = self.butler.registry.expandDataId(
                    dataset.dataId,
                    records=dict(data.dataId.records, visit_detector_region=vdrRecord)
                )
        return data

    def prep(self, files, pool: Optional[Pool] = None, processes: int = 1) -> Iterator[RawExposureData]:
        """Perform all ingest preprocessing steps that do not involve actually
        modifying the database.

        Parameters
        ----------
        files : iterable over `str` or path-like objects
            Paths to the files to be ingested. Will be made absolute
            if they are not already.
        pool : `multiprocessing.Pool`, optional
            If not `None`, a process pool with which to parallelize some
            operations.
        processes : `int`, optional
            The number of processes to use. Ignored if ``pool`` is not `None`.

        Yields
        ------
        exposure : `RawExposureData`
            Data structures containing dimension records, filenames, and data
            IDs to be ingested (one structure for each exposure).
        """
        if pool is None and processes > 1:
            pool = Pool(processes)
        mapFunc = map if pool is None else pool.imap_unordered

        # Extract metadata and build per-detector regions.
        fileData: Iterator[RawFileData] = mapFunc(self.extractMetadata, files)

        # Use that metadata to group files (and extracted metadata) by
        # exposure. Never parallelized because it's intrinsically a gather
        # step.
        exposureData: List[RawExposureData] = self.groupByExposure(fileData)

        # The next few operations operate on RawExposureData instances (one
        # at a time) in-place and then return the modified instance. We call
        # them as pass-throughs instead of relying on the arguments we pass
        # in to have been modified because in the parallel case those
        # arguments are going to be pickled and unpickled, and I'm not
        # certain multiprocessing is careful enough with that for output
        # arguments to work. We use the same variable names to reflect the
        # fact that we consider the arguments to have been
        # consumed/invalidated.

        # Extract DimensionRecords from the metadata that will need to be
        # inserted into the Registry before the raw datasets themselves are
        # ingested.
        exposureData: Iterator[RawExposureData] = mapFunc(self.collectDimensionRecords, exposureData)

        # Expand the data IDs to include all dimension metadata; we need this
        # because we may need to generate path templates that rely on that
        # metadata.
        # This is the first step that involves actual database calls (but
        # just SELECTs), so if there's going to be a problem with connections
        # vs. multiple processes, or lock contention (in SQLite) slowing
        # things down, it'll happen here.
        return mapFunc(self.expandDataIds, exposureData)
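
    # Illustrative sketch, not part of the original Task: the serial/parallel
    # dispatch pattern used in prep() above, isolated so it can be read on
    # its own. os.path.abspath stands in for a per-file step such as
    # extractMetadata; it is used only because it is a picklable,
    # module-level callable that a process pool can ship to worker processes.
    @staticmethod
    def _exampleMapFuncPattern(filenames: Iterable[str],
                               pool: Optional[Pool] = None) -> Iterator[str]:
        """Sketch of choosing between built-in map and Pool.imap_unordered."""
        # With no pool this is a lazy, serial iterator; with a pool the work
        # is distributed to worker processes and results come back in
        # completion order, which is why later steps must not rely on input
        # ordering.
        mapFunc = map if pool is None else pool.imap_unordered
        return mapFunc(os.path.abspath, filenames)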

    def insertDimensionData(self, records: Mapping[str, List[DimensionRecord]]):
        """Insert dimension records for one or more exposures.

        Parameters
        ----------
        records : `dict` mapping `str` to `list`
            Dimension records to be inserted, organized as a mapping from
            dimension name to a list of records for that dimension. This
            may be a single `RawExposureData.records` dict, or an aggregate
            for multiple exposures created by concatenating the value lists
            of those dictionaries.
        """
        # TODO: This currently assumes that either duplicate inserts of
        # visit records are ignored, or there is exactly one visit per
        # exposure. I expect us to switch up the visit-exposure
        # relationship and hence rewrite some of this code before that
        # becomes a practical problem.
        # Iterate over dimensions explicitly to order for foreign key
        # relationships.
        for dimension in ("visit", "exposure", "visit_detector_region"):
            recordsForDimension = records.get(dimension)
            if recordsForDimension:
                # TODO: once Registry has options to ignore or replace
                # existing dimension records with the same primary keys
                # instead of aborting on conflicts, add configuration
                # options and logic to use them.
                self.butler.registry.insertDimensionData(dimension, *recordsForDimension)

    def ingestExposureDatasets(self, exposure: RawExposureData, butler: Optional[Butler] = None
                               ) -> List[DatasetRef]:
        """Ingest all raw files in one exposure.

        Parameters
        ----------
        exposure : `RawExposureData`
            A structure containing information about the exposure to be
            ingested. Must have `RawExposureData.records` populated and all
            data ID attributes expanded.
        butler : `lsst.daf.butler.Butler`, optional
            Butler to use for ingest. If not provided, ``self.butler`` will
            be used.

        Returns
        -------
        refs : `list` of `lsst.daf.butler.DatasetRef`
            Dataset references for ingested raws.
        """
        if butler is None:
            butler = self.butler
        datasets = [FileDataset(path=os.path.abspath(file.filename),
                                refs=[DatasetRef(self.datasetType, d.dataId) for d in file.datasets],
                                formatter=file.FormatterClass)
                    for file in exposure.files]
        butler.ingest(*datasets, transfer=self.config.transfer)
        return [ref for dataset in datasets for ref in dataset.refs]

    def run(self, files, pool: Optional[Pool] = None, processes: int = 1):
        """Ingest files into a Butler data repository.

        This creates any new exposure or visit Dimension entries needed to
        identify the ingested files, creates new Dataset entries in the
        Registry and finally ingests the files themselves into the Datastore.
        Any needed instrument, detector, and physical_filter Dimension
        entries must exist in the Registry before `run` is called.

        Parameters
        ----------
        files : iterable over `str` or path-like objects
            Paths to the files to be ingested. Will be made absolute
            if they are not already.
        pool : `multiprocessing.Pool`, optional
            If not `None`, a process pool with which to parallelize some
            operations.
        processes : `int`, optional
            The number of processes to use. Ignored if ``pool`` is not `None`.

        Returns
        -------
        refs : `list` of `lsst.daf.butler.DatasetRef`
            Dataset references for ingested raws.

        Notes
        -----
        This method inserts all records (dimensions and datasets) for an
        exposure within a transaction, guaranteeing that partial exposures
        are never ingested.
        """
        exposureData = self.prep(files, pool=pool, processes=processes)
        # Up to this point, we haven't modified the data repository at all.
        # Now we finally do that, with one transaction per exposure. This is
        # not parallelized at present because the performance of this step is
        # limited by the database server. That may or may not change in the
        # future once we increase our usage of bulk inserts and reduce our
        # usage of savepoints; we've tried to get everything but the database
        # operations done in advance to reduce the time spent inside
        # transactions.
        self.butler.registry.registerDatasetType(self.datasetType)
        refs = []
        for exposure in exposureData:
            with self.butler.transaction():
                self.insertDimensionData(exposure.records)
                refs.extend(self.ingestExposureDatasets(exposure))
        return refs
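

# Illustrative end-to-end sketch, not part of the original module: how
# RawIngestTask is typically driven from Python. The repository path, run
# collection, instrument class, file list, and process count are all
# hypothetical placeholders.
def _exampleRunIngest():
    """Sketch of constructing and running a `RawIngestTask`."""
    butler = Butler("/path/to/gen3/repo", run="raw/example")
    config = RawIngestConfig()
    config.transfer = "symlink"
    # Hypothetical fully-qualified Instrument subclass name; the matching
    # instrument, detector, and physical_filter dimension records must
    # already be registered in the repository.
    config.instrument = "lsst.obs.example.ExampleInstrument"
    task = RawIngestTask(config=config, butler=butler)
    # Each exposure is ingested inside its own transaction; the returned
    # DatasetRefs identify the newly ingested raws.
    refs = task.run(["/path/to/raw_0001.fits", "/path/to/raw_0002.fits"],
                    processes=4)
    return refs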