Coverage for python/lsst/obs/base/ingest.py : 31%

# This file is part of obs_base.
#
# Developed for the LSST Data Management System.
# This product includes software developed by the LSST Project
# (https://www.lsst.org).
# See the COPYRIGHT file at the top-level directory of this distribution
# for details of code ownership.
#
# This program is free software: you can redistribute it and/or modify
# it under the terms of the GNU General Public License as published by
# the Free Software Foundation, either version 3 of the License, or
# (at your option) any later version.
#
# This program is distributed in the hope that it will be useful,
# but WITHOUT ANY WARRANTY; without even the implied warranty of
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
# GNU General Public License for more details.
#
# You should have received a copy of the GNU General Public License
# along with this program. If not, see <http://www.gnu.org/licenses/>.

__all__ = ("RawIngestTask", "RawIngestConfig", "makeTransferChoiceField")

import os.path
import itertools
from dataclasses import dataclass
from typing import List, Dict, Iterator, Iterable, Type, Optional, Any, Mapping
from collections import defaultdict
from multiprocessing import Pool

from astro_metadata_translator import ObservationInfo, fix_header, merge_headers
from lsst.utils import doImport
from lsst.afw.fits import readMetadata
from lsst.daf.butler import (
    Butler,
    DataCoordinate,
    DatasetRef,
    DatasetType,
    DimensionRecord,
    FileDataset,
)
from lsst.obs.base.instrument import makeExposureRecordFromObsInfo, makeVisitRecordFromObsInfo
from lsst.geom import Box2D
from lsst.pex.config import Config, Field, ChoiceField
from lsst.pipe.base import Task
from lsst.sphgeom import ConvexPolygon

from .fitsRawFormatterBase import FitsRawFormatterBase


@dataclass
class RawFileDatasetInfo:
    """Structure that holds information about a single dataset within a
    raw file.
    """

    dataId: DataCoordinate
    """Data ID for this file (`lsst.daf.butler.DataCoordinate`).

    This may be a minimal `~lsst.daf.butler.DataCoordinate` base instance, or
    a complete `~lsst.daf.butler.ExpandedDataCoordinate`.
    """

    obsInfo: ObservationInfo
    """Standardized observation metadata extracted directly from the file
    headers (`astro_metadata_translator.ObservationInfo`).
    """

    region: ConvexPolygon
    """Region on the sky covered by this file, possibly with padding
    (`lsst.sphgeom.ConvexPolygon`).
    """


@dataclass
class RawFileData:
    """Structure that holds information about a single raw file, used during
    ingest.
    """

    datasets: List[RawFileDatasetInfo]
    """The information describing each dataset within this raw file
    (`list` of `RawFileDatasetInfo`).
    """

    filename: str
    """Name of the file this information was extracted from (`str`).

    This is the path prior to ingest, not the path after ingest.
    """

    FormatterClass: Type[FitsRawFormatterBase]
    """Formatter class that should be used to ingest this file and compute
    a spatial region for it (`type`; as subclass of `FitsRawFormatterBase`).
    """


@dataclass
class RawExposureData:
    """Structure that holds information about a complete raw exposure, used
    during ingest.
    """

    dataId: DataCoordinate
    """Data ID for this exposure (`lsst.daf.butler.DataCoordinate`).

    This may be a minimal `~lsst.daf.butler.DataCoordinate` base instance, or
    a complete `~lsst.daf.butler.ExpandedDataCoordinate`.
    """

    files: List[RawFileData]
    """List of structures containing file-level information.
    """

    records: Optional[Dict[str, List[DimensionRecord]]] = None
    """Dictionary containing `DimensionRecord` instances that must be inserted
    into the `~lsst.daf.butler.Registry` prior to file-level ingest (`dict`).

    Keys are the names of dimension elements ("exposure" and optionally
    "visit" and "visit_detector_region"), while values are lists of
    `DimensionRecord`.

    May be `None` during some ingest steps.
    """


def makeTransferChoiceField(doc="How to transfer files (None for no transfer).", default=None):
    """Create a Config field with options for how to transfer files between
    data repositories.

    The allowed options for the field are exactly those supported by
    `lsst.daf.butler.Datastore.ingest`.

    Parameters
    ----------
    doc : `str`
        Documentation for the configuration field.
    default : `str`, optional
        Default transfer mode for the field, or `None` for no transfer.

    Returns
    -------
    field : `lsst.pex.config.ChoiceField`
        Configuration field.
    """
    return ChoiceField(
        doc=doc,
        dtype=str,
        allowed={"move": "move",
                 "copy": "copy",
                 "hardlink": "hard link",
                 "symlink": "symbolic (soft) link"},
        optional=True,
        default=default
    )
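
# A minimal usage sketch for overriding the default transfer mode when a
# config class defines this field ("symlink" is just an illustrative choice):
#
#     transfer = makeTransferChoiceField(default="symlink")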


class RawIngestConfig(Config):
    transfer = makeTransferChoiceField()
    padRegionAmount = Field(
        dtype=int,
        default=0,
        doc="Pad an image with the specified number of pixels before calculating its region."
    )
    instrument = Field(
        doc=("Fully-qualified Python name of the `Instrument` subclass to "
             "associate with all raws."),
        dtype=str,
        optional=False,
        default=None,
    )


class RawIngestTask(Task):
    """Driver Task for ingesting raw data into Gen3 Butler repositories.

    This Task is intended to be runnable from the command-line, but it doesn't
    meet the other requirements of CmdLineTask or PipelineTask, and wouldn't
    gain much from being one. It also wouldn't really be appropriate as a
    subtask of a CmdLineTask or PipelineTask; it's a Task essentially just to
    leverage the logging and configurability functionality that `Task`
    provides.

    Each instance of `RawIngestTask` writes to the same Butler. Each
    invocation of `RawIngestTask.run` ingests a list of files.

    Parameters
    ----------
    config : `RawIngestConfig`
        Configuration for the task.
    butler : `~lsst.daf.butler.Butler`
        Butler instance. Ingested Datasets will be created as part of
        ``butler.run`` and associated with its Collection.
    kwds
        Additional keyword arguments are forwarded to the `lsst.pipe.base.Task`
        constructor.
    """

    ConfigClass = RawIngestConfig

    _DefaultName = "ingest"

    def getDatasetType(self):
        """Return the DatasetType of the Datasets ingested by this Task.
        """
        return DatasetType("raw", ("instrument", "detector", "exposure"), "Exposure",
                           universe=self.butler.registry.dimensions)

    def __init__(self, config: Optional[RawIngestConfig] = None, *, butler: Butler, **kwds: Any):
        config.validate()  # Not a CmdlineTask nor PipelineTask, so have to validate the config here.
        super().__init__(config, **kwds)
        self.butler = butler
        self.universe = self.butler.registry.dimensions
        self.instrument = doImport(self.config.instrument)()
        # For now, we get a nominal Camera from the Instrument.
        # In the future, we may want to load one from a Butler calibration
        # collection that's appropriate for the observation timestamp of
        # the exposure.
        self.camera = self.instrument.getCamera()
        self.datasetType = self.getDatasetType()

    def extractMetadata(self, filename: str) -> RawFileData:
        """Extract and process metadata from a single raw file.

        Parameters
        ----------
        filename : `str`
            Path to the file.

        Returns
        -------
        data : `RawFileData`
            A structure containing the metadata extracted from the file,
            as well as the original filename. All fields will be populated,
            but the `RawFileData.dataId` attribute will be a minimal
            (unexpanded) `DataCoordinate` instance.

        Notes
        -----
        Assumes that there is a single dataset associated with the given
        file. Instruments using a single file to store multiple datasets
        must implement their own version of this method.
        """
        # Manually merge the primary and "first data" headers here because we
        # do not know in general if an input file has set INHERIT=T.
        phdu = readMetadata(filename, 0)
        header = merge_headers([phdu, readMetadata(filename)], mode="overwrite")
        fix_header(header)
        datasets = [self._calculate_dataset_info(header, filename)]

        # The data model currently assumes that whilst multiple datasets
        # can be associated with a single file, they must all share the
        # same formatter.
        FormatterClass = self.instrument.getRawFormatter(datasets[0].dataId)

        return RawFileData(datasets=datasets, filename=filename,
                           FormatterClass=FormatterClass)

    def _calculate_dataset_info(self, header, filename):
        """Calculate a RawFileDatasetInfo from the supplied information.

        Parameters
        ----------
        header : `Mapping`
            Header from the dataset.
        filename : `str`
            Filename to use for error messages.

        Returns
        -------
        dataset : `RawFileDatasetInfo`
            The region, dataId, and observation information associated with
            this dataset.
        """
        obsInfo = ObservationInfo(header)
        dataId = DataCoordinate.standardize(instrument=obsInfo.instrument,
                                            exposure=obsInfo.exposure_id,
                                            detector=obsInfo.detector_num,
                                            universe=self.universe)
        if obsInfo.instrument != self.instrument.getName():
            raise ValueError(f"Incorrect instrument (expected {self.instrument.getName()}, "
                             f"got {obsInfo.instrument}) for file {filename}.")

        FormatterClass = self.instrument.getRawFormatter(dataId)
        region = self._calculate_region_from_dataset_metadata(obsInfo, header, FormatterClass)
        return RawFileDatasetInfo(obsInfo=obsInfo, region=region, dataId=dataId)

    def _calculate_region_from_dataset_metadata(self, obsInfo, header, FormatterClass):
        """Calculate the sky region covered by the supplied observation
        information.

        Parameters
        ----------
        obsInfo : `~astro_metadata_translator.ObservationInfo`
            Summary information of this dataset.
        header : `Mapping`
            Header from the dataset.
        FormatterClass : `type` as subclass of `FitsRawFormatterBase`
            Formatter class that should be used to compute the spatial region.

        Returns
        -------
        region : `lsst.sphgeom.ConvexPolygon`
            Region of sky covered by this observation.
        """
        if obsInfo.visit_id is not None and obsInfo.tracking_radec is not None:
            formatter = FormatterClass.fromMetadata(metadata=header, obsInfo=obsInfo)
            visitInfo = formatter.makeVisitInfo()
            detector = self.camera[obsInfo.detector_num]
            wcs = formatter.makeWcs(visitInfo, detector)
            pixBox = Box2D(detector.getBBox())
            if self.config.padRegionAmount > 0:
                pixBox.grow(self.config.padRegionAmount)
            pixCorners = pixBox.getCorners()
            sphCorners = [wcs.pixelToSky(point).getVector() for point in pixCorners]
            region = ConvexPolygon(sphCorners)
        else:
            region = None
        return region

    def groupByExposure(self, files: Iterable[RawFileData]) -> List[RawExposureData]:
        """Group an iterable of `RawFileData` by exposure.

        Parameters
        ----------
        files : iterable of `RawFileData`
            File-level information to group.

        Returns
        -------
        exposures : `list` of `RawExposureData`
            A list of structures that group the file-level information by
            exposure. The `RawExposureData.records` attributes of elements
            will be `None`, but all other fields will be populated. The
            `RawExposureData.dataId` attributes will be minimal (unexpanded)
            `DataCoordinate` instances.
        """
        exposureDimensions = self.universe["exposure"].graph
        byExposure = defaultdict(list)
        for f in files:
            # Assume that the first dataset is representative for the file.
            byExposure[f.datasets[0].dataId.subset(exposureDimensions)].append(f)

        return [RawExposureData(dataId=dataId, files=exposureFiles)
                for dataId, exposureFiles in byExposure.items()]
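
    # Grouping sketch with hypothetical inputs: given three files whose first
    # datasets carry the data IDs
    #
    #     {exposure: 100, detector: 1}, {exposure: 100, detector: 2},
    #     {exposure: 101, detector: 1}
    #
    # groupByExposure returns two RawExposureData instances, one holding the
    # two exposure-100 files and one holding the single exposure-101 file.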

    def collectDimensionRecords(self, exposure: RawExposureData) -> RawExposureData:
        """Collect the `DimensionRecord` instances that must be inserted into
        the `~lsst.daf.butler.Registry` before an exposure's raw files may be
        ingested.

        Parameters
        ----------
        exposure : `RawExposureData`
            A structure containing information about the exposure to be
            ingested. Should be considered consumed upon return.

        Returns
        -------
        exposure : `RawExposureData`
            An updated version of the input structure, with
            `RawExposureData.records` populated.
        """
        firstFile = exposure.files[0]
        firstDataset = firstFile.datasets[0]
        VisitDetectorRegionRecordClass = self.universe["visit_detector_region"].RecordClass
        exposure.records = {
            "exposure": [makeExposureRecordFromObsInfo(firstDataset.obsInfo, self.universe)],
        }
        if firstDataset.obsInfo.visit_id is not None:
            exposure.records["visit_detector_region"] = []
            visitVertices = []
            for file in exposure.files:
                for dataset in file.datasets:
                    if dataset.obsInfo.visit_id != firstDataset.obsInfo.visit_id:
                        raise ValueError(f"Inconsistent visit/exposure relationship for "
                                         f"exposure {firstDataset.obsInfo.exposure_id} between "
                                         f"{file.filename} and {firstFile.filename}: "
                                         f"{dataset.obsInfo.visit_id} != "
                                         f"{firstDataset.obsInfo.visit_id}.")
                    if dataset.region is None:
                        self.log.warn("No region found for visit=%s, detector=%s.",
                                      dataset.obsInfo.visit_id, dataset.obsInfo.detector_num)
                        continue
                    visitVertices.extend(dataset.region.getVertices())
                    exposure.records["visit_detector_region"].append(
                        VisitDetectorRegionRecordClass.fromDict({
                            "instrument": dataset.obsInfo.instrument,
                            "visit": dataset.obsInfo.visit_id,
                            "detector": dataset.obsInfo.detector_num,
                            "region": dataset.region,
                        })
                    )
            if visitVertices:
                visitRegion = ConvexPolygon(visitVertices)
            else:
                self.log.warn("No region found for visit=%s.", firstDataset.obsInfo.visit_id)
                visitRegion = None
            exposure.records["visit"] = [
                makeVisitRecordFromObsInfo(firstDataset.obsInfo, self.universe, region=visitRegion)
            ]
        return exposure

    def expandDataIds(self, data: RawExposureData) -> RawExposureData:
        """Expand the data IDs associated with a raw exposure to include
        additional metadata records.

        Parameters
        ----------
        data : `RawExposureData`
            A structure containing information about the exposure to be
            ingested. Must have `RawExposureData.records` populated. Should
            be considered consumed upon return.

        Returns
        -------
        exposure : `RawExposureData`
            An updated version of the input structure, with
            `RawExposureData.dataId` and nested `RawFileData.dataId` attributes
            containing `~lsst.daf.butler.ExpandedDataCoordinate` instances.
        """
        hasVisit = "visit" in data.records
        # We start by expanding the exposure-level data ID; we won't use that
        # directly in file ingest, but this lets us do some database lookups
        # once per exposure instead of once per file later.
        data.dataId = self.butler.registry.expandDataId(
            data.dataId,
            # We pass in the records we'll be inserting shortly so they aren't
            # looked up from the database. We do expect instrument and filter
            # records to be retrieved from the database here (though the
            # Registry may cache them so there isn't a lookup every time).
            records={
                "exposure": data.records["exposure"][0],
                "visit": data.records["visit"][0] if hasVisit else None,
            }
        )
        # Now we expand the per-file (exposure+detector) data IDs. This time
        # we pass in the records we just retrieved from the exposure data ID
        # expansion as well as the visit_detector_region record, if there is
        # one.
        vdrRecords = data.records["visit_detector_region"] if hasVisit else itertools.repeat(None)
        for file, vdrRecord in zip(data.files, vdrRecords):
            for dataset in file.datasets:
                dataset.dataId = self.butler.registry.expandDataId(
                    dataset.dataId,
                    records=dict(data.dataId.records, visit_detector_region=vdrRecord)
                )
        return data

    def prep(self, files, pool: Optional[Pool] = None, processes: int = 1) -> Iterator[RawExposureData]:
        """Perform all ingest preprocessing steps that do not involve actually
        modifying the database.

        Parameters
        ----------
        files : iterable over `str` or path-like objects
            Paths to the files to be ingested. Will be made absolute
            if they are not already.
        pool : `multiprocessing.Pool`, optional
            If not `None`, a process pool with which to parallelize some
            operations.
        processes : `int`, optional
            The number of processes to use. Ignored if ``pool`` is not `None`.

        Yields
        ------
        exposure : `RawExposureData`
            Data structures containing dimension records, filenames, and data
            IDs to be ingested (one structure for each exposure).
        """
        if pool is None and processes > 1:
            pool = Pool(processes)
        mapFunc = map if pool is None else pool.imap_unordered

        # Extract metadata and build per-detector regions.
        fileData: Iterator[RawFileData] = mapFunc(self.extractMetadata, files)

        # Use that metadata to group files (and extracted metadata) by
        # exposure. Never parallelized because it's intrinsically a gather
        # step.
        exposureData: List[RawExposureData] = self.groupByExposure(fileData)

        # The next few operations operate on RawExposureData instances (one at
        # a time) in-place and then return the modified instance. We call them
        # as pass-throughs instead of relying on the arguments we pass in to
        # have been modified because in the parallel case those arguments are
        # going to be pickled and unpickled, and I'm not certain
        # multiprocessing is careful enough with that for output arguments to
        # work. We use the same variable names to reflect the fact that we
        # consider the arguments to have been consumed/invalidated.

        # Extract DimensionRecords from the metadata that will need to be
        # inserted into the Registry before the raw datasets themselves are
        # ingested.
        exposureData: Iterator[RawExposureData] = mapFunc(self.collectDimensionRecords, exposureData)

        # Expand the data IDs to include all dimension metadata; we need this
        # because we may need to generate path templates that rely on that
        # metadata.
        # This is the first step that involves actual database calls (but just
        # SELECTs), so if there's going to be a problem with connections vs.
        # multiple processes, or lock contention (in SQLite) slowing things
        # down, it'll happen here.
        return mapFunc(self.expandDataIds, exposureData)

    def insertDimensionData(self, records: Mapping[str, List[DimensionRecord]]):
        """Insert dimension records for one or more exposures.

        Parameters
        ----------
        records : `dict` mapping `str` to `list`
            Dimension records to be inserted, organized as a mapping from
            dimension name to a list of records for that dimension. This
            may be a single `RawExposureData.records` dict, or an aggregate
            for multiple exposures created by concatenating the value lists
            of those dictionaries.
        """
        # TODO: This currently assumes that either duplicate inserts of
        # visit records are ignored, or there is exactly one visit per
        # exposure. I expect us to switch up the visit-exposure
        # relationship and hence rewrite some of this code before that
        # becomes a practical problem.
        # Iterate over dimensions explicitly to order for foreign key
        # relationships.
        for dimension in ("visit", "exposure", "visit_detector_region"):
            recordsForDimension = records.get(dimension)
            if recordsForDimension:
                # TODO: once Registry has options to ignore or replace
                # existing dimension records with the same primary keys
                # instead of aborting on conflicts, add configuration
                # options and logic to use them.
                self.butler.registry.insertDimensionData(dimension, *recordsForDimension)

    def ingestExposureDatasets(self, exposure: RawExposureData, butler: Optional[Butler] = None
                               ) -> List[DatasetRef]:
        """Ingest all raw files in one exposure.

        Parameters
        ----------
        exposure : `RawExposureData`
            A structure containing information about the exposure to be
            ingested. Must have `RawExposureData.records` populated and all
            data ID attributes expanded.
        butler : `lsst.daf.butler.Butler`, optional
            Butler to use for ingest. If not provided, ``self.butler`` will
            be used.

        Returns
        -------
        refs : `list` of `lsst.daf.butler.DatasetRef`
            Dataset references for ingested raws.
        """
        if butler is None:
            butler = self.butler
        datasets = [FileDataset(path=os.path.abspath(file.filename),
                                refs=[DatasetRef(self.datasetType, d.dataId) for d in file.datasets],
                                formatter=file.FormatterClass)
                    for file in exposure.files]
        butler.ingest(*datasets, transfer=self.config.transfer)
        return [ref for dataset in datasets for ref in dataset.refs]

    def run(self, files, pool: Optional[Pool] = None, processes: int = 1):
        """Ingest files into a Butler data repository.

        This creates any new exposure or visit Dimension entries needed to
        identify the ingested files, creates new Dataset entries in the
        Registry and finally ingests the files themselves into the Datastore.
        Any needed instrument, detector, and physical_filter Dimension entries
        must exist in the Registry before `run` is called.

        Parameters
        ----------
        files : iterable over `str` or path-like objects
            Paths to the files to be ingested. Will be made absolute
            if they are not already.
        pool : `multiprocessing.Pool`, optional
            If not `None`, a process pool with which to parallelize some
            operations.
        processes : `int`, optional
            The number of processes to use. Ignored if ``pool`` is not `None`.

        Returns
        -------
        refs : `list` of `lsst.daf.butler.DatasetRef`
            Dataset references for ingested raws.

        Notes
        -----
        This method inserts all records (dimensions and datasets) for an
        exposure within a transaction, guaranteeing that partial exposures
        are never ingested.
        """
        exposureData = self.prep(files, pool=pool, processes=processes)
        # Up to this point, we haven't modified the data repository at all.
        # Now we finally do that, with one transaction per exposure. This is
        # not parallelized at present because the performance of this step is
        # limited by the database server. That may or may not change in the
        # future once we increase our usage of bulk inserts and reduce our
        # usage of savepoints; we've tried to get everything but the database
        # operations done in advance to reduce the time spent inside
        # transactions.
        self.butler.registry.registerDatasetType(self.datasetType)
        refs = []
        for exposure in exposureData:
            with self.butler.transaction():
                self.insertDimensionData(exposure.records)
                refs.extend(self.ingestExposureDatasets(exposure))
        return refs