# This file is part of obs_base.
#
# Developed for the LSST Data Management System.
# This product includes software developed by the LSST Project
# (https://www.lsst.org).
# See the COPYRIGHT file at the top-level directory of this distribution
# for details of code ownership.
#
# This program is free software: you can redistribute it and/or modify
# it under the terms of the GNU General Public License as published by
# the Free Software Foundation, either version 3 of the License, or
# (at your option) any later version.
#
# This program is distributed in the hope that it will be useful,
# but WITHOUT ANY WARRANTY; without even the implied warranty of
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
# GNU General Public License for more details.
#
# You should have received a copy of the GNU General Public License
# along with this program. If not, see <http://www.gnu.org/licenses/>.

__all__ = ("RawIngestTask", "RawIngestConfig", "makeTransferChoiceField")

import os.path
import itertools
from dataclasses import dataclass
from typing import List, Dict, Iterator, Iterable, Type, Optional, Any, Mapping
from collections import defaultdict
from multiprocessing import Pool

from astro_metadata_translator import ObservationInfo, fix_header, merge_headers
from lsst.utils import doImport
from lsst.afw.fits import readMetadata
from lsst.daf.butler import (
    Butler,
    DataCoordinate,
    DatasetRef,
    DatasetType,
    DimensionRecord,
    FileDataset,
)
from lsst.geom import Box2D
from lsst.pex.config import Config, Field, ChoiceField
from lsst.pipe.base import Task
from lsst.sphgeom import ConvexPolygon

from .instrument import makeExposureRecordFromObsInfo, makeVisitRecordFromObsInfo
from .fitsRawFormatterBase import FitsRawFormatterBase


@dataclass
class RawFileDatasetInfo:
    """Structure that holds information about a single dataset within a
    raw file.
    """

    dataId: DataCoordinate
    """Data ID for this file (`lsst.daf.butler.DataCoordinate`).

    This may be a minimal `~lsst.daf.butler.DataCoordinate` base instance, or
    a complete `~lsst.daf.butler.ExpandedDataCoordinate`.
    """

    obsInfo: ObservationInfo
    """Standardized observation metadata extracted directly from the file
    headers (`astro_metadata_translator.ObservationInfo`).
    """

    region: ConvexPolygon
    """Region on the sky covered by this file, possibly with padding
    (`lsst.sphgeom.ConvexPolygon`).
    """


@dataclass
class RawFileData:
    """Structure that holds information about a single raw file, used during
    ingest.
    """

    datasets: List[RawFileDatasetInfo]
83 """The information describing each dataset within this raw file.
84 (`list` of `RawFileDatasetInfo`)
85 """

    filename: str
    """Name of the file this information was extracted from (`str`).

    This is the path prior to ingest, not the path after ingest.
    """

    FormatterClass: Type[FitsRawFormatterBase]
    """Formatter class that should be used to ingest this file and compute
    a spatial region for it (`type`; a subclass of `FitsRawFormatterBase`).
    """


@dataclass
class RawExposureData:
    """Structure that holds information about a complete raw exposure, used
    during ingest.
    """

    dataId: DataCoordinate
    """Data ID for this exposure (`lsst.daf.butler.DataCoordinate`).

    This may be a minimal `~lsst.daf.butler.DataCoordinate` base instance, or
    a complete `~lsst.daf.butler.ExpandedDataCoordinate`.
    """

    files: List[RawFileData]
    """List of structures containing file-level information.
    """

    records: Optional[Dict[str, List[DimensionRecord]]] = None
    """Dictionary containing `DimensionRecord` instances that must be inserted
    into the `~lsst.daf.butler.Registry` prior to file-level ingest (`dict`).

    Keys are the names of dimension elements ("exposure" and optionally "visit"
    and "visit_detector_region"), while values are lists of `DimensionRecord`.

    May be `None` during some ingest steps.
    """


def makeTransferChoiceField(doc="How to transfer files (None for no transfer).", default=None):
    """Create a Config field with options for how to transfer files between
    data repositories.

    The allowed options for the field are exactly those supported by
    `lsst.daf.butler.Datastore.ingest`.

    Parameters
    ----------
    doc : `str`
        Documentation for the configuration field.
    default : `str`, optional
        Default value for the field.

    Returns
    -------
    field : `lsst.pex.config.ChoiceField`
        Configuration field.
    """
    return ChoiceField(
        doc=doc,
        dtype=str,
        allowed={"move": "move",
                 "copy": "copy",
                 "auto": "choice will depend on datastore",
                 "link": "hard link falling back to symbolic link",
                 "hardlink": "hard link",
                 "symlink": "symbolic (soft) link",
                 "relsymlink": "relative symbolic link",
                 },
        optional=True,
        default=default
    )


class RawIngestConfig(Config):
    transfer = makeTransferChoiceField()
    padRegionAmount = Field(
        dtype=int,
        default=0,
        doc="Pad an image with the specified number of pixels before calculating region"
    )
    instrument = Field(
        doc=("Fully-qualified Python name of the `Instrument` subclass to "
             "associate with all raws."),
        dtype=str,
        optional=False,
        default=None,
    )
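
# Typical configuration, as a sketch (the instrument name below is
# illustrative; substitute the fully-qualified name of the `Instrument`
# subclass that matches your raws):
#
#     config = RawIngestConfig()
#     config.transfer = "symlink"     # any mode from makeTransferChoiceField
#     config.padRegionAmount = 50     # pad detector regions by 50 pixels
#     config.instrument = "full.path.to.SomeInstrument"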


class RawIngestTask(Task):
    """Driver Task for ingesting raw data into Gen3 Butler repositories.

    This Task is intended to be runnable from the command-line, but it doesn't
    meet the other requirements of CmdLineTask or PipelineTask, and wouldn't
    gain much from being one. It also wouldn't really be appropriate as a
    subtask of a CmdLineTask or PipelineTask; it's a Task essentially just to
    leverage the logging and configurability functionality that the Task
    framework provides.

    Each instance of `RawIngestTask` writes to the same Butler. Each
    invocation of `RawIngestTask.run` ingests a list of files.

    Parameters
    ----------
    config : `RawIngestConfig`
        Configuration for the task.
    butler : `~lsst.daf.butler.Butler`
        Butler instance. Ingested Datasets will be created as part of
        ``butler.run`` and associated with its Collection.
    kwds
        Additional keyword arguments are forwarded to the `lsst.pipe.base.Task`
        constructor.
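
    Examples
    --------
    A minimal usage sketch; the repository path, run name, instrument name,
    and file list are illustrative, not part of this module::

        butler = Butler("/path/to/repo", run="raw")
        config = RawIngestConfig()
        config.instrument = "full.path.to.SomeInstrument"
        task = RawIngestTask(config=config, butler=butler)
        refs = task.run(["raw_0001.fits", "raw_0002.fits"])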
200 """
202 ConfigClass = RawIngestConfig
204 _DefaultName = "ingest"
206 def getDatasetType(self):
207 """Return the DatasetType of the Datasets ingested by this Task.
208 """
209 return DatasetType("raw", ("instrument", "detector", "exposure"), "Exposure",
210 universe=self.butler.registry.dimensions)

    def __init__(self, config: Optional[RawIngestConfig] = None, *, butler: Butler, **kwds: Any):
        config.validate()  # Not a CmdlineTask nor PipelineTask, so have to validate the config here.
        super().__init__(config, **kwds)
        self.butler = butler
        self.universe = self.butler.registry.dimensions
        self.instrument = doImport(self.config.instrument)()
        # For now, we get a nominal Camera from the Instrument.
        # In the future, we may want to load one from a Butler calibration
        # collection that's appropriate for the observation timestamp of
        # the exposure.
        self.camera = self.instrument.getCamera()
        self.datasetType = self.getDatasetType()

    def extractMetadata(self, filename: str) -> RawFileData:
        """Extract and process metadata from a single raw file.

        Parameters
        ----------
        filename : `str`
            Path to the file.

        Returns
        -------
        data : `RawFileData`
            A structure containing the metadata extracted from the file,
            as well as the original filename. All fields will be populated,
            but the `RawFileDatasetInfo.dataId` attributes will be minimal
            (unexpanded) `DataCoordinate` instances.

        Notes
        -----
        Assumes that there is a single dataset associated with the given
        file. Instruments using a single file to store multiple datasets
        must implement their own version of this method.
        """
        # Manually merge the primary and "first data" headers here because we
        # do not know in general if an input file has set INHERIT=T.
        phdu = readMetadata(filename, 0)
        header = merge_headers([phdu, readMetadata(filename)], mode="overwrite")
        fix_header(header)
        datasets = [self._calculate_dataset_info(header, filename)]

        # The data model currently assumes that whilst multiple datasets
        # can be associated with a single file, they must all share the
        # same formatter.
        FormatterClass = self.instrument.getRawFormatter(datasets[0].dataId)

        return RawFileData(datasets=datasets, filename=filename,
                           FormatterClass=FormatterClass)

    def _calculate_dataset_info(self, header, filename):
        """Calculate a RawFileDatasetInfo from the supplied information.

        Parameters
        ----------
        header : `Mapping`
            Header from the dataset.
        filename : `str`
            Filename to use for error messages.

        Returns
        -------
        dataset : `RawFileDatasetInfo`
            The region, dataId, and observation information associated with
            this dataset.
        """
        obsInfo = ObservationInfo(header)
        dataId = DataCoordinate.standardize(instrument=obsInfo.instrument,
                                            exposure=obsInfo.exposure_id,
                                            detector=obsInfo.detector_num,
                                            universe=self.universe)
        if obsInfo.instrument != self.instrument.getName():
            raise ValueError(f"Incorrect instrument (expected {self.instrument.getName()}, "
                             f"got {obsInfo.instrument}) for file {filename}.")

        FormatterClass = self.instrument.getRawFormatter(dataId)
        region = self._calculate_region_from_dataset_metadata(obsInfo, header, FormatterClass)
        return RawFileDatasetInfo(obsInfo=obsInfo, region=region, dataId=dataId)

    def _calculate_region_from_dataset_metadata(self, obsInfo, header, FormatterClass):
        """Calculate the sky region covered by the supplied observation
        information.

        Parameters
        ----------
        obsInfo : `~astro_metadata_translator.ObservationInfo`
            Summary information of this dataset.
        header : `Mapping`
            Header from the dataset.
        FormatterClass : `type` as subclass of `FitsRawFormatterBase`
            Formatter class that should be used to compute the spatial region.

        Returns
        -------
        region : `lsst.sphgeom.ConvexPolygon`
            Region of sky covered by this observation.
        """
        if obsInfo.visit_id is not None and obsInfo.tracking_radec is not None:
            formatter = FormatterClass.fromMetadata(metadata=header, obsInfo=obsInfo)
            visitInfo = formatter.makeVisitInfo()
            detector = self.camera[obsInfo.detector_num]
            wcs = formatter.makeWcs(visitInfo, detector)
            pixBox = Box2D(detector.getBBox())
            if self.config.padRegionAmount > 0:
                pixBox.grow(self.config.padRegionAmount)
            pixCorners = pixBox.getCorners()
            sphCorners = [wcs.pixelToSky(point).getVector() for point in pixCorners]
            region = ConvexPolygon(sphCorners)
        else:
            region = None
        return region

    def groupByExposure(self, files: Iterable[RawFileData]) -> List[RawExposureData]:
        """Group an iterable of `RawFileData` by exposure.

        Parameters
        ----------
        files : iterable of `RawFileData`
            File-level information to group.

        Returns
        -------
        exposures : `list` of `RawExposureData`
            A list of structures that group the file-level information by
            exposure. The `RawExposureData.records` attributes of elements
            will be `None`, but all other fields will be populated. The
            `RawExposureData.dataId` attributes will be minimal (unexpanded)
            `DataCoordinate` instances.
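
        Examples
        --------
        A simplified sketch of the grouping pattern used below, with plain
        keys standing in for data IDs (``exposureKeyOf`` is illustrative,
        not part of this module)::

            byExposure = defaultdict(list)
            for f in files:
                byExposure[exposureKeyOf(f)].append(f)
            exposures = [RawExposureData(dataId=k, files=v)
                         for k, v in byExposure.items()]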
340 """
341 exposureDimensions = self.universe["exposure"].graph
342 byExposure = defaultdict(list)
343 for f in files:
344 # Assume that the first dataset is representative for the file
345 byExposure[f.datasets[0].dataId.subset(exposureDimensions)].append(f)
347 return [RawExposureData(dataId=dataId, files=exposureFiles)
348 for dataId, exposureFiles in byExposure.items()]

    def collectDimensionRecords(self, exposure: RawExposureData) -> RawExposureData:
        """Collect the `DimensionRecord` instances that must be inserted into
        the `~lsst.daf.butler.Registry` before an exposure's raw files may be
        ingested.

        Parameters
        ----------
        exposure : `RawExposureData`
            A structure containing information about the exposure to be
            ingested. Should be considered consumed upon return.

        Returns
        -------
        exposure : `RawExposureData`
            An updated version of the input structure, with
            `RawExposureData.records` populated.
        """
        firstFile = exposure.files[0]
        firstDataset = firstFile.datasets[0]
        VisitDetectorRegionRecordClass = self.universe["visit_detector_region"].RecordClass
        exposure.records = {
            "exposure": [makeExposureRecordFromObsInfo(firstDataset.obsInfo, self.universe)],
        }
        if firstDataset.obsInfo.visit_id is not None:
            exposure.records["visit_detector_region"] = []
            visitVertices = []
            for file in exposure.files:
                for dataset in file.datasets:
                    if dataset.obsInfo.visit_id != firstDataset.obsInfo.visit_id:
                        raise ValueError(f"Inconsistent visit/exposure relationship for "
                                         f"exposure {firstDataset.obsInfo.exposure_id} between "
                                         f"{file.filename} and {firstFile.filename}: "
                                         f"{dataset.obsInfo.visit_id} != "
                                         f"{firstDataset.obsInfo.visit_id}.")
                    if dataset.region is None:
                        self.log.warn("No region found for visit=%s, detector=%s.",
                                      dataset.obsInfo.visit_id, dataset.obsInfo.detector_num)
                        continue
                    visitVertices.extend(dataset.region.getVertices())
                    exposure.records["visit_detector_region"].append(
                        VisitDetectorRegionRecordClass.fromDict({
                            "instrument": dataset.obsInfo.instrument,
                            "visit": dataset.obsInfo.visit_id,
                            "detector": dataset.obsInfo.detector_num,
                            "region": dataset.region,
                        })
                    )
            if visitVertices:
                visitRegion = ConvexPolygon(visitVertices)
            else:
                self.log.warn("No region found for visit=%s.", firstDataset.obsInfo.visit_id)
                visitRegion = None
            exposure.records["visit"] = [
                makeVisitRecordFromObsInfo(firstDataset.obsInfo, self.universe, region=visitRegion)
            ]
        return exposure

    def expandDataIds(self, data: RawExposureData) -> RawExposureData:
        """Expand the data IDs associated with a raw exposure to include
        additional metadata records.

        Parameters
        ----------
        data : `RawExposureData`
            A structure containing information about the exposure to be
            ingested. Must have `RawExposureData.records` populated. Should
            be considered consumed upon return.

        Returns
        -------
        exposure : `RawExposureData`
            An updated version of the input structure, with
            `RawExposureData.dataId` and nested `RawFileDatasetInfo.dataId`
            attributes containing `~lsst.daf.butler.ExpandedDataCoordinate`
            instances.
        """
        hasVisit = "visit" in data.records
        # We start by expanding the exposure-level data ID; we won't use that
        # directly in file ingest, but this lets us do some database lookups
        # once per exposure instead of once per file later.
        data.dataId = self.butler.registry.expandDataId(
            data.dataId,
            # We pass in the records we'll be inserting shortly so they aren't
            # looked up from the database. We do expect instrument and filter
            # records to be retrieved from the database here (though the
            # Registry may cache them so there isn't a lookup every time).
            records={
                "exposure": data.records["exposure"][0],
                "visit": data.records["visit"][0] if hasVisit else None,
            }
        )
        # Now we expand the per-file (exposure+detector) data IDs. This time
        # we pass in the records we just retrieved from the exposure data ID
        # expansion as well as the visit_detector_region record, if there is
        # one.
        vdrRecords = data.records["visit_detector_region"] if hasVisit else itertools.repeat(None)
        for file, vdrRecord in zip(data.files, vdrRecords):
            for dataset in file.datasets:
                dataset.dataId = self.butler.registry.expandDataId(
                    dataset.dataId,
                    records=dict(data.dataId.records, visit_detector_region=vdrRecord)
                )
        return data

    def prep(self, files, pool: Optional[Pool] = None, processes: int = 1) -> Iterator[RawExposureData]:
        """Perform all ingest preprocessing steps that do not involve actually
        modifying the database.

        Parameters
        ----------
        files : iterable over `str` or path-like objects
            Paths to the files to be ingested. Will be made absolute
            if they are not already.
        pool : `multiprocessing.Pool`, optional
            If not `None`, a process pool with which to parallelize some
            operations.
        processes : `int`, optional
            The number of processes to use. Ignored if ``pool`` is not `None`.

        Yields
        ------
        exposure : `RawExposureData`
            Data structures containing dimension records, filenames, and data
            IDs to be ingested (one structure for each exposure).
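
        Notes
        -----
        In the serial case (``pool is None``), the lazy pipeline constructed
        below is equivalent to this sketch::

            exposures = self.groupByExposure(self.extractMetadata(f) for f in files)
            return (self.expandDataIds(self.collectDimensionRecords(e))
                    for e in exposures)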
471 """
472 if pool is None and processes > 1:
473 pool = Pool(processes)
474 mapFunc = map if pool is None else pool.imap_unordered
476 # Extract metadata and build per-detector regions.
477 fileData: Iterator[RawFileData] = mapFunc(self.extractMetadata, files)
479 # Use that metadata to group files (and extracted metadata) by
480 # exposure. Never parallelized because it's intrinsically a gather
481 # step.
482 exposureData: List[RawExposureData] = self.groupByExposure(fileData)
484 # The next few operations operate on RawExposureData instances (one at
485 # a time) in-place and then return the modified instance. We call them
486 # as pass-throughs instead of relying on the arguments we pass in to
487 # have been modified because in the parallel case those arguments are
488 # going to be pickled and unpickled, and I'm not certain
489 # multiprocessing is careful enough with that for output arguments to
490 # work. We use the same variable names to reflect the fact that we
491 # consider the arguments to have been consumed/invalidated.
493 # Extract DimensionRecords from the metadata that will need to be
494 # inserted into the Registry before the raw datasets themselves are
495 # ingested.
496 exposureData: Iterator[RawExposureData] = mapFunc(self.collectDimensionRecords, exposureData)
498 # Expand the data IDs to include all dimension metadata; we need this
499 # because we may need to generate path templates that rely on that
500 # metadata.
501 # This is the first step that involves actual database calls (but just
502 # SELECTs), so if there's going to be a problem with connections vs.
503 # multiple processes, or lock contention (in SQLite) slowing things
504 # down, it'll happen here.
505 return mapFunc(self.expandDataIds, exposureData)

    def insertDimensionData(self, records: Mapping[str, List[DimensionRecord]]):
        """Insert dimension records for one or more exposures.

        Parameters
        ----------
        records : `dict` mapping `str` to `list`
            Dimension records to be inserted, organized as a mapping from
            dimension name to a list of records for that dimension. This
            may be a single `RawExposureData.records` dict, or an aggregate
            for multiple exposures created by concatenating the value lists
            of those dictionaries.
        """
        # TODO: This currently assumes that either duplicate inserts of
        # visit records are ignored, or there is exactly one visit per
        # exposure. I expect us to switch up the visit-exposure
        # relationship and hence rewrite some of this code before that
        # becomes a practical problem.
        # Iterate over dimensions explicitly to order for foreign key
        # relationships.
        for dimension in ("visit", "exposure", "visit_detector_region"):
            recordsForDimension = records.get(dimension)
            if recordsForDimension:
                # TODO: once Registry has options to ignore or replace
                # existing dimension records with the same primary keys
                # instead of aborting on conflicts, add configuration
                # options and logic to use them.
                self.butler.registry.insertDimensionData(dimension, *recordsForDimension)

    def ingestExposureDatasets(self, exposure: RawExposureData, butler: Optional[Butler] = None
                               ) -> List[DatasetRef]:
        """Ingest all raw files in one exposure.

        Parameters
        ----------
        exposure : `RawExposureData`
            A structure containing information about the exposure to be
            ingested. Must have `RawExposureData.records` populated and all
            data ID attributes expanded.
        butler : `lsst.daf.butler.Butler`, optional
            Butler to use for ingest. If not provided, ``self.butler`` will
            be used.

        Returns
        -------
        refs : `list` of `lsst.daf.butler.DatasetRef`
            Dataset references for ingested raws.
        """
        if butler is None:
            butler = self.butler
        datasets = [FileDataset(path=os.path.abspath(file.filename),
                                refs=[DatasetRef(self.datasetType, d.dataId) for d in file.datasets],
                                formatter=file.FormatterClass)
                    for file in exposure.files]
        butler.ingest(*datasets, transfer=self.config.transfer)
        return [ref for dataset in datasets for ref in dataset.refs]

    def run(self, files, pool: Optional[Pool] = None, processes: int = 1):
        """Ingest files into a Butler data repository.

        This creates any new exposure or visit Dimension entries needed to
        identify the ingested files, creates new Dataset entries in the
        Registry and finally ingests the files themselves into the Datastore.
        Any needed instrument, detector, and physical_filter Dimension entries
        must exist in the Registry before `run` is called.

        Parameters
        ----------
        files : iterable over `str` or path-like objects
            Paths to the files to be ingested. Will be made absolute
            if they are not already.
        pool : `multiprocessing.Pool`, optional
            If not `None`, a process pool with which to parallelize some
            operations.
        processes : `int`, optional
            The number of processes to use. Ignored if ``pool`` is not `None`.

        Returns
        -------
        refs : `list` of `lsst.daf.butler.DatasetRef`
            Dataset references for ingested raws.

        Notes
        -----
        This method inserts all records (dimensions and datasets) for an
        exposure within a transaction, guaranteeing that partial exposures
        are never ingested.
        """
        exposureData = self.prep(files, pool=pool, processes=processes)
        # Up to this point, we haven't modified the data repository at all.
        # Now we finally do that, with one transaction per exposure. This is
        # not parallelized at present because the performance of this step is
        # limited by the database server. That may or may not change in the
        # future once we increase our usage of bulk inserts and reduce our
        # usage of savepoints; we've tried to get everything but the database
        # operations done in advance to reduce the time spent inside
        # transactions.
        self.butler.registry.registerDatasetType(self.datasetType)
        refs = []
        for exposure in exposureData:
            with self.butler.transaction():
                self.insertDimensionData(exposure.records)
                refs.extend(self.ingestExposureDatasets(exposure))
        return refs