Coverage for python/lsst/obs/base/ingest.py: 41%

# This file is part of obs_base.
#
# Developed for the LSST Data Management System.
# This product includes software developed by the LSST Project
# (https://www.lsst.org).
# See the COPYRIGHT file at the top-level directory of this distribution
# for details of code ownership.
#
# This program is free software: you can redistribute it and/or modify
# it under the terms of the GNU General Public License as published by
# the Free Software Foundation, either version 3 of the License, or
# (at your option) any later version.
#
# This program is distributed in the hope that it will be useful,
# but WITHOUT ANY WARRANTY; without even the implied warranty of
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
# GNU General Public License for more details.
#
# You should have received a copy of the GNU General Public License
# along with this program. If not, see <http://www.gnu.org/licenses/>.

__all__ = ("RawIngestTask", "RawIngestConfig", "makeTransferChoiceField")

import os.path
from dataclasses import dataclass, InitVar
from typing import List, Iterator, Iterable, Type, Optional, Any
from collections import defaultdict
from multiprocessing import Pool

from astro_metadata_translator import ObservationInfo, fix_header, merge_headers
from lsst.afw.fits import readMetadata
from lsst.daf.butler import (
    Butler,
    CollectionType,
    DataCoordinate,
    DatasetRef,
    DatasetType,
    DimensionRecord,
    DimensionUniverse,
    FileDataset,
)
from lsst.pex.config import Config, ChoiceField
from lsst.pipe.base import Task

from ._instrument import Instrument, makeExposureRecordFromObsInfo
from ._fitsRawFormatterBase import FitsRawFormatterBase


@dataclass
class RawFileDatasetInfo:
    """Structure that holds information about a single dataset within a
    raw file.
    """

    dataId: DataCoordinate
    """Data ID for this file (`lsst.daf.butler.DataCoordinate`).
    """

    obsInfo: ObservationInfo
    """Standardized observation metadata extracted directly from the file
    headers (`astro_metadata_translator.ObservationInfo`).
    """


@dataclass
class RawFileData:
    """Structure that holds information about a single raw file, used during
    ingest.
    """

    datasets: List[RawFileDatasetInfo]
    """The information describing each dataset within this raw file.
    (`list` of `RawFileDatasetInfo`)
    """

    filename: str
    """Name of the file this information was extracted from (`str`).

    This is the path prior to ingest, not the path after ingest.
    """

    FormatterClass: Type[FitsRawFormatterBase]
    """Formatter class that should be used to ingest this file (`type`; a
    subclass of `FitsRawFormatterBase`).
    """

    instrumentClass: Type[Instrument]
    """The `Instrument` class associated with this file."""


@dataclass
class RawExposureData:
    """Structure that holds information about a complete raw exposure, used
    during ingest.
    """

    dataId: DataCoordinate
    """Data ID for this exposure (`lsst.daf.butler.DataCoordinate`).
    """

    files: List[RawFileData]
    """List of structures containing file-level information.
    """

    universe: InitVar[DimensionUniverse]
    """Set of all known dimensions.
    """

    record: Optional[DimensionRecord] = None
    """The exposure `DimensionRecord` that must be inserted into the
    `~lsst.daf.butler.Registry` prior to file-level ingest (`DimensionRecord`).
    """

    def __post_init__(self, universe: DimensionUniverse):
        # We don't care which file or dataset we read metadata from, because
        # we're assuming they'll all be the same; just use the first ones.
        self.record = makeExposureRecordFromObsInfo(self.files[0].datasets[0].obsInfo, universe)


def makeTransferChoiceField(doc="How to transfer files (None for no transfer).", default="auto"):
    """Create a Config field with options for how to transfer files between
    data repositories.

    The allowed options for the field are exactly those supported by
    `lsst.daf.butler.Datastore.ingest`.

    Parameters
    ----------
    doc : `str`
        Documentation for the configuration field.
    default : `str`, optional
        Default transfer mode for the created field.

    Returns
    -------
    field : `lsst.pex.config.ChoiceField`
        Configuration field.
    """
    return ChoiceField(
        doc=doc,
        dtype=str,
        allowed={"move": "move",
                 "copy": "copy",
                 "auto": "choice will depend on datastore",
                 "link": "hard link falling back to symbolic link",
                 "hardlink": "hard link",
                 "symlink": "symbolic (soft) link",
                 "relsymlink": "relative symbolic link",
                 },
        optional=True,
        default=default
    )
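

# A minimal usage sketch (not part of this module): ``MyIngestConfig`` is a
# hypothetical config class shown only to illustrate that the field created
# by `makeTransferChoiceField` behaves like any other `lsst.pex.config` field::
#
#     class MyIngestConfig(Config):
#         transfer = makeTransferChoiceField(default="symlink")
#
#     config = MyIngestConfig()
#     config.transfer = "copy"   # must be one of the allowed modes above
#     config.transfer = None     # allowed because the field is optional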


class RawIngestConfig(Config):
    transfer = makeTransferChoiceField()


class RawIngestTask(Task):
    """Driver Task for ingesting raw data into Gen3 Butler repositories.

    Parameters
    ----------
    config : `RawIngestConfig`
        Configuration for the task.
    butler : `~lsst.daf.butler.Butler`
        Writeable butler instance, with ``butler.run`` set to the appropriate
        `~lsst.daf.butler.CollectionType.RUN` collection for these raw
        datasets.
    **kwargs
        Additional keyword arguments are forwarded to the `lsst.pipe.base.Task`
        constructor.

    Notes
    -----
    Each instance of `RawIngestTask` writes to the same Butler. Each
    invocation of `RawIngestTask.run` ingests a list of files.
    """

    ConfigClass = RawIngestConfig

    _DefaultName = "ingest"

    def getDatasetType(self):
        """Return the DatasetType of the datasets ingested by this Task.
        """
        return DatasetType("raw", ("instrument", "detector", "exposure"), "Exposure",
                           universe=self.butler.registry.dimensions)

    def __init__(self, config: Optional[RawIngestConfig] = None, *, butler: Butler, **kwargs: Any):
        config.validate()  # Not a CmdlineTask nor PipelineTask, so have to validate the config here.
        super().__init__(config, **kwargs)
        self.butler = butler
        self.universe = self.butler.registry.dimensions
        self.datasetType = self.getDatasetType()

        # Import all the instrument classes so that we ensure that we
        # have all the relevant metadata translators loaded.
        Instrument.importAll(self.butler.registry)

    def extractMetadata(self, filename: str) -> RawFileData:
        """Extract and process metadata from a single raw file.

        Parameters
        ----------
        filename : `str`
            Path to the file.

        Returns
        -------
        data : `RawFileData`
            A structure containing the metadata extracted from the file,
            as well as the original filename. All fields will be populated,
            but the nested `RawFileDatasetInfo.dataId` attributes will be
            minimal (unexpanded) `DataCoordinate` instances.

        Notes
        -----
        Assumes that there is a single dataset associated with the given
        file. Instruments using a single file to store multiple datasets
        must implement their own version of this method.
        """
        # Manually merge the primary and "first data" headers here because we
        # do not know in general if an input file has set INHERIT=T.
        phdu = readMetadata(filename, 0)
        header = merge_headers([phdu, readMetadata(filename)], mode="overwrite")
        fix_header(header)
        datasets = [self._calculate_dataset_info(header, filename)]

        # The data model currently assumes that whilst multiple datasets
        # can be associated with a single file, they must all share the
        # same formatter.
        instrument = Instrument.fromName(datasets[0].dataId["instrument"], self.butler.registry)
        FormatterClass = instrument.getRawFormatter(datasets[0].dataId)

        return RawFileData(datasets=datasets, filename=filename,
                           FormatterClass=FormatterClass,
                           instrumentClass=instrument)
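
    # A minimal sketch (an illustration, not part of obs_base) of how an
    # instrument whose raw files hold one dataset per extension HDU might
    # override `extractMetadata`; ``numDetectorsInFile`` is a hypothetical
    # helper that reports how many extensions to read::
    #
    #     class MultiExtensionRawIngestTask(RawIngestTask):
    #         def extractMetadata(self, filename: str) -> RawFileData:
    #             datasets = []
    #             for hdu in range(1, numDetectorsInFile(filename) + 1):
    #                 header = readMetadata(filename, hdu)
    #                 fix_header(header)
    #                 datasets.append(self._calculate_dataset_info(header, filename))
    #             instrument = Instrument.fromName(datasets[0].dataId["instrument"],
    #                                              self.butler.registry)
    #             FormatterClass = instrument.getRawFormatter(datasets[0].dataId)
    #             return RawFileData(datasets=datasets, filename=filename,
    #                                FormatterClass=FormatterClass,
    #                                instrumentClass=instrument)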

    def _calculate_dataset_info(self, header, filename):
        """Calculate a RawFileDatasetInfo from the supplied information.

        Parameters
        ----------
        header : `Mapping`
            Header from the dataset.
        filename : `str`
            Filename to use for error messages.

        Returns
        -------
        dataset : `RawFileDatasetInfo`
            The dataId and observation information associated with this
            dataset.
        """
        obsInfo = ObservationInfo(header)
        dataId = DataCoordinate.standardize(instrument=obsInfo.instrument,
                                            exposure=obsInfo.exposure_id,
                                            detector=obsInfo.detector_num,
                                            universe=self.universe)
        return RawFileDatasetInfo(obsInfo=obsInfo, dataId=dataId)

    def groupByExposure(self, files: Iterable[RawFileData]) -> List[RawExposureData]:
        """Group an iterable of `RawFileData` by exposure.

        Parameters
        ----------
        files : iterable of `RawFileData`
            File-level information to group.

        Returns
        -------
        exposures : `list` of `RawExposureData`
            A list of structures that group the file-level information by
            exposure. All fields will be populated. The
            `RawExposureData.dataId` attributes will be minimal (unexpanded)
            `DataCoordinate` instances.
        """
        exposureDimensions = self.universe["exposure"].graph
        byExposure = defaultdict(list)
        for f in files:
            # Assume that the first dataset is representative for the file
            byExposure[f.datasets[0].dataId.subset(exposureDimensions)].append(f)

        return [RawExposureData(dataId=dataId, files=exposureFiles, universe=self.universe)
                for dataId, exposureFiles in byExposure.items()]

    def expandDataIds(self, data: RawExposureData) -> RawExposureData:
        """Expand the data IDs associated with a raw exposure to include
        additional metadata records.

        Parameters
        ----------
        data : `RawExposureData`
            A structure containing information about the exposure to be
            ingested. Must have `RawExposureData.record` populated. Should
            be considered consumed upon return.

        Returns
        -------
        exposure : `RawExposureData`
            An updated version of the input structure, with
            `RawExposureData.dataId` and nested `RawFileDatasetInfo.dataId`
            attributes updated to data IDs for which
            `DataCoordinate.hasRecords` returns `True`.
        """
        # We start by expanding the exposure-level data ID; we won't use that
        # directly in file ingest, but this lets us do some database lookups
        # once per exposure instead of once per file later.
        data.dataId = self.butler.registry.expandDataId(
            data.dataId,
            # We pass in the records we'll be inserting shortly so they aren't
            # looked up from the database. We do expect instrument and filter
            # records to be retrieved from the database here (though the
            # Registry may cache them so there isn't a lookup every time).
            records={
                self.butler.registry.dimensions["exposure"]: data.record,
            }
        )
        # Now we expand the per-file (exposure+detector) data IDs. This time
        # we pass in the records we just retrieved from the exposure data ID
        # expansion.
        for file in data.files:
            for dataset in file.datasets:
                dataset.dataId = self.butler.registry.expandDataId(
                    dataset.dataId,
                    records=dict(data.dataId.records)
                )
        return data

    def prep(self, files, *, pool: Optional[Pool] = None, processes: int = 1) -> Iterator[RawExposureData]:
        """Perform all ingest preprocessing steps that do not involve actually
        modifying the database.

        Parameters
        ----------
        files : iterable over `str` or path-like objects
            Paths to the files to be ingested. Will be made absolute
            if they are not already.
        pool : `multiprocessing.Pool`, optional
            If not `None`, a process pool with which to parallelize some
            operations.
        processes : `int`, optional
            The number of processes to use. Ignored if ``pool`` is not `None`.

        Yields
        ------
        exposure : `RawExposureData`
            Data structures containing dimension records, filenames, and data
            IDs to be ingested (one structure for each exposure).
        """
        if pool is None and processes > 1:
            pool = Pool(processes)
        mapFunc = map if pool is None else pool.imap_unordered

        # Extract metadata and build per-detector regions.
        fileData: Iterator[RawFileData] = mapFunc(self.extractMetadata, files)

        # Use that metadata to group files (and extracted metadata) by
        # exposure. Never parallelized because it's intrinsically a gather
        # step.
        exposureData: List[RawExposureData] = self.groupByExposure(fileData)

        # The next operation operates on RawExposureData instances (one at
        # a time) in-place and then returns the modified instance. We call it
        # as a pass-through instead of relying on the arguments we pass in to
        # have been modified because in the parallel case those arguments are
        # going to be pickled and unpickled, and I'm not certain
        # multiprocessing is careful enough with that for output arguments to
        # work.

        # Expand the data IDs to include all dimension metadata; we need this
        # because we may need to generate path templates that rely on that
        # metadata.
        # This is the first step that involves actual database calls (but just
        # SELECTs), so if there's going to be a problem with connections vs.
        # multiple processes, or lock contention (in SQLite) slowing things
        # down, it'll happen here.
        return mapFunc(self.expandDataIds, exposureData)
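
    # A minimal usage sketch, assuming ``task`` is an already-constructed
    # `RawIngestTask` and ``files`` is a list of raw file paths; the pool
    # size of 4 is arbitrary::
    #
    #     from multiprocessing import Pool
    #
    #     with Pool(4) as pool:
    #         for exposure in task.prep(files, pool=pool):
    #             print(exposure.dataId, len(exposure.files))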

    def ingestExposureDatasets(self, exposure: RawExposureData, *, run: Optional[str] = None
                               ) -> List[DatasetRef]:
        """Ingest all raw files in one exposure.

        Parameters
        ----------
        exposure : `RawExposureData`
            A structure containing information about the exposure to be
            ingested. Must have `RawExposureData.record` populated and all
            data ID attributes expanded.
        run : `str`, optional
            Name of a RUN-type collection to write to, overriding
            ``self.butler.run``.

        Returns
        -------
        refs : `list` of `lsst.daf.butler.DatasetRef`
            Dataset references for ingested raws.
        """
        datasets = [FileDataset(path=os.path.abspath(file.filename),
                                refs=[DatasetRef(self.datasetType, d.dataId) for d in file.datasets],
                                formatter=file.FormatterClass)
                    for file in exposure.files]
        self.butler.ingest(*datasets, transfer=self.config.transfer, run=run)
        return [ref for dataset in datasets for ref in dataset.refs]

    def run(self, files, *, pool: Optional[Pool] = None, processes: int = 1, run: Optional[str] = None):
        """Ingest files into a Butler data repository.

        This creates any new exposure or visit Dimension entries needed to
        identify the ingested files, creates new Dataset entries in the
        Registry and finally ingests the files themselves into the Datastore.
        Any needed instrument, detector, and physical_filter Dimension entries
        must exist in the Registry before `run` is called.

        Parameters
        ----------
        files : iterable over `str` or path-like objects
            Paths to the files to be ingested. Will be made absolute
            if they are not already.
        pool : `multiprocessing.Pool`, optional
            If not `None`, a process pool with which to parallelize some
            operations.
        processes : `int`, optional
            The number of processes to use. Ignored if ``pool`` is not `None`.
        run : `str`, optional
            Name of a RUN-type collection to write to, overriding
            the default derived from the instrument name.

        Returns
        -------
        refs : `list` of `lsst.daf.butler.DatasetRef`
            Dataset references for ingested raws.

        Notes
        -----
        This method inserts all datasets for an exposure within a transaction,
        guaranteeing that partial exposures are never ingested. The exposure
        dimension record is inserted with `Registry.syncDimensionData` first
        (in its own transaction), which inserts only if a record with the same
        primary key does not already exist. This allows different files within
        the same exposure to be ingested in different runs.
        """
        exposureData = self.prep(files, pool=pool, processes=processes)
        # Up to this point, we haven't modified the data repository at all.
        # Now we finally do that, with one transaction per exposure. This is
        # not parallelized at present because the performance of this step is
        # limited by the database server. That may or may not change in the
        # future once we increase our usage of bulk inserts and reduce our
        # usage of savepoints; we've tried to get everything but the database
        # operations done in advance to reduce the time spent inside
        # transactions.
        self.butler.registry.registerDatasetType(self.datasetType)
        refs = []
        runs = set()
        for exposure in exposureData:
            self.butler.registry.syncDimensionData("exposure", exposure.record)
            # Use the instrument's default run unless one was specified
            # explicitly.
            if run is None:
                instrumentClass = exposure.files[0].instrumentClass
                this_run = instrumentClass.makeDefaultRawIngestRunName()
            else:
                this_run = run
            if this_run not in runs:
                self.butler.registry.registerCollection(this_run, type=CollectionType.RUN)
                runs.add(this_run)
            with self.butler.transaction():
                refs.extend(self.ingestExposureDatasets(exposure, run=this_run))
        return refs
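

# A minimal end-to-end usage sketch (an illustration, not part of this
# module), assuming an existing Gen3 repository at ``REPO`` whose registry
# already contains the needed instrument, detector, and physical_filter
# records; the repository path and file names are placeholders::
#
#     from lsst.daf.butler import Butler
#     from lsst.obs.base.ingest import RawIngestConfig, RawIngestTask
#
#     butler = Butler(REPO, writeable=True)
#     config = RawIngestConfig()
#     config.transfer = "symlink"
#     task = RawIngestTask(config=config, butler=butler)
#     refs = task.run(["raw_0001.fits", "raw_0002.fits"])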