__all__ = ("RawIngestTask", "RawIngestConfig")

import os
from abc import ABCMeta

from sqlalchemy.exc import IntegrityError
from astro_metadata_translator import ObservationInfo
from lsst.afw.image import readMetadata, bboxFromMetadata
from lsst.afw.geom import SkyWcs
from lsst.daf.butler import DatasetType, StorageClassFactory, Run, DataId, ConflictingDefinitionError
from lsst.daf.butler.instrument import (Instrument, updateExposureEntryFromObsInfo,
                                        updateVisitEntryFromObsInfo)
from lsst.geom import Box2D
from lsst.pex.config import Config, Field, ChoiceField
from lsst.pipe.base import Task
from lsst.sphgeom import ConvexPolygon
class IngestConflictError(ConflictingDefinitionError):
    pass


class RawIngestConfig(Config):
    transfer = ChoiceField(
        "How to transfer files (None for no transfer).",
        dtype=str,
        allowed={"move": "move",
                 "hardlink": "hard link",
                 "symlink": "symbolic (soft) link"},
        optional=True,
    )
    conflict = ChoiceField(
        ("What to do if a raw Dataset with the same data ID as an "
         "ingested file already exists in the Butler's Collection."),
        dtype=str,
        allowed={"ignore": ("Do not add the new file to the Collection.  If "
                            "'stash' is not None, the new file will be "
                            "ingested into the stash Collection instead."),
                 "fail": ("Raise RuntimeError if a conflict is encountered "
                          "(which may then be caught if onError == 'continue')."),
                 },
    )
    stash = Field(
        "Name of an alternate Collection to hold Datasets that lose conflicts.",
        dtype=str,
        default=None,
        optional=True,
    )
    onError = ChoiceField(
        "What to do if an error (including fatal conflicts) occurs.",
        dtype=str,
        allowed={"continue": "Warn and continue with the next file.",
                 "break": ("Stop processing immediately, but leave "
                           "already-ingested datasets in the repository."),
                 "rollback": ("Stop processing and attempt to remove already-"
                              "ingested datasets from the repository."),
                 },
    )
    doAddRegions = Field(
        dtype=bool,
        doc="Add regions to the Registry when ingesting raw files.",
    )
    padRegionAmount = Field(
        dtype=int,
        default=0,
        doc="Pad an image with specified number of pixels before calculating region",
    )


class RawIngestTask(Task, metaclass=ABCMeta):
    """Driver Task for ingesting raw data into Gen3 Butler repositories.

    This Task is intended to be runnable from the command-line, but it doesn't
    meet the other requirements of CmdLineTask or PipelineTask, and wouldn't
    gain much from being one.  It also wouldn't really be appropriate as a
    subtask of a CmdLineTask or PipelineTask; it's a Task essentially just to
    leverage the logging and configurability functionality that Task provides.

    Each instance of `RawIngestTask` writes to the same Butler and maintains a
    cache of Dimension entries that have already been added to or extracted
    from its Registry.  Each invocation of `RawIngestTask.run` ingests a list
    of files (possibly semi-atomically; see `RawIngestConfig.onError`).

    RawIngestTask should be subclassed to specialize ingest for the actual
    structure of raw data files produced by a particular instrument.
    Subclasses must either provide populated `MetadataReader` instances in the
    `dataIdReader`, `visitReader`, and `exposureReader` class attributes, or
    alternate implementations of the `extractDataId`, `extractVisit`, and
    `extractExposure` methods that do not use those attributes (each
    attribute-method pair may be handled differently).  Subclasses may also
    wish to override `getFormatter` and/or (rarely) `getDatasetType`.  We do
    not anticipate that overriding `run`, `ensureDimensions`, `ingestFile`, or
    `processFile` will ever be necessary.

    Parameters
    ----------
    config : `RawIngestConfig`
        Configuration for whether/how to transfer files and how to handle
        conflicts and errors.
    butler : `~lsst.daf.butler.Butler`
        Butler instance.  Ingested Datasets will be created as part of
        ``butler.run`` and associated with its Collection.
    kwds
        Other keyword arguments are forwarded to the Task base class
        constructor.
    """

    ConfigClass = RawIngestConfig
    _DefaultName = "ingest"

    def getDatasetType(self):
        """Return the DatasetType of the Datasets ingested by this Task.
        """
        return DatasetType("raw", ("instrument", "detector", "exposure"),
                           StorageClassFactory().getStorageClass("Exposure"))
    def __init__(self, config=None, *, butler, **kwds):
        super().__init__(config, **kwds)
        self.butler = butler
        self.datasetType = self.getDatasetType()
        self.dimensions = butler.registry.dimensions.extract(["instrument", "detector",
                                                              "physical_filter", "visit", "exposure"])
        # Cache of Instrument instances used by getFormatter, keyed by instrument name.
        self.instrumentCache = {}
        # Cache of per-visit region vertices, keyed by (instrument, visit);
        # consumed by _addVisitRegions.
        self.visitRegions = {}
        # Optional Run for datasets that lose conflicts (see RawIngestConfig.stash).
        self.stashRun = Run(self.config.stash) if self.config.stash is not None else None

    def _addVisitRegions(self):
167 """Adds a region associated with a Visit to registry. 169 Visits will be created using regions for individual ccds that are 170 defined in the visitRegions dict field on self, joined against an 171 existing region if one exists. The dict field is formatted using 172 instrument and visit as a tuple for a key, with values that are a 173 list of regions for detectors associated the region. 175 for (instrument, visit), vertices
in self.
visitRegions.items():
177 existingRegion = self.
butler.registry.expandDataId({
"instrument": instrument,
"visit": visit},
179 if existingRegion
is not None:
180 vertices = list(existingRegion.getVertices()) + vertices
181 region = ConvexPolygon(vertices)
182 self.
butler.registry.setDimensionRegion(instrument=instrument, visit=visit, region=region)
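    # The per-visit vertex cache consumed above has (roughly) this shape; the
    # instrument name and visit number are made-up values:
    #
    #     self.visitRegions == {
    #         ("MyCam", 12345): [UnitVector3d(...), UnitVector3d(...), ...],
    #     }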
185 """Ingest files into a Butler data repository. 187 This creates any new exposure or visit Dimension entries needed to 188 identify the ingested files, creates new Dataset entries in the 189 Registry and finally ingests the files themselves into the Datastore. 190 Any needed instrument, detector, and physical_filter Dimension entries 191 must exist in the Registry before `run` is called. 195 files : iterable over `str` or path-like objects 196 Paths to the files to be ingested. Will be made absolute 197 if they are not already. 200 if self.config.onError ==
"rollback":
201 with self.
butler.transaction():
204 if self.config.doAddRegions:
206 elif self.config.onError ==
"break":
209 if self.config.doAddRegions:
211 elif self.config.onError ==
"continue":
215 except Exception
as err:
216 self.log.warnf(
"Error processing '{}': {}", file, err)
217 if self.config.doAddRegions:
221 """Read and return any relevant headers from the given file. 223 The default implementation simply reads the header of the first 224 non-empty HDU, so it always returns a single-element list. 228 file : `str` or path-like object 229 Absolute path to the file to be ingested. 233 headers : `list` of `~lsst.daf.base.PropertyList` 234 Single-element list containing the header of the first 237 return [readMetadata(file)]
240 """Builds a region from information contained in a header 244 headers : `lsst.daf.base.PropertyList` 245 Property list containing the information from the header of 250 region : `lsst.sphgeom.ConvexPolygon` 255 If required header keys can not be found to construct region 260 bbox = Box2D(bboxFromMetadata(header))
261 if self.config.padRegionAmount > 0:
262 bbox.grow(self.config.padRegionAmount)
263 corners = bbox.getCorners()
264 sphCorners = [wcs.pixelToSky(point).getVector()
for point
in corners]
265 return ConvexPolygon(sphCorners)
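    # A small worked example of the padding above (values are hypothetical):
    #
    #     bbox = Box2D(Point2D(0, 0), Point2D(2048, 4096))
    #     bbox.grow(100)           # pads every edge by 100 pixels
    #     bbox.getDimensions()     # -> (2248, 4296)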
268 """Extract metadata from a raw file and add exposure and visit 271 Any needed instrument, detector, and physical_filter Dimension entries must 272 exist in the Registry before `run` is called. 276 file : `str` or path-like object 277 Absolute path to the file to be ingested. 281 headers : `list` of `~lsst.daf.base.PropertyList` 282 Result of calling `readHeaders`. 284 Data ID dictionary, as returned by `extractDataId`. 287 obsInfo = ObservationInfo(headers[0])
290 fullDataId = self.
extractDataId(file, headers, obsInfo=obsInfo)
293 dimensionDataId = DataId(fullDataId, dimension=dimension)
296 dimensionEntryDict = self.
butler.registry.findDimensionEntry(dimension, dimensionDataId)
297 if dimensionEntryDict
is None:
298 if dimension.name
in (
"visit",
"exposure"):
300 self.
butler.registry.addDimensionEntry(dimension, dimensionDataId)
303 f
"Entry for {dimension.name} with ID {dimensionDataId} not found; must be " 304 f
"present in Registry prior to ingest." 309 if self.config.doAddRegions:
312 self.
butler.registry.setDimensionRegion(DataId(fullDataId,
313 dimensions=[
'visit',
'detector',
'instrument'],
316 self.
visitRegions.setdefault((fullDataId[
'instrument'], fullDataId[
'visit']),
317 []).extend(region.getVertices())
318 except IntegrityError:
323 return headers, fullDataId
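    # The instrument, detector, and physical_filter entries checked above are
    # expected to be registered before ingest, typically by an Instrument
    # class.  A rough sketch with made-up values (the exact Registry calls
    # may differ):
    #
    #     butler.registry.addDimensionEntry("instrument", {"instrument": "MyCam"})
    #     butler.registry.addDimensionEntry("detector",
    #                                       {"instrument": "MyCam", "detector": 1})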
326 """Ingest a single raw file into the repository. 328 All necessary Dimension entres must already be present. 330 This method is not transactional; it must be wrapped in a 331 ``with self.butler.transaction` block to make per-file ingest 336 file : `str` or path-like object 337 Absolute path to the file to be ingested. 338 headers : `list` of `~lsst.daf.base.PropertyList` 339 Result of calling `readHeaders`. 341 Data ID dictionary, as returned by `extractDataId`. 342 run : `~lsst.daf.butler.Run`, optional 343 Run to add the Dataset to; defaults to ``self.butler.run``. 350 ref = self.
butler.registry.addDataset(self.
datasetType, dataId, run=run, recursive=
True)
351 except ConflictingDefinitionError
as err:
355 self.
butler.datastore.ingest(file, ref, formatter=self.
getFormatter(file, headers, dataId),
356 transfer=self.config.transfer)
360 """Ingest a single raw data file after extacting metadata. 362 This creates any new exposure or visit Dimension entries needed to 363 identify the ingest file, creates a new Dataset entry in the 364 Registry and finally ingests the file itself into the Datastore. 365 Any needed instrument, detector, and physical_filter Dimension entries must 366 exist in the Registry before `run` is called. 370 file : `str` or path-like object 371 Absolute path to the file to be ingested. 376 with self.
butler.transaction():
380 except IngestConflictError:
381 if self.config.conflict ==
"fail":
383 if self.config.conflict ==
"ignore":
387 self.log.infof(
"Conflict on {} ({}); ingesting to stash '{}' instead.",
388 dataId, file, self.config.stash)
389 with self.
butler.transaction():
392 self.log.infof(
"Conflict on {} ({}); ignoring.", dataId, file)
395 """Return the Data ID dictionary that should be used to label a file. 399 file : `str` or path-like object 400 Absolute path to the file being ingested (prior to any transfers). 401 headers : `list` of `~lsst.daf.base.PropertyList` 402 All headers returned by `readHeaders()`. 403 obsInfo : `astro_metadata_translator.ObservationInfo` 404 Observational metadata extracted from the headers. 409 A mapping whose key-value pairs uniquely identify raw datasets. 410 Must have ``dataId.dimensions() <= self.dimensions``, with at least 411 instrument, exposure, and detector present. 414 if obsInfo.visit_id
is None:
415 toRemove.add(
"visit")
416 if obsInfo.physical_filter
is None:
417 toRemove.add(
"physical_filter")
419 dimensions = self.
dimensions.difference(toRemove)
423 dimensions=dimensions,
424 instrument=obsInfo.instrument,
425 exposure=obsInfo.exposure_id,
426 visit=obsInfo.visit_id,
427 detector=obsInfo.detector_num,
428 physical_filter=obsInfo.physical_filter,
430 updateExposureEntryFromObsInfo(dataId, obsInfo)
431 if obsInfo.visit_id
is not None:
432 updateVisitEntryFromObsInfo(dataId, obsInfo)
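    # A data ID returned by extractDataId looks roughly like the following
    # (all values are made up):
    #
    #     {"instrument": "MyCam", "exposure": 12345, "visit": 12345,
    #      "detector": 42, "physical_filter": "r"}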
436 """Return the Formatter that should be used to read this file after 439 The default implementation obtains the formatter from the Instrument 440 class for the given data ID. 443 if instrument
is None:
444 instrument = Instrument.factories[dataId[
"instrument"]]()
446 return instrument.getRawFormatter(dataId)
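# A rough subclassing sketch; MyCamRawIngestTask and MyCamRawFormatter are
# hypothetical names, not part of this module:
#
#     class MyCamRawIngestTask(RawIngestTask):
#
#         def getFormatter(self, file, headers, dataId):
#             # Always use one formatter instead of asking the Instrument.
#             return MyCamRawFormatter()
#
#         def extractDataId(self, file, headers, obsInfo):
#             dataId = super().extractDataId(file, headers, obsInfo)
#             # Adjust the data ID here if the instrument's headers need it.
#             return dataId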