Coverage for python/lsst/obs/base/gen2to3/calibRepoConverter.py : 14%

Hot-keys on this page
r m x p toggle line displays
j k next/prev highlighted chunk
0 (zero) top of page
1 (one) first highlighted chunk
1# This file is part of obs_base.
2#
3# Developed for the LSST Data Management System.
4# This product includes software developed by the LSST Project
5# (https://www.lsst.org).
6# See the COPYRIGHT file at the top-level directory of this distribution
7# for details of code ownership.
8#
9# This program is free software: you can redistribute it and/or modify
10# it under the terms of the GNU General Public License as published by
11# the Free Software Foundation, either version 3 of the License, or
12# (at your option) any later version.
13#
14# This program is distributed in the hope that it will be useful,
15# but WITHOUT ANY WARRANTY; without even the implied warranty of
16# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
17# GNU General Public License for more details.
18#
19# You should have received a copy of the GNU General Public License
20# along with this program. If not, see <http://www.gnu.org/licenses/>.
21from __future__ import annotations
23__all__ = ["CalibRepoConverter"]
25from collections import defaultdict
26import os
27import sqlite3
28from typing import TYPE_CHECKING, Dict, Iterator, List, Mapping, Sequence, Tuple, Optional
30import astropy.time
31import astropy.units as u
33from lsst.daf.butler import DataCoordinate, FileDataset, Timespan
34from .repoConverter import RepoConverter
35from .repoWalker import RepoWalker
37if TYPE_CHECKING: 37 ↛ 38line 37 didn't jump to line 38, because the condition on line 37 was never true
38 from lsst.daf.butler import DatasetType, StorageClass, FormatterParameter
39 from .repoWalker.scanner import PathElementHandler
40 from ..cameraMapper import CameraMapper
41 from ..mapping import Mapping as CameraMapperMapping # disambiguate from collections.abc.Mapping
class CalibRepoConverter(RepoConverter):
    """A specialization of `RepoConverter` for calibration repositories.

    Parameters
    ----------
    mapper : `CameraMapper`
        Gen2 mapper for the data repository.  The root associated with the
        mapper is ignored and need not match the root of the repository.
    labels : `Sequence` [ `str` ]
        Strings injected into the names of the collections that calibration
        datasets are written and certified into (forwarded as the ``extra``
        argument to `Instrument` methods that generate collection names and
        write curated calibrations).
    **kwargs
        Additional keyword arguments are forwarded to (and required by)
        `RepoConverter`.
    """

    def __init__(self, *, mapper: CameraMapper, labels: Sequence[str] = (), **kwargs):
        # Calibration datasets get per-calibDate runs (see `getRun`), so no
        # single run is passed to the base class.
        super().__init__(run=None, **kwargs)
        self.mapper = mapper
        # Collection that validity ranges are certified into by `_finish`.
        self.collection = self.task.instrument.makeCalibrationCollectionName(*labels)
        self._labels = tuple(labels)
        # Dataset types of every walker target created so far.
        self._datasetTypes = set()

    def isDatasetTypeSpecial(self, datasetTypeName: str) -> bool:
        # Docstring inherited from RepoConverter.
        return datasetTypeName in self.instrument.getCuratedCalibrationNames()

    def iterMappings(self) -> Iterator[Tuple[str, CameraMapperMapping]]:
        # Docstring inherited from RepoConverter.
        yield from self.mapper.calibrations.items()

    def makeRepoWalkerTarget(self, datasetTypeName: str, template: str, keys: Dict[str, type],
                             storageClass: StorageClass, formatter: FormatterParameter = None,
                             targetHandler: Optional[PathElementHandler] = None,
                             ) -> RepoWalker.Target:
        # Docstring inherited from RepoConverter.
        target = RepoWalker.Target(
            datasetTypeName=datasetTypeName,
            storageClass=storageClass,
            template=template,
            keys=keys,
            instrument=self.task.instrument.getName(),
            universe=self.task.registry.dimensions,
            formatter=formatter,
            targetHandler=targetHandler,
            translatorFactory=self.task.translatorFactory,
        )
        # Remember the dataset type of every target handed out.
        self._datasetTypes.add(target.datasetType)
        return target

    def _queryGen2CalibRegistry(self, db: sqlite3.Connection, datasetType: DatasetType, calibDate: str
                                ) -> Iterator[sqlite3.Row]:
        """Query the Gen2 calibration registry for the validity ranges and
        optionally detectors and filters associated with the given dataset type
        and ``calibDate``.

        Parameters
        ----------
        db : `sqlite3.Connection`
            DBAPI connection to the Gen2 ``calibRegistry.sqlite3`` file.
        datasetType : `DatasetType`
            Gen3 dataset type being queried.
        calibDate : `str`
            String extracted from the ``calibDate`` template entry in Gen2
            filenames.

        Yields
        ------
        row : `sqlite3.Row`
            SQLite result object with ``validStart`` and ``validEnd``
            columns, a detector column (named ``self.task.config.ccdKey``)
            and a ``filter`` column.  The detector and filter columns are
            selected as ``NULL`` when ``datasetType.dimensions`` does not
            include ``detector`` and ``physical_filter``, respectively.

        Notes
        -----
        Nothing is yielded (and a warning is logged) when the Gen2 mapper
        lists no tables for the dataset type or when SQLite raises
        `sqlite3.OperationalError` while executing the query.
        """
        fields = ["validStart", "validEnd"]
        if "detector" in datasetType.dimensions.names:
            fields.append(self.task.config.ccdKey)
        else:
            fields.append(f"NULL AS {self.task.config.ccdKey}")
        if "physical_filter" in datasetType.dimensions.names:
            fields.append("filter")
        else:
            assert "band" not in datasetType.dimensions.names
            fields.append("NULL AS filter")
        tables = self.mapper.mappings[datasetType.name].tables
        if tables is None or len(tables) == 0:
            # There is no table name to report here; indexing ``tables``
            # would raise instead of emitting the warning.
            self.task.log.warn("Could not extract calibration ranges for %s in %s; "
                               "no tables in Gen2 mapper.",
                               datasetType.name, self.root)
            return
        # Only the first table is queried; calibDate is bound as a parameter.
        query = f"SELECT DISTINCT {', '.join(fields)} FROM {tables[0]} WHERE calibDate = ?;"
        try:
            results = db.execute(query, (calibDate,))
        except sqlite3.OperationalError as e:
            self.task.log.warn("Could not extract calibration ranges for %s in %s from table %s: %r",
                               datasetType.name, self.root, tables[0], e)
            return
        yield from results

    def _finish(self, datasets: Mapping[DatasetType, Mapping[Optional[str], List[FileDataset]]]):
        # Docstring inherited from RepoConverter.
        # Read Gen2 calibration repository and extract validity ranges for
        # all datasetType + calibDate combinations we ingested.
        calibFile = os.path.join(self.root, "calibRegistry.sqlite3")
        # If the registry file does not exist this indicates a problem.
        # We check explicitly because sqlite will try to create the
        # missing file if it can.
        if not os.path.exists(calibFile):
            raise RuntimeError("Attempting to convert calibrations but no registry database"
                               f" found in {self.root}")

        db = sqlite3.connect(calibFile)
        try:
            db.row_factory = sqlite3.Row
            timespansByDataId = self._collateTimespans(db, datasets)
        finally:
            # All values needed from the rows have been extracted, so the
            # connection can be released even if reading failed.
            db.close()

        refsByTimespan = self._resolveTimespans(timespansByDataId)

        # Done reading from Gen2, time to certify into Gen3.
        for timespan, refs in refsByTimespan.items():
            self.task.registry.certify(self.collection, refs, timespan)

    def _collateTimespans(self, db: sqlite3.Connection,
                          datasets: Mapping[DatasetType, Mapping[Optional[str], List[FileDataset]]],
                          ) -> Dict[tuple, List[tuple]]:
        """Collect the Gen2 validity ranges of all ingested calibrations.

        Parameters
        ----------
        db : `sqlite3.Connection`
            DBAPI connection to the Gen2 ``calibRegistry.sqlite3`` file.
        datasets : `Mapping`
            Ingested datasets, keyed by dataset type and then by
            ``calibDate``.  Dataset types for which ``isCalibration()`` is
            false are skipped.

        Returns
        -------
        timespansByDataId : `dict`
            Lists of ``(timespan, ref)`` tuples keyed by
            ``(ref.dataId, ref.datasetType.name)``.
        """
        # Initially we collate timespans for each dataId + dataset type
        # combination.  This allows us to check for small gaps or overlaps
        # inherent in the ambiguous usage of validity ranges in gen2
        timespansByDataId = defaultdict(list)

        for datasetType, datasetsByCalibDate in datasets.items():
            if not datasetType.isCalibration():
                continue
            gen2keys = {}
            if "detector" in datasetType.dimensions.names:
                gen2keys[self.task.config.ccdKey] = int
            if "physical_filter" in datasetType.dimensions.names:
                gen2keys["filter"] = str
            translator = self.instrument.makeDataIdTranslatorFactory().makeMatching(
                datasetType.name,
                gen2keys,
                instrument=self.instrument.getName()
            )
            for calibDate, datasetsForCalibDate in datasetsByCalibDate.items():
                assert calibDate is not None, ("datasetType.isCalibration() is set by "
                                               "the presence of calibDate in the Gen2 template")
                # Build a mapping that lets us find DatasetRefs by data ID,
                # for this DatasetType and calibDate.  We know there is only
                # one ref for each data ID (given DatasetType and calibDate as
                # well).
                refsByDataId = {}
                for dataset in datasetsForCalibDate:
                    refsByDataId.update((ref.dataId, ref) for ref in dataset.refs)
                # Query the Gen2 calibration repo for the validity ranges for
                # this DatasetType and calibDate, and look up the appropriate
                # refs by data ID.
                for row in self._queryGen2CalibRegistry(db, datasetType, calibDate):
                    # For validity times we use TAI as some gen2 repos have
                    # validity dates very far in the past or future.
                    timespan = Timespan(
                        astropy.time.Time(row["validStart"], format="iso", scale="tai"),
                        astropy.time.Time(row["validEnd"], format="iso", scale="tai"),
                    )
                    # Make a Gen2 data ID from query results; gen2keys holds
                    # exactly the detector/filter keys this dataset type uses.
                    gen2id = {key: row[key] for key in gen2keys}
                    # Translate that to Gen3.
                    gen3id, _ = translator(gen2id)
                    dataId = DataCoordinate.standardize(gen3id, graph=datasetType.dimensions)
                    ref = refsByDataId.get(dataId)
                    if ref is not None:
                        # Validity ranges must not overlap for the same dataID
                        # datasetType combination. Use that as a primary
                        # key and store the timespan and ref in a tuple
                        # as the value for later timespan validation.
                        timespansByDataId[(ref.dataId, ref.datasetType.name)].append((timespan, ref))
                    else:
                        # The Gen2 calib registry mentions this dataset, but it
                        # isn't included in what we've ingested.  This might
                        # sometimes be a problem, but it should usually
                        # represent someone just trying to convert a subset of
                        # the Gen2 repo, so I don't think it's appropriate to
                        # warn or even log at info, since in that case there
                        # may be a _lot_ of these messages.
                        self.task.log.debug(
                            "Gen2 calibration registry entry has no dataset: %s for calibDate=%s, %s.",
                            datasetType.name, calibDate, dataId
                        )
        return timespansByDataId

    def _resolveTimespans(self, timespansByDataId: Mapping[tuple, List[tuple]]) -> Dict[Timespan, list]:
        """Close small gaps and overlaps between consecutive validity ranges.

        For each key the ``(timespan, ref)`` tuples are sorted by timespan.
        Whenever the difference between one timespan's begin and the previous
        timespan's end is non-zero but smaller than about one day, the
        previous timespan's end is replaced by the next timespan's begin.
        Larger differences are left untouched.  Closed gaps are reported at
        info level and removed overlaps at warn level, each distinct message
        once.

        Parameters
        ----------
        timespansByDataId : `Mapping`
            Lists of ``(timespan, ref)`` tuples, as returned by
            `_collateTimespans`.

        Returns
        -------
        refsByTimespan : `dict`
            Lists of refs keyed by their (possibly adjusted) `Timespan`.
        """
        # Analyze the timespans to check for overlap problems
        # Gaps of a day should be closed since we assume differing
        # conventions in gen2 repos.

        # We need to correct any validity range issues and store the
        # results in a dict-of-lists keyed by Timespan, since
        # Registry.certify operates on one Timespan and multiple refs at a
        # time.
        refsByTimespan = defaultdict(list)

        # A day with a bit of fuzz to indicate the largest gap we will close
        max_gap = astropy.time.TimeDelta(1.001, format="jd", scale="tai")

        # Since in many cases the validity ranges are relevant for multiple
        # dataset types and dataIds we don't want to over-report and so
        # cache the messages for later.
        info_messages = set()
        warn_messages = set()
        for timespans in timespansByDataId.values():
            # Sort all the timespans and check overlaps
            sorted_timespans = sorted(timespans, key=lambda x: x[0])
            timespan_prev, ref_prev = sorted_timespans.pop(0)
            for timespan, ref in sorted_timespans:
                # See if we have a suspicious gap
                delta = timespan.begin - timespan_prev.end
                abs_delta = abs(delta)
                if abs_delta > 0 and abs_delta < max_gap:
                    if delta > 0:
                        # Gap between timespans
                        msg = f"Calibration validity gap closed from {timespan_prev.end} to {timespan.begin}"
                        info_messages.add(msg)
                    else:
                        # Overlap of timespans
                        msg = f"Calibration validity overlap of {abs_delta.to(u.s)} removed for period " \
                            f"{timespan.begin} to {timespan_prev.end}"
                        warn_messages.add(msg)

                    self.task.log.debug("Correcting validity range for %s with end %s",
                                        ref_prev, timespan_prev.end)

                    # Assume this gap is down to convention in gen2.
                    # We have to adjust the previous timespan to fit
                    # since we always trust validStart.
                    timespan_prev = Timespan(begin=timespan_prev.begin,
                                             end=timespan.begin)
                # Store the previous timespan and ref since it has now
                # been verified
                refsByTimespan[timespan_prev].append(ref_prev)

                # And update the previous values for the next iteration
                timespan_prev = timespan
                ref_prev = ref

            # Store the final timespan/ref pair
            refsByTimespan[timespan_prev].append(ref_prev)

        # Issue any pending log messages we have recorded
        for msg in sorted(info_messages):
            self.task.log.info(msg)
        for msg in sorted(warn_messages):
            self.task.log.warn(msg)

        return refsByTimespan

    def getRun(self, datasetTypeName: str, calibDate: Optional[str] = None) -> str:
        # Docstring inherited from RepoConverter.
        if calibDate is None:
            return super().getRun(datasetTypeName)
        else:
            # Datasets with a calibDate go into a run named from the labels
            # plus the formatted calibDate.
            return self.instrument.makeCalibrationCollectionName(
                *self._labels,
                self.instrument.formatCollectionTimestamp(calibDate),
            )

    # Class attributes that will be shadowed by public instance attributes;
    # defined here only for documentation purposes.

    mapper: CameraMapper
    """Gen2 mapper associated with this repository.
    """