Coverage for python/lsst/obs/base/gen2to3/calibRepoConverter.py : 13%

Hot-keys on this page
r m x p toggle line displays
j k next/prev highlighted chunk
0 (zero) top of page
1 (one) first highlighted chunk
1# This file is part of obs_base.
2#
3# Developed for the LSST Data Management System.
4# This product includes software developed by the LSST Project
5# (https://www.lsst.org).
6# See the COPYRIGHT file at the top-level directory of this distribution
7# for details of code ownership.
8#
9# This program is free software: you can redistribute it and/or modify
10# it under the terms of the GNU General Public License as published by
11# the Free Software Foundation, either version 3 of the License, or
12# (at your option) any later version.
13#
14# This program is distributed in the hope that it will be useful,
15# but WITHOUT ANY WARRANTY; without even the implied warranty of
16# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
17# GNU General Public License for more details.
18#
19# You should have received a copy of the GNU General Public License
20# along with this program. If not, see <http://www.gnu.org/licenses/>.
21from __future__ import annotations
23__all__ = ["CalibRepoConverter"]
25from collections import defaultdict
26import os
27import sqlite3
28from typing import TYPE_CHECKING, Dict, Iterator, List, Mapping, Sequence, Tuple, Optional
30import astropy.time
31import astropy.units as u
33from lsst.daf.butler import CollectionType, DataCoordinate, FileDataset, Timespan
34from .repoConverter import RepoConverter
35from .repoWalker import RepoWalker
37if TYPE_CHECKING: 37 ↛ 38line 37 didn't jump to line 38, because the condition on line 37 was never true
38 from lsst.daf.butler import DatasetType, StorageClass, FormatterParameter
39 from .repoWalker.scanner import PathElementHandler
40 from ..cameraMapper import CameraMapper
41 from ..mapping import Mapping as CameraMapperMapping # disambiguate from collections.abc.Mapping
class CalibRepoConverter(RepoConverter):
    """A specialization of `RepoConverter` for calibration repositories.

    Parameters
    ----------
    mapper : `CameraMapper`
        Gen2 mapper for the data repository.  The root associated with the
        mapper is ignored and need not match the root of the repository.
    labels : `Sequence` [ `str` ]
        Strings injected into the names of the collections that calibration
        datasets are written and certified into (forwarded as the ``extra``
        argument to `Instrument` methods that generate collection names and
        write curated calibrations).
    **kwargs
        Additional keyword arguments are forwarded to (and required by)
        `RepoConverter`.
    """

    def __init__(self, *, mapper: CameraMapper, labels: Sequence[str] = (), **kwargs):
        # Calibrations have no single output run (see `getRun`), so the base
        # class is always constructed with ``run=None``.
        super().__init__(run=None, **kwargs)
        self.mapper = mapper
        # Name of the collection that `_finish` registers as a CALIBRATION
        # collection and certifies datasets into.
        self.collection = self.task.instrument.makeCalibrationCollectionName(*labels)
        self._labels = tuple(labels)
        # Dataset types of every walker target created by
        # `makeRepoWalkerTarget`.
        self._datasetTypes = set()

    def isDatasetTypeSpecial(self, datasetTypeName: str) -> bool:
        # Docstring inherited from RepoConverter.
        return datasetTypeName in self.instrument.getCuratedCalibrationNames()

    def iterMappings(self) -> Iterator[Tuple[str, CameraMapperMapping]]:
        # Docstring inherited from RepoConverter.
        yield from self.mapper.calibrations.items()

    def makeRepoWalkerTarget(self, datasetTypeName: str, template: str, keys: Dict[str, type],
                             storageClass: StorageClass, formatter: FormatterParameter = None,
                             targetHandler: Optional[PathElementHandler] = None,
                             ) -> RepoWalker.Target:
        # Docstring inherited from RepoConverter.
        target = RepoWalker.Target(
            datasetTypeName=datasetTypeName,
            storageClass=storageClass,
            template=template,
            keys=keys,
            instrument=self.task.instrument.getName(),
            universe=self.task.registry.dimensions,
            formatter=formatter,
            targetHandler=targetHandler,
            translatorFactory=self.task.translatorFactory,
        )
        # Remember the dataset type of each target we hand out.
        self._datasetTypes.add(target.datasetType)
        return target

    def _queryGen2CalibRegistry(self, db: sqlite3.Connection, datasetType: DatasetType, calibDate: str
                                ) -> Iterator[sqlite3.Row]:
        """Query the Gen2 calibration registry for the validity ranges and
        optionally detectors and filters associated with the given dataset type
        and ``calibDate``.

        Parameters
        ----------
        db : `sqlite3.Connection`
            DBAPI connection to the Gen2 ``calibRegistry.sqlite3`` file.
        datasetType : `DatasetType`
            Gen3 dataset type being queried.
        calibDate : `str`
            String extracted from the ``calibDate`` template entry in Gen2
            filenames.

        Yields
        ------
        row : `sqlite3.Row`
            SQLite result object; will have ``validStart`` and ``validEnd``
            columns, may have a detector column (named
            ``self.task.config.ccdKey``) and/or a ``filter`` column, depending
            on whether ``datasetType.dimensions`` includes ``detector`` and
            ``physical_filter``, respectively.

        Notes
        -----
        If the Gen2 mapper lists no tables for the dataset type, or the query
        fails with `sqlite3.OperationalError`, a warning is logged and nothing
        is yielded.
        """
        dimensionNames = datasetType.dimensions.names
        ccdKey = self.task.config.ccdKey
        fields = ["validStart", "validEnd"]
        # Dimensions the dataset type lacks are selected as NULL placeholder
        # columns so that every row carries the same column names.
        if "detector" in dimensionNames:
            fields.append(ccdKey)
        else:
            fields.append(f"NULL AS {ccdKey}")
        if "physical_filter" in dimensionNames:
            fields.append("filter")
        else:
            assert "band" not in dimensionNames
            fields.append("NULL AS filter")
        tables = self.mapper.mappings[datasetType.name].tables
        if not tables:
            # ``tables`` is None or empty here, so it must not be indexed; the
            # message has exactly two placeholders.
            self.task.log.warn("Could not extract calibration ranges for %s in %s; "
                               "no tables in Gen2 mapper.",
                               datasetType.name, self.root)
            return
        # Column and table names cannot be bound as SQL parameters, so they
        # are interpolated; they come from the mapper and the task
        # configuration.  Only the calibDate value is bound.
        query = f"SELECT DISTINCT {', '.join(fields)} FROM {tables[0]} WHERE calibDate = ?;"
        try:
            results = db.execute(query, (calibDate,))
        except sqlite3.OperationalError as e:
            self.task.log.warn("Could not extract calibration ranges for %s in %s from table %s: %r",
                               datasetType.name, self.root, tables[0], e)
            return
        yield from results

    def _finish(self, datasets: Mapping[DatasetType, Mapping[Optional[str], List[FileDataset]]],
                count: int) -> None:
        # Docstring inherited from RepoConverter.
        # Read Gen2 calibration repository and extract validity ranges for
        # all datasetType + calibDate combinations we ingested.
        calibFile = os.path.join(self.root, "calibRegistry.sqlite3")
        # If the registry file does not exist this indicates a problem.
        # We check explicitly because sqlite will try to create the
        # missing file if it can.
        if not os.path.exists(calibFile):
            raise RuntimeError("Attempting to convert calibrations but no registry database"
                               f" found in {self.root}")

        # Initially we collate timespans for each dataId + dataset type
        # combination. This allows us to check for small gaps or overlaps
        # inherent in the ambiguous usage of validity ranges in gen2
        timespansByDataId = defaultdict(list)

        db = sqlite3.connect(calibFile)
        try:
            db.row_factory = sqlite3.Row

            with self.progress.bar(desc="Querying Gen2 calibRegistry", total=count) as progressBar:
                for datasetType, datasetsByCalibDate in datasets.items():
                    if not datasetType.isCalibration():
                        continue
                    hasDetector = "detector" in datasetType.dimensions.names
                    hasFilter = "physical_filter" in datasetType.dimensions.names
                    gen2keys = {}
                    if hasDetector:
                        gen2keys[self.task.config.ccdKey] = int
                    if hasFilter:
                        gen2keys["filter"] = str
                    translator = self.instrument.makeDataIdTranslatorFactory().makeMatching(
                        datasetType.name,
                        gen2keys,
                        instrument=self.instrument.getName()
                    )
                    for calibDate, datasetsForCalibDate in datasetsByCalibDate.items():
                        assert calibDate is not None, ("datasetType.isCalibration() is set by "
                                                       "the presence of calibDate in the Gen2 template")
                        # Build a mapping that lets us find DatasetRefs by
                        # data ID, for this DatasetType and calibDate.  We
                        # know there is only one ref for each data ID (given
                        # DatasetType and calibDate as well).
                        refsByDataId = {}
                        for dataset in datasetsForCalibDate:
                            refsByDataId.update((ref.dataId, ref) for ref in dataset.refs)
                        # Query the Gen2 calibration repo for the validity
                        # ranges for this DatasetType and calibDate, and look
                        # up the appropriate refs by data ID.
                        for row in self._queryGen2CalibRegistry(db, datasetType, calibDate):
                            # For validity times we use TAI as some gen2 repos
                            # have validity dates very far in the past or
                            # future.
                            timespan = Timespan(
                                astropy.time.Time(row["validStart"], format="iso", scale="tai"),
                                astropy.time.Time(row["validEnd"], format="iso", scale="tai"),
                            )
                            # Make a Gen2 data ID from query results.
                            gen2id = {}
                            if hasDetector:
                                gen2id[self.task.config.ccdKey] = row[self.task.config.ccdKey]
                            if hasFilter:
                                gen2id["filter"] = row["filter"]
                            # Translate that to Gen3.
                            gen3id, _ = translator(gen2id)
                            dataId = DataCoordinate.standardize(gen3id, graph=datasetType.dimensions)
                            ref = refsByDataId.get(dataId)
                            if ref is not None:
                                # Validity ranges must not overlap for the
                                # same dataID datasetType combination. Use
                                # that as a primary key and store the timespan
                                # and ref in a tuple as the value for later
                                # timespan validation.
                                timespansByDataId[(ref.dataId, ref.datasetType.name)].append((timespan, ref))
                            else:
                                # The Gen2 calib registry mentions this
                                # dataset, but it isn't included in what we've
                                # ingested.  This might sometimes be a
                                # problem, but it should usually represent
                                # someone just trying to convert a subset of
                                # the Gen2 repo, so I don't think it's
                                # appropriate to warn or even log at info,
                                # since in that case there may be a _lot_ of
                                # these messages.
                                self.task.log.debug(
                                    "Gen2 calibration registry entry has no dataset: %s for calibDate=%s, %s.",
                                    datasetType.name, calibDate, dataId
                                )
                        progressBar.update(len(datasetsForCalibDate))
        finally:
            # Every row has been consumed above, so the Gen2 registry
            # connection is released here whether or not the queries
            # succeeded.
            db.close()

        # Analyze the timespans to check for overlap problems
        # Gaps of a day should be closed since we assume differing
        # conventions in gen2 repos.

        # We need to correct any validity range issues and store the
        # results in a dict-of-lists keyed by Timespan, since
        # Registry.certify operates on one Timespan and multiple refs at a
        # time.
        refsByTimespan = defaultdict(list)

        # A day with a bit of fuzz to indicate the largest gap we will close
        max_gap = astropy.time.TimeDelta(1.001, format="jd", scale="tai")

        # Since in many cases the validity ranges are relevant for multiple
        # dataset types and dataIds we don't want to over-report and so
        # cache the messages for later.
        info_messages = set()
        warn_messages = set()
        for timespans in self.progress.wrap(timespansByDataId.values(), desc="Fixing validity ranges"):
            # Sort all the timespans and check overlaps
            sorted_timespans = sorted(timespans, key=lambda x: x[0])
            timespan_prev, ref_prev = sorted_timespans.pop(0)
            for timespan, ref in sorted_timespans:
                # See if we have a suspicious gap
                delta = timespan.begin - timespan_prev.end
                abs_delta = abs(delta)
                if abs_delta > 0 and abs_delta < max_gap:
                    if delta > 0:
                        # Gap between timespans
                        msg = f"Calibration validity gap closed from {timespan_prev.end} to {timespan.begin}"
                        info_messages.add(msg)
                    else:
                        # Overlap of timespans
                        msg = f"Calibration validity overlap of {abs(delta).to(u.s)} removed for period " \
                            f"{timespan.begin} to {timespan_prev.end}"
                        warn_messages.add(msg)

                    self.task.log.debug("Correcting validity range for %s with end %s",
                                        ref_prev, timespan_prev.end)

                    # Assume this gap is down to convention in gen2.
                    # We have to adjust the previous timespan to fit
                    # since we always trust validStart.
                    timespan_prev = Timespan(begin=timespan_prev.begin,
                                             end=timespan.begin)
                # Store the previous timespan and ref since it has now
                # been verified
                refsByTimespan[timespan_prev].append(ref_prev)

                # And update the previous values for the next iteration
                timespan_prev = timespan
                ref_prev = ref

            # Store the final timespan/ref pair
            refsByTimespan[timespan_prev].append(ref_prev)

        # Issue any pending log messages we have recorded
        for msg in sorted(info_messages):
            self.task.log.info(msg)
        for msg in sorted(warn_messages):
            self.task.log.warn(msg)

        # Done reading from Gen2, time to certify into Gen3.
        self.task.registry.registerCollection(self.collection, type=CollectionType.CALIBRATION)
        for timespan, refs in refsByTimespan.items():
            self.task.registry.certify(self.collection, refs, timespan)

    def getRun(self, datasetTypeName: str, calibDate: Optional[str] = None) -> str:
        # Docstring inherited from RepoConverter.
        if calibDate is None:
            return super().getRun(datasetTypeName)
        else:
            # Datasets with a calibDate get a collection name built from the
            # labels plus the formatted calibDate.
            return self.instrument.makeCalibrationCollectionName(
                *self._labels,
                self.instrument.formatCollectionTimestamp(calibDate),
            )

    # Class attributes that will be shadowed by public instance attributes;
    # defined here only for documentation purposes.

    mapper: CameraMapper
    """Gen2 mapper associated with this repository.
    """