Coverage for python/lsst/obs/base/gen2to3/calibRepoConverter.py : 14%

Hot-keys on this page
r m x p toggle line displays
j k next/prev highlighted chunk
0 (zero) top of page
1 (one) first highlighted chunk
1# This file is part of obs_base.
2#
3# Developed for the LSST Data Management System.
4# This product includes software developed by the LSST Project
5# (https://www.lsst.org).
6# See the COPYRIGHT file at the top-level directory of this distribution
7# for details of code ownership.
8#
9# This program is free software: you can redistribute it and/or modify
10# it under the terms of the GNU General Public License as published by
11# the Free Software Foundation, either version 3 of the License, or
12# (at your option) any later version.
13#
14# This program is distributed in the hope that it will be useful,
15# but WITHOUT ANY WARRANTY; without even the implied warranty of
16# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
17# GNU General Public License for more details.
18#
19# You should have received a copy of the GNU General Public License
20# along with this program. If not, see <http://www.gnu.org/licenses/>.
21from __future__ import annotations
23__all__ = ["CalibRepoConverter"]
25from collections import defaultdict
26import os
27import sqlite3
28from typing import TYPE_CHECKING, Dict, Iterator, List, Mapping, Tuple, Optional
30import astropy.time
31import astropy.units as u
33from lsst.daf.butler import DataCoordinate, FileDataset, Timespan
34from .repoConverter import RepoConverter
35from .repoWalker import RepoWalker
37if TYPE_CHECKING: 37 ↛ 38line 37 didn't jump to line 38, because the condition on line 37 was never true
38 from lsst.daf.butler import DatasetType, StorageClass, FormatterParameter
39 from .repoWalker.scanner import PathElementHandler
40 from ..cameraMapper import CameraMapper
41 from ..mapping import Mapping as CameraMapperMapping # disambiguate from collections.abc.Mapping
class CalibRepoConverter(RepoConverter):
    """A specialization of `RepoConverter` for calibration repositories.

    Parameters
    ----------
    mapper : `CameraMapper`
        Gen2 mapper for the data repository.  The root associated with the
        mapper is ignored and need not match the root of the repository.
    collection : `str`
        Name of the collection that converted datasets are certified into
        (it is passed to ``registry.certify`` together with the validity
        ranges read from the Gen2 calibration registry).
    kwds
        Additional keyword arguments are forwarded to (and required by)
        `RepoConverter`.
    """

    def __init__(self, *, mapper: CameraMapper, collection: str, **kwds):
        super().__init__(run=None, **kwds)
        self.mapper = mapper
        self.collection = collection
        # Dataset types for which a RepoWalker target has been created.
        self._datasetTypes = set()

    def isDatasetTypeSpecial(self, datasetTypeName: str) -> bool:
        # Docstring inherited from RepoConverter.
        return datasetTypeName in self.instrument.getCuratedCalibrationNames()

    def iterMappings(self) -> Iterator[Tuple[str, CameraMapperMapping]]:
        # Docstring inherited from RepoConverter.
        yield from self.mapper.calibrations.items()

    def makeRepoWalkerTarget(self, datasetTypeName: str, template: str, keys: Dict[str, type],
                             storageClass: StorageClass, formatter: FormatterParameter = None,
                             targetHandler: Optional[PathElementHandler] = None,
                             ) -> RepoWalker.Target:
        # Docstring inherited from RepoConverter.
        target = RepoWalker.Target(
            datasetTypeName=datasetTypeName,
            storageClass=storageClass,
            template=template,
            keys=keys,
            instrument=self.task.instrument.getName(),
            universe=self.task.registry.dimensions,
            formatter=formatter,
            targetHandler=targetHandler,
            translatorFactory=self.task.translatorFactory,
        )
        self._datasetTypes.add(target.datasetType)
        return target

    def _queryGen2CalibRegistry(self, db: sqlite3.Connection, datasetType: DatasetType, calibDate: str
                                ) -> Iterator[sqlite3.Row]:
        """Query the Gen2 calibration registry for validity ranges.

        Parameters
        ----------
        db : `sqlite3.Connection`
            Open connection to the Gen2 calibration registry database.
        datasetType : `DatasetType`
            Dataset type whose validity ranges should be looked up.  Its
            dimensions decide whether the detector and filter columns are
            selected or replaced by ``NULL``.
        calibDate : `str`
            Value matched against the ``calibDate`` column.

        Yields
        ------
        row
            One result row per distinct combination of ``validStart``,
            ``validEnd``, the configured detector key and ``filter``.
            Nothing is yielded (and a warning is logged) if the Gen2 mapper
            declares no tables for this dataset type or the query fails
            with `sqlite3.OperationalError`.
        """
        fields = ["validStart", "validEnd"]
        if "detector" in datasetType.dimensions.names:
            fields.append(self.task.config.ccdKey)
        else:
            fields.append(f"NULL AS {self.task.config.ccdKey}")
        if "physical_filter" in datasetType.dimensions.names:
            fields.append("filter")
        else:
            assert "band" not in datasetType.dimensions.names
            fields.append("NULL AS filter")
        tables = self.mapper.mappings[datasetType.name].tables
        if tables is None or len(tables) == 0:
            # ``tables`` has no first element here, so it must not be
            # indexed; the message only has two placeholders anyway.
            self.task.log.warn("Could not extract calibration ranges for %s in %s; "
                               "no tables in Gen2 mapper.",
                               datasetType.name, self.root)
            return
        # Only the first table is consulted.  Column and table names come
        # from the mapper/config; the calibDate value is bound as a parameter.
        query = f"SELECT DISTINCT {', '.join(fields)} FROM {tables[0]} WHERE calibDate = ?;"
        try:
            results = db.execute(query, (calibDate,))
        except sqlite3.OperationalError as e:
            self.task.log.warn("Could not extract calibration ranges for %s in %s from table %s: %r",
                               datasetType.name, self.root, tables[0], e)
            return
        yield from results

    def _finish(self, datasets: Mapping[DatasetType, Mapping[Optional[str], List[FileDataset]]]):
        """Certify ingested calibration datasets with Gen2 validity ranges.

        Parameters
        ----------
        datasets : `Mapping`
            Nested mapping from dataset type, then ``calibDate``, to the
            list of `FileDataset` objects that were ingested.  Dataset types
            for which ``isCalibration()`` is false are skipped.

        Raises
        ------
        RuntimeError
            Raised if ``calibRegistry.sqlite3`` does not exist in the
            repository root.
        """
        # Read Gen2 calibration repository and extract validity ranges for
        # all datasetType + calibDate combinations we ingested.
        calibFile = os.path.join(self.root, "calibRegistry.sqlite3")
        # If the registry file does not exist this indicates a problem.
        # We check explicitly because sqlite will try to create the
        # missing file if it can.
        if not os.path.exists(calibFile):
            raise RuntimeError("Attempting to convert calibrations but no registry database"
                               f" found in {self.root}")

        # Initially we collate timespans for each dataId + dataset type
        # combination. This allows us to check for small gaps or overlaps
        # inherent in the ambiguous usage of validity ranges in gen2
        timespansByDataId = defaultdict(list)

        db = sqlite3.connect(calibFile)
        try:
            db.row_factory = sqlite3.Row

            for datasetType, datasetsByCalibDate in datasets.items():
                if not datasetType.isCalibration():
                    continue
                gen2keys = {}
                if "detector" in datasetType.dimensions.names:
                    gen2keys[self.task.config.ccdKey] = int
                if "physical_filter" in datasetType.dimensions.names:
                    gen2keys["filter"] = str
                translator = self.instrument.makeDataIdTranslatorFactory().makeMatching(
                    datasetType.name,
                    gen2keys,
                    instrument=self.instrument.getName()
                )
                for calibDate, datasetsForCalibDate in datasetsByCalibDate.items():
                    assert calibDate is not None, ("datasetType.isCalibration() is set by "
                                                   "the presence of calibDate in the Gen2 template")
                    # Build a mapping that lets us find DatasetRefs by data ID,
                    # for this DatasetType and calibDate.  We know there is
                    # only one ref for each data ID (given DatasetType and
                    # calibDate as well).
                    refsByDataId = {}
                    for dataset in datasetsForCalibDate:
                        refsByDataId.update((ref.dataId, ref) for ref in dataset.refs)
                    # Query the Gen2 calibration repo for the validity ranges
                    # for this DatasetType and calibDate, and look up the
                    # appropriate refs by data ID.
                    for row in self._queryGen2CalibRegistry(db, datasetType, calibDate):
                        # For validity times we use TAI as some gen2 repos
                        # have validity dates very far in the past or future.
                        timespan = Timespan(
                            astropy.time.Time(row["validStart"], format="iso", scale="tai"),
                            astropy.time.Time(row["validEnd"], format="iso", scale="tai"),
                        )
                        # Make a Gen2 data ID from query results.
                        gen2id = {}
                        if "detector" in datasetType.dimensions.names:
                            gen2id[self.task.config.ccdKey] = row[self.task.config.ccdKey]
                        if "physical_filter" in datasetType.dimensions.names:
                            gen2id["filter"] = row["filter"]
                        # Translate that to Gen3.
                        gen3id, _ = translator(gen2id)
                        dataId = DataCoordinate.standardize(gen3id, graph=datasetType.dimensions)
                        ref = refsByDataId.get(dataId)
                        if ref is not None:
                            # Validity ranges must not overlap for the same
                            # dataID datasetType combination. Use that as a
                            # primary key and store the timespan and ref in a
                            # tuple as the value for later timespan
                            # validation.
                            timespansByDataId[(ref.dataId, ref.datasetType.name)].append((timespan, ref))
                        else:
                            # The Gen2 calib registry mentions this dataset,
                            # but it isn't included in what we've ingested.
                            # This might sometimes be a problem, but it should
                            # usually represent someone just trying to convert
                            # a subset of the Gen2 repo, so I don't think it's
                            # appropriate to warn or even log at info, since
                            # in that case there may be a _lot_ of these
                            # messages.
                            self.task.log.debug(
                                "Gen2 calibration registry entry has no dataset: %s for calibDate=%s, %s.",
                                datasetType.name, calibDate, dataId
                            )
        finally:
            # All reads from the Gen2 registry are complete (or have failed);
            # release the connection either way.
            db.close()

        # Analyze the timespans to check for overlap problems
        # Gaps of a day should be closed since we assume differing
        # conventions in gen2 repos.

        # We need to correct any validity range issues and store the
        # results in a dict-of-lists keyed by Timespan, since
        # Registry.certify operates on one Timespan and multiple refs at a
        # time.
        refsByTimespan = defaultdict(list)

        # A day with a bit of fuzz to indicate the largest gap we will close
        max_gap = astropy.time.TimeDelta(1.001, format="jd", scale="tai")

        # Since in many cases the validity ranges are relevant for multiple
        # dataset types and dataIds we don't want to over-report and so
        # cache the messages for later.
        info_messages = set()
        warn_messages = set()
        for timespans in timespansByDataId.values():
            # Sort all the timespans and check overlaps.  Every list in
            # timespansByDataId has at least one entry, so pop(0) is safe.
            sorted_timespans = sorted(timespans, key=lambda x: x[0])
            timespan_prev, ref_prev = sorted_timespans.pop(0)
            for timespan, ref in sorted_timespans:
                # See if we have a suspicious gap
                delta = timespan.begin - timespan_prev.end
                abs_delta = abs(delta)
                if abs_delta > 0 and abs_delta < max_gap:
                    if delta > 0:
                        # Gap between timespans
                        msg = f"Calibration validity gap closed from {timespan_prev.end} to {timespan.begin}"
                        info_messages.add(msg)
                    else:
                        # Overlap of timespans
                        msg = (f"Calibration validity overlap of {abs(delta).to(u.s)} removed for period "
                               f"{timespan.begin} to {timespan_prev.end}")
                        warn_messages.add(msg)

                    self.task.log.debug("Correcting validity range for %s with end %s",
                                        ref_prev, timespan_prev.end)

                    # Assume this gap is down to convention in gen2.
                    # We have to adjust the previous timespan to fit
                    # since we always trust validStart.
                    timespan_prev = Timespan(begin=timespan_prev.begin,
                                             end=timespan.begin)
                # Store the previous timespan and ref since it has now
                # been verified
                refsByTimespan[timespan_prev].append(ref_prev)

                # And update the previous values for the next iteration
                timespan_prev = timespan
                ref_prev = ref

            # Store the final timespan/ref pair
            refsByTimespan[timespan_prev].append(ref_prev)

        # Issue any pending log messages we have recorded
        for msg in sorted(info_messages):
            self.task.log.info(msg)
        for msg in sorted(warn_messages):
            self.task.log.warn(msg)

        # Done reading from Gen2, time to certify into Gen3.
        for timespan, refs in refsByTimespan.items():
            self.task.registry.certify(self.collection, refs, timespan)

    def getRun(self, datasetTypeName: str, calibDate: Optional[str] = None) -> str:
        """Return the name of the run that datasets should be ingested into.

        Parameters
        ----------
        datasetTypeName : `str`
            Name of the dataset type.
        calibDate : `str`, optional
            Gen2 ``calibDate`` of the dataset.  If `None`, the lookup is
            delegated to the base class.

        Returns
        -------
        run : `str`
            The base-class result when ``calibDate`` is `None`; otherwise
            the collection name the instrument builds from ``"calib"``,
            ``"gen2"`` and ``calibDate``.
        """
        if calibDate is None:
            return super().getRun(datasetTypeName)
        else:
            return self.instrument.makeCollectionName("calib", "gen2", calibDate)

    # Class attributes that will be shadowed by public instance attributes;
    # defined here only for documentation purposes.

    mapper: CameraMapper
    """Gen2 mapper associated with this repository.
    """