Coverage for python/lsst/obs/base/gen2to3/calibRepoConverter.py: 14%

Shortcuts on this page

r m x p   toggle line displays

j k   next/prev highlighted chunk

0   (zero) top of page

1   (one) first highlighted chunk

122 statements  

# This file is part of obs_base.
#
# Developed for the LSST Data Management System.
# This product includes software developed by the LSST Project
# (https://www.lsst.org).
# See the COPYRIGHT file at the top-level directory of this distribution
# for details of code ownership.
#
# This program is free software: you can redistribute it and/or modify
# it under the terms of the GNU General Public License as published by
# the Free Software Foundation, either version 3 of the License, or
# (at your option) any later version.
#
# This program is distributed in the hope that it will be useful,
# but WITHOUT ANY WARRANTY; without even the implied warranty of
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
# GNU General Public License for more details.
#
# You should have received a copy of the GNU General Public License
# along with this program.  If not, see <http://www.gnu.org/licenses/>.

21from __future__ import annotations 

22 

23__all__ = ["CalibRepoConverter"] 

24 

25from collections import defaultdict 

26import os 

27import sqlite3 

28from typing import TYPE_CHECKING, Dict, Iterator, List, Mapping, Sequence, Tuple, Optional 

29 

30import astropy.time 

31import astropy.units as u 

32 

33from lsst.daf.butler import CollectionType, DataCoordinate, FileDataset, Timespan 

34from .repoConverter import RepoConverter 

35from .repoWalker import RepoWalker 

36 

37if TYPE_CHECKING: 37 ↛ 38line 37 didn't jump to line 38, because the condition on line 37 was never true

38 from lsst.daf.butler import DatasetType, StorageClass, FormatterParameter 

39 from .repoWalker.scanner import PathElementHandler 

40 from ..cameraMapper import CameraMapper 

41 from ..mapping import Mapping as CameraMapperMapping # disambiguate from collections.abc.Mapping 

42 

43 

class CalibRepoConverter(RepoConverter):
    """A specialization of `RepoConverter` for calibration repositories.

    Parameters
    ----------
    mapper : `CameraMapper`
        Gen2 mapper for the data repository.  The root associated with the
        mapper is ignored and need not match the root of the repository.
    labels : `Sequence` [ `str` ]
        Strings injected into the names of the collections that calibration
        datasets are written and certified into (forwarded as the ``extra``
        argument to `Instrument` methods that generate collection names and
        write curated calibrations).
    **kwargs
        Additional keyword arguments are forwarded to (and required by)
        `RepoConverter`.
    """

    def __init__(self, *, mapper: CameraMapper, labels: Sequence[str] = (), **kwargs):
        super().__init__(run=None, **kwargs)
        self.mapper = mapper
        # Materialize the labels once, before first use, so that a one-shot
        # iterable is not exhausted by the unpacking below.
        self._labels = tuple(labels)
        self.collection = self.task.instrument.makeCalibrationCollectionName(*self._labels)
        # Dataset types for which a RepoWalker target has been created.
        self._datasetTypes = set()

    def isDatasetTypeSpecial(self, datasetTypeName: str) -> bool:
        # Docstring inherited from RepoConverter.
        return datasetTypeName in self.instrument.getCuratedCalibrationNames()

    def iterMappings(self) -> Iterator[Tuple[str, CameraMapperMapping]]:
        # Docstring inherited from RepoConverter.
        yield from self.mapper.calibrations.items()

    def makeRepoWalkerTarget(self, datasetTypeName: str, template: str, keys: Dict[str, type],
                             storageClass: StorageClass, formatter: FormatterParameter = None,
                             targetHandler: Optional[PathElementHandler] = None,
                             ) -> RepoWalker.Target:
        # Docstring inherited from RepoConverter.
        target = RepoWalker.Target(
            datasetTypeName=datasetTypeName,
            storageClass=storageClass,
            template=template,
            keys=keys,
            instrument=self.task.instrument.getName(),
            universe=self.task.registry.dimensions,
            formatter=formatter,
            targetHandler=targetHandler,
            translatorFactory=self.task.translatorFactory,
        )
        self._datasetTypes.add(target.datasetType)
        return target

    def _queryGen2CalibRegistry(self, db: sqlite3.Connection, datasetType: DatasetType, calibDate: str
                                ) -> Iterator[sqlite3.Row]:
        """Query the Gen2 calibration registry for the validity ranges and
        optionally detectors and filters associated with the given dataset type
        and ``calibDate``.

        Parameters
        ----------
        db : `sqlite3.Connection`
            DBAPI connection to the Gen2 ``calibRegistry.sqlite3`` file.
        datasetType : `DatasetType`
            Gen3 dataset type being queried.
        calibDate : `str`
            String extracted from the ``calibDate`` template entry in Gen2
            filenames.

        Yields
        ------
        row : `sqlite3.Row`
            SQLite result object; will have ``validStart`` and ``validEnd``
            columns, may have a detector column (named
            ``self.task.config.ccdKey``) and/or a ``filter`` column, depending
            on whether ``datasetType.dimensions`` includes ``detector`` and
            ``physical_filter``, respectively.

        Notes
        -----
        Nothing is yielded (and a warning is logged) when the Gen2 mapper
        lists no tables for the dataset type or when the query fails with
        `sqlite3.OperationalError`.
        """
        fields = ["validStart", "validEnd"]
        if "detector" in datasetType.dimensions.names:
            fields.append(self.task.config.ccdKey)
        else:
            fields.append(f"NULL AS {self.task.config.ccdKey}")
        if "physical_filter" in datasetType.dimensions.names:
            fields.append("filter")
        else:
            assert "band" not in datasetType.dimensions.names
            fields.append("NULL AS filter")
        tables = self.mapper.mappings[datasetType.name].tables
        if not tables:
            # There is no table to name here: ``tables`` is None or empty, so
            # it must not be indexed, and the message has two placeholders.
            self.task.log.warning("Could not extract calibration ranges for %s in %s; "
                                  "no tables in Gen2 mapper.",
                                  datasetType.name, self.root)
            return
        # Only the first table is consulted; calibDate is bound as a
        # parameter rather than interpolated into the SQL.
        query = f"SELECT DISTINCT {', '.join(fields)} FROM {tables[0]} WHERE calibDate = ?;"
        try:
            results = db.execute(query, (calibDate,))
        except sqlite3.OperationalError as e:
            self.task.log.warning("Could not extract calibration ranges for %s in %s from table %s: %r",
                                  datasetType.name, self.root, tables[0], e)
            return
        yield from results

    def _finish(self, datasets: Mapping[DatasetType, Mapping[Optional[str], List[FileDataset]]],
                count: int) -> None:
        # Docstring inherited from RepoConverter.
        # Read Gen2 calibration repository and extract validity ranges for
        # all datasetType + calibDate combinations we ingested.
        calibFile = os.path.join(self.root, "calibRegistry.sqlite3")
        # If the registry file does not exist this indicates a problem.
        # We check explicitly because sqlite will try to create the
        # missing file if it can.
        if not os.path.exists(calibFile):
            raise RuntimeError("Attempting to convert calibrations but no registry database"
                               f" found in {self.root}")

        # Initially we collate timespans for each dataId + dataset type
        # combination. This allows us to check for small gaps or overlaps
        # inherent in the ambiguous usage of validity ranges in gen2
        timespansByDataId = defaultdict(list)

        db = sqlite3.connect(calibFile)
        # Every row is consumed inside this block, so the connection can be
        # released as soon as the queries are done, even if one of them raises.
        try:
            db.row_factory = sqlite3.Row

            with self.progress.bar(desc="Querying Gen2 calibRegistry", total=count) as progressBar:
                for datasetType, datasetsByCalibDate in datasets.items():
                    if not datasetType.isCalibration():
                        continue
                    gen2keys = {}
                    if "detector" in datasetType.dimensions.names:
                        gen2keys[self.task.config.ccdKey] = int
                    if "physical_filter" in datasetType.dimensions.names:
                        gen2keys["filter"] = str
                    translator = self.instrument.makeDataIdTranslatorFactory().makeMatching(
                        datasetType.name,
                        gen2keys,
                        instrument=self.instrument.getName()
                    )
                    for calibDate, datasetsForCalibDate in datasetsByCalibDate.items():
                        assert calibDate is not None, ("datasetType.isCalibration() is set by "
                                                       "the presence of calibDate in the Gen2 template")
                        # Build a mapping that lets us find DatasetRefs by data ID,
                        # for this DatasetType and calibDate.  We know there is
                        # only one ref for each data ID (given DatasetType and
                        # calibDate as well).
                        refsByDataId = {}
                        for dataset in datasetsForCalibDate:
                            refsByDataId.update((ref.dataId, ref) for ref in dataset.refs)
                        # Query the Gen2 calibration repo for the validity ranges
                        # for this DatasetType and calibDate, and look up the
                        # appropriate refs by data ID.
                        for row in self._queryGen2CalibRegistry(db, datasetType, calibDate):
                            # For validity times we use TAI as some gen2 repos have
                            # validity dates very far in the past or future.
                            timespan = Timespan(
                                astropy.time.Time(row["validStart"], format="iso", scale="tai"),
                                astropy.time.Time(row["validEnd"], format="iso", scale="tai"),
                            )
                            # Make a Gen2 data ID from query results.
                            gen2id = {}
                            if "detector" in datasetType.dimensions.names:
                                gen2id[self.task.config.ccdKey] = row[self.task.config.ccdKey]
                            if "physical_filter" in datasetType.dimensions.names:
                                gen2id["filter"] = row["filter"]
                            # Translate that to Gen3.
                            gen3id, _ = translator(gen2id)
                            dataId = DataCoordinate.standardize(gen3id, graph=datasetType.dimensions)
                            ref = refsByDataId.get(dataId)
                            if ref is not None:
                                # Validity ranges must not overlap for the same
                                # dataID datasetType combination. Use that as a
                                # primary key and store the timespan and ref in a
                                # tuple as the value for later timespan validation.
                                key = (ref.dataId, ref.datasetType.name)
                                timespansByDataId[key].append((timespan, ref))
                            else:
                                # The Gen2 calib registry mentions this dataset,
                                # but it isn't included in what we've ingested.
                                # This might sometimes be a problem, but it should
                                # usually represent someone just trying to convert
                                # a subset of the Gen2 repo, so I don't think it's
                                # appropriate to warn or even log at info, since in
                                # that case there may be a _lot_ of these messages.
                                self.task.log.debug(
                                    "Gen2 calibration registry entry has no dataset: "
                                    "%s for calibDate=%s, %s.",
                                    datasetType.name, calibDate, dataId
                                )
                        progressBar.update(len(datasetsForCalibDate))
        finally:
            db.close()

        # Analyze the timespans to check for overlap problems
        # Gaps of a day should be closed since we assume differing
        # conventions in gen2 repos.

        # We need to correct any validity range issues and store the
        # results in a dict-of-lists keyed by Timespan, since
        # Registry.certify operates on one Timespan and multiple refs at a
        # time.
        refsByTimespan = defaultdict(list)

        # A day with a bit of fuzz to indicate the largest gap we will close
        max_gap = astropy.time.TimeDelta(1.001, format="jd", scale="tai")

        # Since in many cases the validity ranges are relevant for multiple
        # dataset types and dataIds we don't want to over-report and so
        # cache the messages for later.
        info_messages = set()
        warn_messages = set()
        for timespans in self.progress.wrap(timespansByDataId.values(), desc="Fixing validity ranges"):
            # Sort all the timespans and check overlaps
            sorted_timespans = sorted(timespans, key=lambda x: x[0])
            timespan_prev, ref_prev = sorted_timespans.pop(0)
            for timespan, ref in sorted_timespans:
                # See if we have a suspicious gap
                delta = timespan.begin - timespan_prev.end
                abs_delta = abs(delta)
                if abs_delta > 0 and abs_delta < max_gap:
                    if delta > 0:
                        # Gap between timespans
                        msg = f"Calibration validity gap closed from {timespan_prev.end} to {timespan.begin}"
                        info_messages.add(msg)
                    else:
                        # Overlap of timespans
                        msg = f"Calibration validity overlap of {abs_delta.to(u.s)} removed for period " \
                            f"{timespan.begin} to {timespan_prev.end}"
                        warn_messages.add(msg)

                    self.task.log.debug("Correcting validity range for %s with end %s",
                                        ref_prev, timespan_prev.end)

                    # Assume this gap is down to convention in gen2.
                    # We have to adjust the previous timespan to fit
                    # since we always trust validStart.
                    timespan_prev = Timespan(begin=timespan_prev.begin,
                                             end=timespan.begin)
                # Store the previous timespan and ref since it has now
                # been verified
                refsByTimespan[timespan_prev].append(ref_prev)

                # And update the previous values for the next iteration
                timespan_prev = timespan
                ref_prev = ref

            # Store the final timespan/ref pair
            refsByTimespan[timespan_prev].append(ref_prev)

        # Issue any pending log messages we have recorded
        for msg in sorted(info_messages):
            self.task.log.info(msg)
        for msg in sorted(warn_messages):
            self.task.log.warning(msg)

        # Done reading from Gen2, time to certify into Gen3.
        self.task.registry.registerCollection(self.collection, type=CollectionType.CALIBRATION)
        for timespan, refs in refsByTimespan.items():
            self.task.registry.certify(self.collection, refs, timespan)

    def getRun(self, datasetTypeName: str, calibDate: Optional[str] = None) -> str:
        # Docstring inherited from RepoConverter.
        if calibDate is None:
            return super().getRun(datasetTypeName)
        else:
            return self.instrument.makeCalibrationCollectionName(
                *self._labels,
                self.instrument.formatCollectionTimestamp(calibDate),
            )

    # Class attributes that will be shadowed by public instance attributes;
    # defined here only for documentation purposes.

    mapper: CameraMapper
    """Gen2 mapper associated with this repository.
    """