Hide keyboard shortcuts

Hot-keys on this page

r m x p   toggle line displays

j k   next/prev highlighted chunk

0   (zero) top of page

1   (one) first highlighted chunk

1# This file is part of obs_base. 

2# 

3# Developed for the LSST Data Management System. 

4# This product includes software developed by the LSST Project 

5# (https://www.lsst.org). 

6# See the COPYRIGHT file at the top-level directory of this distribution 

7# for details of code ownership. 

8# 

9# This program is free software: you can redistribute it and/or modify 

10# it under the terms of the GNU General Public License as published by 

11# the Free Software Foundation, either version 3 of the License, or 

12# (at your option) any later version. 

13# 

14# This program is distributed in the hope that it will be useful, 

15# but WITHOUT ANY WARRANTY; without even the implied warranty of 

16# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the 

17# GNU General Public License for more details. 

18# 

19# You should have received a copy of the GNU General Public License 

20# along with this program. If not, see <http://www.gnu.org/licenses/>. 

21from __future__ import annotations 

22 

23__all__ = ["ConvertRepoConfig", "ConvertRepoTask", "ConvertRepoSkyMapConfig"] 

24 

25import os 

26import fnmatch 

27from dataclasses import dataclass 

28from typing import Iterable, Optional, List, Dict 

29 

30from lsst.utils import doImport 

31from lsst.daf.butler import ( 

32 Butler as Butler3, 

33 SkyPixDimension 

34) 

35from lsst.pex.config import Config, ConfigurableField, ConfigDictField, DictField, ListField, Field 

36from lsst.pipe.base import Task 

37from lsst.skymap import skyMapRegistry, BaseSkyMap 

38 

39from ..ingest import RawIngestTask 

40from .repoConverter import ConversionSubset 

41from .rootRepoConverter import RootRepoConverter 

42from .calibRepoConverter import CalibRepoConverter 

43from .standardRepoConverter import StandardRepoConverter 

44 

45 

@dataclass
class ConfiguredSkyMap:
    """Struct containing information about a skymap that may appear in a Gen2
    repository.
    """

    name: str
    """Name of the skymap used in Gen3 data IDs.
    """

    sha1: bytes
    """Hash computed by `BaseSkyMap.getSha1`.
    """

    instance: BaseSkyMap
    """The skymap object itself (the one ``sha1`` was computed from).
    """

    used: bool = False
    """Whether this skymap has been found in at least one repository being
    converted.
    """

68 

69 

class ConvertRepoSkyMapConfig(Config):
    """Sub-config holding the type and parameters of a single SkyMap.

    Notes
    -----
    The sole reason this class exists is that a
    `~lsst.pex.config.RegistryField` cannot be placed directly inside a
    `~lsst.pex.config.ConfigDictField`.

    Its one field must be called "skyMap" to stay compatible with the
    configuration of `lsst.pipe.tasks.MakeSkyMapTask`, so that a single
    config file in an obs package can configure both.

    The price is some repetition with the "skyMaps" field that holds these
    configs ("skyMaps[name].skyMap"), but that seems unavoidable.
    """
    skyMap = skyMapRegistry.makeField(
        default="dodeca",
        doc="Type and parameters for the SkyMap itself.",
    )

91 

92 

class ConvertRepoConfig(Config):
    """Configuration for `ConvertRepoTask`.

    Notes
    -----
    ``transfer`` and ``instrument`` are not config fields of their own; they
    are properties that read from and write to the attributes of the same
    names on the ``raws`` subtask configuration.
    """
    raws = ConfigurableField(
        "Configuration for subtask responsible for ingesting raws and adding "
        "visit and exposure dimension entries.",
        target=RawIngestTask,
    )
    skyMaps = ConfigDictField(
        "Mapping from Gen3 skymap name to the parameters used to construct a "
        "BaseSkyMap instance. This will be used to associate names with "
        "existing skymaps found in the Gen2 repo.",
        keytype=str,
        itemtype=ConvertRepoSkyMapConfig,
        default={}
    )
    rootSkyMapName = Field(
        "Name of a Gen3 skymap (an entry in ``self.skyMaps``) to assume for "
        "datasets in the root repository when no SkyMap is found there. ",
        dtype=str,
        optional=True,
        default=None,
    )
    collections = DictField(
        "Special collections (values) for certain dataset types (keys). "
        "These are used in addition to rerun collections for datasets in "
        "reruns. The 'raw' dataset must have an entry here if it is to be "
        "converted.",
        keytype=str,
        itemtype=str,
        default={
            "deepCoadd_skyMap": "skymaps",
            "brightObjectMask": "masks",
        }
    )
    storageClasses = DictField(
        "Mapping from dataset type name or Gen2 policy entry (e.g. 'python' "
        "or 'persistable') to the Gen3 StorageClass name.",
        keytype=str,
        itemtype=str,
        default={
            "bias": "ExposureF",
            "dark": "ExposureF",
            "flat": "ExposureF",
            "defects": "Defects",
            "BaseSkyMap": "SkyMap",
            "BaseCatalog": "Catalog",
            "BackgroundList": "Background",
            "raw": "Exposure",
            "MultilevelParquetTable": "DataFrame",
            "ParquetTable": "DataFrame",
            "SkyWcs": "Wcs",
        }
    )
    doRegisterInstrument = Field(
        "If True (default), add dimension records for the Instrument and its "
        "filters and detectors to the registry instead of assuming they are "
        "already present.",
        dtype=bool,
        default=True,
    )
    doWriteCuratedCalibrations = Field(
        "If True (default), ingest human-curated calibrations directly via "
        "the Instrument interface. Note that these calibrations are never "
        "converted from Gen2 repositories.",
        dtype=bool,
        default=True,
    )
    refCats = ListField(
        "The names of reference catalogs (subdirectories under ref_cats) to "
        "be converted",
        dtype=str,
        default=[]
    )
    fileIgnorePatterns = ListField(
        "Filename globs that should be ignored instead of being treated as "
        "datasets.",
        dtype=str,
        default=["README.txt", "*~?", "butler.yaml", "gen3.sqlite3",
                 "registry.sqlite3", "calibRegistry.sqlite3", "_mapper",
                 "_parent", "repositoryCfg.yaml"]
    )
    datasetIncludePatterns = ListField(
        "Glob-style patterns for dataset type names that should be converted.",
        dtype=str,
        default=["*"]
    )
    datasetIgnorePatterns = ListField(
        "Glob-style patterns for dataset type names that should not be "
        "converted despite matching a pattern in datasetIncludePatterns.",
        dtype=str,
        default=[]
    )
    ccdKey = Field(
        "Key used for the Gen2 equivalent of 'detector' in data IDs.",
        dtype=str,
        default="ccd",
    )
    # The help text used to claim that True was the default even though the
    # field defaults to False; the text now matches the actual default.
    relatedOnly = Field(
        "If True, only convert datasets that are related to the "
        "ingested visits (default is False). Ignored unless a list of "
        "visits is passed to run().",
        dtype=bool,
        default=False,
    )
    curatedCalibrations = ListField(
        "Dataset types that are handled by `Instrument.writeCuratedCalibrations()` "
        "and thus should not be converted using the standard calibration "
        "conversion system.",
        dtype=str,
        default=["camera",
                 "transmission_sensor",
                 "transmission_filter",
                 "transmission_optics",
                 "transmission_atmosphere",
                 "bfKernel"]
    )

    @property
    def transfer(self):
        """Transfer setting, stored as ``self.raws.transfer``."""
        return self.raws.transfer

    @transfer.setter
    def transfer(self, value):
        self.raws.transfer = value

    @property
    def instrument(self):
        """Instrument setting, stored as ``self.raws.instrument``."""
        return self.raws.instrument

    @instrument.setter
    def instrument(self, value):
        self.raws.instrument = value

    def setDefaults(self):
        """Override defaults: ``transfer`` (on ``raws``) is set to `None`."""
        self.transfer = None

    # TODO: check that there are no collection overrides for curated
    # calibrations, since we don't have a good way to utilize them.

230 

231 

class ConvertRepoTask(Task):
    """A task that converts one or more related Gen2 data repositories to a
    single Gen3 data repository (with multiple collections).

    Parameters
    ----------
    config : `ConvertRepoConfig`
        Configuration for this task.
    butler3 : `lsst.daf.butler.Butler`
        Gen3 Butler instance that represents the data repository datasets will
        be ingested into.  The collection and/or run associated with this
        Butler will be ignored in favor of collections/runs passed via config
        or to `run`.
    **kwds
        Other keyword arguments are forwarded to the `Task` constructor.

    Notes
    -----
    Most of the work of converting repositories is delegated to instances of
    the `RepoConverter` hierarchy.  The `ConvertRepoTask` instance itself holds
    only state that is relevant for all Gen2 repositories being ingested, while
    each `RepoConverter` instance holds only state relevant for the conversion
    of a single Gen2 repository.  Both the task and the `RepoConverter`
    instances are single use; `ConvertRepoTask.run` and most `RepoConverter`
    methods may only be called once on a particular instance.
    """

    ConfigClass = ConvertRepoConfig

    _DefaultName = "convertRepo"

    def __init__(self, config=None, *, butler3: Butler3, **kwds):
        super().__init__(config, **kwds)
        # Not a CmdlineTask nor PipelineTask, so have to validate the config
        # here.  This is done on ``self.config`` after the base-class
        # constructor has run, rather than on the ``config`` argument, because
        # that argument is allowed to be `None` by the signature.
        self.config.validate()
        self.butler3 = butler3
        self.registry = self.butler3.registry
        self.universe = self.registry.dimensions
        if self.isDatasetTypeIncluded("raw"):
            self.makeSubtask("raws", butler=butler3)
            self.instrument = self.raws.instrument
        else:
            # No raw ingest subtask is needed, but an instrument instance is
            # still required by `run`.
            self.raws = None
            self.instrument = doImport(self.config.instrument)()
        # Skymaps known to this task, indexed both by hash and by Gen3 name.
        self._configuredSkyMapsBySha1 = {}
        self._configuredSkyMapsByName = {}
        for name, skyMapConfig in self.config.skyMaps.items():
            instance = skyMapConfig.skyMap.apply()
            self._populateSkyMapDicts(name, instance)
        self._usedSkyPix = set()

    def _populateSkyMapDicts(self, name, instance):
        """Record a skymap in both the by-SHA1 and by-name lookup tables.

        Parameters
        ----------
        name : `str`
            Name to associate with the skymap.
        instance : `lsst.skymap.BaseSkyMap`
            The skymap instance; its ``getSha1()`` value is used as the key
            of the by-SHA1 table.
        """
        struct = ConfiguredSkyMap(name=name, sha1=instance.getSha1(), instance=instance)
        self._configuredSkyMapsBySha1[struct.sha1] = struct
        self._configuredSkyMapsByName[struct.name] = struct

    def isDatasetTypeIncluded(self, datasetTypeName: str) -> bool:
        """Return `True` if configuration indicates that the given dataset type
        should be converted.

        This method is intended to be called primarily by the
        `RepoConverter` instances used internally by the task.

        Parameters
        ----------
        datasetTypeName : `str`
            Name of the dataset type.

        Returns
        -------
        included : `bool`
            Whether the dataset should be included in the conversion: `True`
            if the name matches at least one pattern in
            ``config.datasetIncludePatterns`` and no pattern in
            ``config.datasetIgnorePatterns`` (case-sensitive glob matching).
        """
        return (
            any(fnmatch.fnmatchcase(datasetTypeName, pattern)
                for pattern in self.config.datasetIncludePatterns)
            and not any(fnmatch.fnmatchcase(datasetTypeName, pattern)
                        for pattern in self.config.datasetIgnorePatterns)
        )

    def useSkyMap(self, skyMap: BaseSkyMap, skyMapName: str) -> str:
        """Indicate that a repository uses the given SkyMap.

        This method is intended to be called primarily by the
        `RepoConverter` instances used internally by the task.

        Parameters
        ----------
        skyMap : `lsst.skymap.BaseSkyMap`
            SkyMap instance being used, typically retrieved from a Gen2
            data repository.
        skyMapName : `str`
            The name of the gen2 skymap.  It is used for error reporting,
            and as the name under which the skymap is recorded when its hash
            does not match any skymap already known to the task.

        Returns
        -------
        name : `str`
            The name of the skymap in Gen3 data IDs.

        Raises
        ------
        LookupError
            Raised if the skymap cannot be found by its hash even after the
            attempt to record it under ``skyMapName``.
        """
        sha1 = skyMap.getSha1()
        if sha1 not in self._configuredSkyMapsBySha1:
            # Unknown skymaps are adopted under the name supplied by the
            # caller instead of being rejected.
            self._populateSkyMapDicts(skyMapName, skyMap)
        try:
            struct = self._configuredSkyMapsBySha1[sha1]
        except KeyError as err:
            # Kept as a safeguard; given the insertion just above, this
            # should only trigger if ``getSha1()`` is not repeatable.
            msg = f"SkyMap '{skyMapName}' with sha1={sha1} not included in configuration."
            raise LookupError(msg) from err
        struct.used = True
        return struct.name

    def registerUsedSkyMaps(self, subset: Optional[ConversionSubset]):
        """Register all skymaps that have been marked as used.

        This method is intended to be called primarily by the
        `RepoConverter` instances used internally by the task.

        Parameters
        ----------
        subset : `ConversionSubset`, optional
            Object that will be used to filter converted datasets by data ID.
            If given (and ``config.relatedOnly`` is `True`), it will be
            updated with the tracts of this skymap that overlap the visits in
            the subset.
        """
        for struct in self._configuredSkyMapsBySha1.values():
            if struct.used:
                struct.instance.register(struct.name, self.registry)
                if subset is not None and self.config.relatedOnly:
                    subset.addSkyMap(self.registry, struct.name)

    def useSkyPix(self, dimension: SkyPixDimension):
        """Indicate that a repository uses the given SkyPix dimension.

        This method is intended to be called primarily by the
        `RepoConverter` instances used internally by the task.

        Parameters
        ----------
        dimension : `lsst.daf.butler.SkyPixDimension`
            Dimension representing a pixelization of the sky.
        """
        self._usedSkyPix.add(dimension)

    def registerUsedSkyPix(self, subset: Optional[ConversionSubset]):
        """Add all SkyPix dimensions that have been marked as used to the
        given conversion subset.

        This method is intended to be called primarily by the
        `RepoConverter` instances used internally by the task.

        Parameters
        ----------
        subset : `ConversionSubset`, optional
            Object that will be used to filter converted datasets by data ID.
            If given (and ``config.relatedOnly`` is `True`), it will be
            updated with the pixelization IDs that overlap the visits in the
            subset.  Otherwise this method does nothing.
        """
        if subset is not None and self.config.relatedOnly:
            for dimension in self._usedSkyPix:
                subset.addSkyPix(self.registry, dimension)

    def run(self, root: str, collections: List[str], *,
            calibs: Optional[Dict[str, List[str]]] = None,
            reruns: Optional[Dict[str, List[str]]] = None,
            visits: Optional[Iterable[int]] = None):
        """Convert a group of related data repositories.

        Parameters
        ----------
        root : `str`
            Complete path to the root Gen2 data repository.  This should be
            a data repository that includes a Gen2 registry and any raw files
            and/or reference catalogs.
        collections : `list` of `str`
            Gen3 collections that datasets from the root repository should be
            associated with.  This should include any rerun collection that
            these datasets should also be considered to be part of; because of
            structural difference between Gen2 parent/child relationships and
            Gen3 collections, these cannot be reliably inferred.
        calibs : `dict`, optional
            Dictionary mapping calibration repository path to the collections
            that the repository's datasets should be associated with.  The path
            may be relative to ``root`` or absolute.  Collections should
            include child repository collections as appropriate (see
            documentation for ``collections``).  `None` (default) is treated
            as an empty dictionary.
        reruns : `dict`, optional
            Dictionary mapping rerun repository path to the collections that
            the repository's datasets should be associated with.  The path may
            be relative to ``root`` or absolute.  Collections should include
            child repository collections as appropriate (see documentation for
            ``collections``).  `None` (default) is treated as an empty
            dictionary.
        visits : iterable of `int`, optional
            The integer IDs of visits to convert.  If not provided, all visits
            in the Gen2 root repository will be converted.
        """
        if calibs is None:
            calibs = {}
        if reruns is None:
            reruns = {}
        if visits is not None:
            subset = ConversionSubset(instrument=self.instrument.getName(), visits=frozenset(visits))
        else:
            if self.config.relatedOnly:
                self.log.warn("config.relatedOnly is True but all visits are being ingested; "
                              "no filtering will be done.")
            subset = None

        # We can't wrap database writes sanely in transactions (yet) because we
        # keep initializing new Butler instances just so we can write into new
        # runs/collections, and transactions are managed at the Butler level.
        # DM-21246 should let us fix this, assuming we actually want to keep
        # the transaction open that long.
        if self.config.doRegisterInstrument:
            self.instrument.register(self.registry)

        # Make and prep converters for all Gen2 repos.  This should not modify
        # the Registry database or filesystem at all, though it may query it.
        # The prep() calls here will be some of the slowest ones, because
        # that's when we walk the filesystem.
        converters = []
        rootConverter = RootRepoConverter(task=self, root=root, collections=collections, subset=subset)
        rootConverter.prep()
        converters.append(rootConverter)

        # Distinct loop-variable names are used below so that the ``root`` and
        # ``collections`` arguments are not silently rebound.
        for calibRoot, calibCollections in calibs.items():
            if not os.path.isabs(calibRoot):
                calibRoot = os.path.join(rootConverter.root, calibRoot)
            converter = CalibRepoConverter(task=self, root=calibRoot, collections=calibCollections,
                                           mapper=rootConverter.mapper,
                                           subset=rootConverter.subset)
            converter.prep()
            converters.append(converter)

        for rerunRoot, rerunCollections in reruns.items():
            if not os.path.isabs(rerunRoot):
                rerunRoot = os.path.join(rootConverter.root, rerunRoot)
            converter = StandardRepoConverter(task=self, root=rerunRoot, collections=rerunCollections,
                                              subset=rootConverter.subset)
            converter.prep()
            converters.append(converter)

        # Actual database writes by the converters start here (outside any
        # transaction; see the comment on instrument registration above).

        # Insert dimensions needed by any converters.  These are only the
        # dimensions that a converter expects to be uniquely derived from the
        # Gen2 repository it is responsible for - e.g. visits, exposures, and
        # calibration_labels.
        #
        # Note that we do not try to filter dimensions down to just those
        # related to the given visits, even if config.relatedOnly is True; we
        # need them in the Gen3 repo in order to be able to know which datasets
        # to convert, because Gen2 alone doesn't know enough about the
        # relationships between data IDs.
        for converter in converters:
            converter.insertDimensionData()

        # Insert dimensions that are potentially shared by all Gen2
        # repositories (and are hence managed directly by the Task, rather
        # than a converter instance).
        # This also finishes setting up the (shared) converter.subsets object
        # that is used to filter data IDs for config.relatedOnly.
        self.registerUsedSkyMaps(rootConverter.subset)
        self.registerUsedSkyPix(rootConverter.subset)

        # Look for datasets, generally by scanning the filesystem.
        # This requires dimensions to have already been inserted so we can use
        # dimension information to identify related datasets.
        for converter in converters:
            converter.findDatasets()

        # Expand data IDs.
        for converter in converters:
            converter.expandDataIds()

        # Actually ingest datasets.
        for converter in converters:
            converter.ingest()