Hide keyboard shortcuts

Hot-keys on this page

r m x p   toggle line displays

j k   next/prev highlighted chunk

0   (zero) top of page

1   (one) first highlighted chunk

1# This file is part of obs_base. 

2# 

3# Developed for the LSST Data Management System. 

4# This product includes software developed by the LSST Project 

5# (https://www.lsst.org). 

6# See the COPYRIGHT file at the top-level directory of this distribution 

7# for details of code ownership. 

8# 

9# This program is free software: you can redistribute it and/or modify 

10# it under the terms of the GNU General Public License as published by 

11# the Free Software Foundation, either version 3 of the License, or 

12# (at your option) any later version. 

13# 

14# This program is distributed in the hope that it will be useful, 

15# but WITHOUT ANY WARRANTY; without even the implied warranty of 

16# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the 

17# GNU General Public License for more details. 

18# 

19# You should have received a copy of the GNU General Public License 

20# along with this program. If not, see <http://www.gnu.org/licenses/>. 

21from __future__ import annotations 

22 

23__all__ = ["ConvertRepoConfig", "ConvertRepoTask", "ConvertRepoSkyMapConfig"] 

24 

25import os 

26import fnmatch 

27from dataclasses import dataclass 

28from typing import Iterable, Optional, List, Dict 

29 

30from lsst.utils import doImport 

31from lsst.daf.butler import ( 

32 Butler as Butler3, 

33 SkyPixDimension 

34) 

35from lsst.pex.config import Config, ConfigurableField, ConfigDictField, DictField, ListField, Field 

36from lsst.pipe.base import Task 

37from lsst.skymap import skyMapRegistry, BaseSkyMap 

38 

39from ..ingest import RawIngestTask 

40from .repoConverter import ConversionSubset 

41from .rootRepoConverter import RootRepoConverter 

42from .calibRepoConverter import CalibRepoConverter 

43from .standardRepoConverter import StandardRepoConverter 

44 

45 

@dataclass
class ConfiguredSkyMap:
    """Struct containing information about a skymap that may appear in a Gen2
    repository.
    """

    name: str
    """Name of the skymap used in Gen3 data IDs.
    """

    sha1: bytes
    """Hash computed by `BaseSkyMap.getSha1`.
    """

    instance: BaseSkyMap
    """The skymap object itself (the one `sha1` was computed from).
    """

    used: bool = False
    """Whether this skymap has been found in at least one repository being
    converted.
    """

68 

69 

class ConvertRepoSkyMapConfig(Config):
    """Sub-config used to hold the parameters of a SkyMap.

    Notes
    -----
    This config only needs to exist because we can't put a
    `~lsst.pex.config.RegistryField` directly inside a
    `~lsst.pex.config.ConfigDictField`.

    It needs to have its only field named "skyMap" for compatibility with the
    configuration of `lsst.pipe.tasks.MakeSkyMapTask`, which we want so we can
    use one config file in an obs package to configure both.

    This name leads to unfortunate repetition with the field named
    "skyMaps" that holds it (`ConvertRepoConfig.skyMaps`) -
    "skyMaps[name].skyMap" - but that seems unavoidable.
    """
    skyMap = skyMapRegistry.makeField(
        doc="Type and parameters for the SkyMap itself.",
        default="dodeca",
    )

91 

92 

class ConvertRepoConfig(Config):
    """Configuration for `ConvertRepoTask`.

    Notes
    -----
    The ``transfer`` and ``instrument`` properties are aliases for the
    attributes of the same name on the ``raws`` sub-configuration, so that
    they can be set directly on this object.
    """

    raws = ConfigurableField(
        "Configuration for subtask responsible for ingesting raws and adding "
        "visit and exposure dimension entries.",
        target=RawIngestTask,
    )
    skyMaps = ConfigDictField(
        "Mapping from Gen3 skymap name to the parameters used to construct a "
        "BaseSkyMap instance. This will be used to associate names with "
        "existing skymaps found in the Gen2 repo.",
        keytype=str,
        itemtype=ConvertRepoSkyMapConfig,
        default={}
    )
    rootSkyMapName = Field(
        "Name of a Gen3 skymap (an entry in ``self.skyMaps``) to assume for "
        "datasets in the root repository when no SkyMap is found there. ",
        dtype=str,
        optional=True,
        default=None,
    )
    collections = DictField(
        "Special collections (values) for certain dataset types (keys). "
        "These are used in addition to rerun collections for datasets in "
        "reruns. The 'raw' dataset must have an entry here if it is to be "
        "converted.",
        keytype=str,
        itemtype=str,
        default={
            "deepCoadd_skyMap": "skymaps",
            "brightObjectMask": "masks",
        }
    )
    storageClasses = DictField(
        "Mapping from dataset type name or Gen2 policy entry (e.g. 'python' "
        "or 'persistable') to the Gen3 StorageClass name.",
        keytype=str,
        itemtype=str,
        default={
            "bias": "ExposureF",
            "dark": "ExposureF",
            "flat": "ExposureF",
            "defects": "Defects",
            "BaseSkyMap": "SkyMap",
            "BaseCatalog": "Catalog",
            "BackgroundList": "Background",
            "raw": "Exposure",
            "MultilevelParquetTable": "DataFrame",
            "ParquetTable": "DataFrame",
            "SkyWcs": "Wcs",
        }
    )
    formatterClasses = DictField(
        "Mapping from dataset type name to formatter class. "
        "By default these are derived from the formatters listed in the"
        " Gen3 datastore configuration.",
        keytype=str,
        itemtype=str,
        default={}
    )
    targetHandlerClasses = DictField(
        "Mapping from dataset type name to target handler class.",
        keytype=str,
        itemtype=str,
        default={}
    )
    doRegisterInstrument = Field(
        "If True (default), add dimension records for the Instrument and its "
        "filters and detectors to the registry instead of assuming they are "
        "already present.",
        dtype=bool,
        default=True,
    )
    doWriteCuratedCalibrations = Field(
        "If True (default), ingest human-curated calibrations directly via "
        "the Instrument interface. Note that these calibrations are never "
        "converted from Gen2 repositories.",
        dtype=bool,
        default=True,
    )
    refCats = ListField(
        "The names of reference catalogs (subdirectories under ref_cats) to "
        "be converted",
        dtype=str,
        default=[]
    )
    fileIgnorePatterns = ListField(
        "Filename globs that should be ignored instead of being treated as "
        "datasets.",
        dtype=str,
        default=["README.txt", "*~?", "butler.yaml", "gen3.sqlite3",
                 "registry.sqlite3", "calibRegistry.sqlite3", "_mapper",
                 "_parent", "repositoryCfg.yaml"]
    )
    rawDatasetType = Field(
        "Gen2 dataset type to use for raw data.",
        dtype=str,
        default="raw",
    )
    datasetIncludePatterns = ListField(
        "Glob-style patterns for dataset type names that should be converted.",
        dtype=str,
        default=["*"]
    )
    datasetIgnorePatterns = ListField(
        "Glob-style patterns for dataset type names that should not be "
        "converted despite matching a pattern in datasetIncludePatterns.",
        dtype=str,
        default=[]
    )
    ccdKey = Field(
        "Key used for the Gen2 equivalent of 'detector' in data IDs.",
        dtype=str,
        default="ccd",
    )
    # The documentation used to claim that True was the default, which
    # contradicted ``default=False``; the text now matches the actual default.
    relatedOnly = Field(
        "If True, only convert datasets that are related to the "
        "ingested visits (default is False). Ignored unless a list of "
        "visits is passed to run().",
        dtype=bool,
        default=False,
    )
    curatedCalibrations = ListField(
        "Dataset types that are handled by `Instrument.writeCuratedCalibrations()` "
        "and thus should not be converted using the standard calibration "
        "conversion system.",
        dtype=str,
        default=["camera",
                 "transmission_sensor",
                 "transmission_filter",
                 "transmission_optics",
                 "transmission_atmosphere",
                 "bfKernel"]
    )

    @property
    def transfer(self):
        """Alias for ``self.raws.transfer``."""
        return self.raws.transfer

    @transfer.setter
    def transfer(self, value):
        self.raws.transfer = value

    @property
    def instrument(self):
        """Alias for ``self.raws.instrument``."""
        return self.raws.instrument

    @instrument.setter
    def instrument(self, value):
        self.raws.instrument = value

    def setDefaults(self):
        """Set ``transfer`` (i.e. ``raws.transfer``) to `None` by default."""
        self.transfer = None

    # TODO: check that there are no collection overrides for curated
    # calibrations, since we don't have a good way to utilize them.

249 

250 

class ConvertRepoTask(Task):
    """A task that converts one or more related Gen2 data repositories to a
    single Gen3 data repository (with multiple collections).

    Parameters
    ----------
    config: `ConvertRepoConfig`
        Configuration for this task.
    butler3: `lsst.daf.butler.Butler`
        Gen3 Butler instance that represents the data repository datasets will
        be ingested into.  The collection and/or run associated with this
        Butler will be ignored in favor of collections/runs passed via config
        or to `run`.
    kwds
        Other keyword arguments are forwarded to the `Task` constructor.

    Notes
    -----
    Most of the work of converting repositories is delegated to instances of
    the `RepoConverter` hierarchy.  The `ConvertRepoTask` instance itself holds
    only state that is relevant for all Gen2 repositories being ingested, while
    each `RepoConverter` instance holds only state relevant for the conversion
    of a single Gen2 repository.  Both the task and the `RepoConverter`
    instances are single use; `ConvertRepoTask.run` and most `RepoConverter`
    methods may only be called once on a particular instance.
    """

    ConfigClass = ConvertRepoConfig

    _DefaultName = "convertRepo"

    def __init__(self, config=None, *, butler3: Butler3, **kwds):
        # The signature allows ``config=None``; build the default
        # configuration ourselves so the validate() call below has an object
        # to work on instead of failing with AttributeError.
        if config is None:
            config = self.ConfigClass()
        # Not a CmdlineTask nor PipelineTask, so have to validate the config
        # here.
        config.validate()
        super().__init__(config, **kwds)
        self.butler3 = butler3
        self.registry = self.butler3.registry
        self.universe = self.registry.dimensions
        if self.isDatasetTypeIncluded("raw"):
            # Raws are being converted: let the ingest subtask provide the
            # instrument.
            self.makeSubtask("raws", butler=butler3)
            self.instrument = self.raws.instrument
        else:
            # No raw ingest; import and instantiate the instrument class named
            # in the configuration ourselves.
            self.raws = None
            self.instrument = doImport(self.config.instrument)()
        self._configuredSkyMapsBySha1 = {}
        self._configuredSkyMapsByName = {}
        # Loop variable deliberately not called ``config`` to avoid shadowing
        # the constructor argument.
        for name, skyMapConfig in self.config.skyMaps.items():
            instance = skyMapConfig.skyMap.apply()
            self._populateSkyMapDicts(name, instance)
        self._usedSkyPix = set()

    def _populateSkyMapDicts(self, name, instance):
        """Record a skymap in both the by-hash and by-name lookup tables.

        Parameters
        ----------
        name : `str`
            Name to associate with the skymap.
        instance : `lsst.skymap.BaseSkyMap`
            The skymap object; its ``getSha1()`` result is used as the hash
            key.
        """
        struct = ConfiguredSkyMap(name=name, sha1=instance.getSha1(), instance=instance)
        self._configuredSkyMapsBySha1[struct.sha1] = struct
        self._configuredSkyMapsByName[struct.name] = struct

    def isDatasetTypeIncluded(self, datasetTypeName: str) -> bool:
        """Return `True` if configuration indicates that the given dataset type
        should be converted.

        This method is intended to be called primarily by the
        `RepoConverter` instances used internally by the task.

        Parameters
        ----------
        datasetTypeName: str
            Name of the dataset type.

        Returns
        -------
        included : `bool`
            Whether the dataset should be included in the conversion: `True`
            if the name matches at least one of
            ``config.datasetIncludePatterns`` and none of
            ``config.datasetIgnorePatterns`` (case-sensitive glob matching).
        """
        return (
            any(fnmatch.fnmatchcase(datasetTypeName, pattern)
                for pattern in self.config.datasetIncludePatterns)
            and not any(fnmatch.fnmatchcase(datasetTypeName, pattern)
                        for pattern in self.config.datasetIgnorePatterns)
        )

    def useSkyMap(self, skyMap: BaseSkyMap, skyMapName: str) -> str:
        """Indicate that a repository uses the given SkyMap.

        This method is intended to be called primarily by the
        `RepoConverter` instances used internally by the task.

        Parameters
        ----------
        skyMap : `lsst.skymap.BaseSkyMap`
            SkyMap instance being used, typically retrieved from a Gen2
            data repository.
        skyMapName : `str`
            The name of the gen2 skymap.  It is used for error reporting and,
            when no already-known skymap has the same hash, as the name under
            which ``skyMap`` is recorded.

        Returns
        -------
        name : `str`
            The name of the skymap in Gen3 data IDs.

        Raises
        ------
        LookupError
            Raised if the specified skymap cannot be found.  Because unknown
            skymaps are recorded on the fly just before the lookup, this is
            not expected to happen with the bookkeeping done in this class;
            the check is kept as a guard.
        """
        sha1 = skyMap.getSha1()
        if sha1 not in self._configuredSkyMapsBySha1:
            # Not configured (or seen) before: record it under its Gen2 name.
            self._populateSkyMapDicts(skyMapName, skyMap)
        try:
            struct = self._configuredSkyMapsBySha1[sha1]
        except KeyError as err:
            msg = f"SkyMap '{skyMapName}' with sha1={sha1} not included in configuration."
            raise LookupError(msg) from err
        struct.used = True
        return struct.name

    def registerUsedSkyMaps(self, subset: Optional[ConversionSubset]):
        """Register all skymaps that have been marked as used.

        This method is intended to be called primarily by the
        `RepoConverter` instances used internally by the task.

        Parameters
        ----------
        subset : `ConversionSubset`, optional
            Object that will be used to filter converted datasets by data ID.
            If given (and ``config.relatedOnly`` is `True`), it will be
            updated with the tracts of this skymap that overlap the visits in
            the subset.
        """
        for struct in self._configuredSkyMapsBySha1.values():
            if struct.used:
                struct.instance.register(struct.name, self.registry)
                if subset is not None and self.config.relatedOnly:
                    subset.addSkyMap(self.registry, struct.name)

    def useSkyPix(self, dimension: SkyPixDimension):
        """Indicate that a repository uses the given SkyPix dimension.

        This method is intended to be called primarily by the
        `RepoConverter` instances used internally by the task.

        Parameters
        ----------
        dimension : `lsst.daf.butler.SkyPixDimension`
            Dimension representing a pixelization of the sky.
        """
        self._usedSkyPix.add(dimension)

    def registerUsedSkyPix(self, subset: Optional[ConversionSubset]):
        """Add all SkyPix dimensions that have been marked as used to the
        given subset.

        This method is intended to be called primarily by the
        `RepoConverter` instances used internally by the task.

        Parameters
        ----------
        subset : `ConversionSubset`, optional
            Object that will be used to filter converted datasets by data ID.
            If given (and ``config.relatedOnly`` is `True`), it will be
            updated with the pixelization IDs that overlap the visits in the
            subset.  Otherwise this method does nothing.
        """
        if subset is not None and self.config.relatedOnly:
            for dimension in self._usedSkyPix:
                subset.addSkyPix(self.registry, dimension)

    def run(self, root: str, collections: List[str], *,
            calibs: Optional[Dict[str, List[str]]] = None,
            reruns: Optional[Dict[str, List[str]]] = None,
            visits: Optional[Iterable[int]] = None):
        """Convert a group of related data repositories.

        Parameters
        ----------
        root : `str`
            Complete path to the root Gen2 data repository.  This should be
            a data repository that includes a Gen2 registry and any raw files
            and/or reference catalogs.
        collections : `list` of `str`
            Gen3 collections that datasets from the root repository should be
            associated with.  This should include any rerun collection that
            these datasets should also be considered to be part of; because of
            structural difference between Gen2 parent/child relationships and
            Gen3 collections, these cannot be reliably inferred.
        calibs : `dict`, optional
            Dictionary mapping calibration repository path to the collections
            that the repository's datasets should be associated with.  The path
            may be relative to ``root`` or absolute.  Collections should
            include child repository collections as appropriate (see
            documentation for ``collections``).  `None` (default) is treated
            as an empty dictionary.
        reruns : `dict`, optional
            Dictionary mapping rerun repository path to the collections that
            the repository's datasets should be associated with.  The path may
            be relative to ``root`` or absolute.  Collections should include
            child repository collections as appropriate (see documentation for
            ``collections``).  `None` (default) is treated as an empty
            dictionary.
        visits : iterable of `int`, optional
            The integer IDs of visits to convert.  If not provided, all visits
            in the Gen2 root repository will be converted.

        Notes
        -----
        Failures while registering the instrument or while inserting a
        converter's dimension data are logged as warnings and otherwise
        ignored; the conversion continues.
        """
        if calibs is None:
            calibs = {}
        if reruns is None:
            reruns = {}
        if visits is not None:
            subset = ConversionSubset(instrument=self.instrument.getName(), visits=frozenset(visits))
        else:
            if self.config.relatedOnly:
                self.log.warn("config.relatedOnly is True but all visits are being ingested; "
                              "no filtering will be done.")
            subset = None

        # We can't wrap database writes sanely in transactions (yet) because we
        # keep initializing new Butler instances just so we can write into new
        # runs/collections, and transactions are managed at the Butler level.
        # DM-21246 should let us fix this, assuming we actually want to keep
        # the transaction open that long.
        if self.config.doRegisterInstrument:
            # Allow registration to fail on the assumption that this means
            # we are reusing a butler, but leave a trace in the log rather
            # than discarding the error silently.
            try:
                self.instrument.register(self.registry)
            except Exception as err:
                self.log.warn("Instrument registration failed (assuming the butler is being "
                              "reused); continuing: %s", err)

        # Make and prep converters for all Gen2 repos.  This should not modify
        # the Registry database or filesystem at all, though it may query it.
        # The prep() calls here will be some of the slowest ones, because
        # that's when we walk the filesystem.
        converters = []
        rootConverter = RootRepoConverter(task=self, root=root, collections=collections, subset=subset)
        rootConverter.prep()
        converters.append(rootConverter)

        # Loop variables are named distinctly so they do not shadow the
        # ``root`` and ``collections`` arguments.
        for calibRoot, calibCollections in calibs.items():
            if not os.path.isabs(calibRoot):
                calibRoot = os.path.join(rootConverter.root, calibRoot)
            converter = CalibRepoConverter(task=self, root=calibRoot, collections=calibCollections,
                                           mapper=rootConverter.mapper,
                                           subset=rootConverter.subset)
            converter.prep()
            converters.append(converter)

        for rerunRoot, rerunCollections in reruns.items():
            if not os.path.isabs(rerunRoot):
                rerunRoot = os.path.join(rootConverter.root, rerunRoot)
            converter = StandardRepoConverter(task=self, root=rerunRoot, collections=rerunCollections,
                                              subset=rootConverter.subset)
            converter.prep()
            converters.append(converter)

        # Actual database writes start here (not wrapped in a transaction;
        # see the note about DM-21246 above).

        # Insert dimensions needed by any converters.  These are only the
        # dimensions that a converter expects to be uniquely derived from the
        # Gen2 repository it is responsible for - e.g. visits, exposures, and
        # calibration_labels.
        #
        # Note that we do not try to filter dimensions down to just those
        # related to the given visits, even if config.relatedOnly is True; we
        # need them in the Gen3 repo in order to be able to know which datasets
        # to convert, because Gen2 alone doesn't know enough about the
        # relationships between data IDs.
        for converter in converters:
            # Best effort, as before, but report the failure instead of hiding
            # it completely.
            try:
                converter.insertDimensionData()
            except Exception as err:
                self.log.warn("Inserting dimension data failed for converter %s; continuing: %s",
                              converter, err)

        # Insert dimensions that are potentially shared by all Gen2
        # repositories (and are hence managed directly by the Task, rather
        # than a converter instance).
        # This also finishes setting up the (shared) converter.subsets object
        # that is used to filter data IDs for config.relatedOnly.
        self.registerUsedSkyMaps(rootConverter.subset)
        self.registerUsedSkyPix(rootConverter.subset)

        # Look for datasets, generally by scanning the filesystem.
        # This requires dimensions to have already been inserted so we can use
        # dimension information to identify related datasets.
        for converter in converters:
            converter.findDatasets()

        # Expand data IDs.
        for converter in converters:
            converter.expandDataIds()

        # Actually ingest datasets.
        for converter in converters:
            converter.ingest()