# This file is part of obs_base.
#
# Developed for the LSST Data Management System.
# This product includes software developed by the LSST Project
# (https://www.lsst.org).
# See the COPYRIGHT file at the top-level directory of this distribution
# for details of code ownership.
#
# This program is free software: you can redistribute it and/or modify
# it under the terms of the GNU General Public License as published by
# the Free Software Foundation, either version 3 of the License, or
# (at your option) any later version.
#
# This program is distributed in the hope that it will be useful,
# but WITHOUT ANY WARRANTY; without even the implied warranty of
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
# GNU General Public License for more details.
#
# You should have received a copy of the GNU General Public License
# along with this program. If not, see <http://www.gnu.org/licenses/>.
from __future__ import annotations

__all__ = ["ConvertRepoConfig", "ConvertRepoTask", "ConvertRepoSkyMapConfig", "Rerun"]

import os
import fnmatch
from dataclasses import dataclass
from typing import Iterable, Optional, List, Dict

from lsst.utils import doImport
from lsst.daf.butler import (
    Butler as Butler3,
    CollectionType,
    SkyPixDimension
)
from lsst.pex.config import Config, ConfigurableField, ConfigDictField, DictField, ListField, Field
from lsst.pipe.base import Task
from lsst.skymap import skyMapRegistry, BaseSkyMap

from ..ingest import RawIngestTask
from .repoConverter import ConversionSubset
from .rootRepoConverter import RootRepoConverter
from .calibRepoConverter import CalibRepoConverter
from .standardRepoConverter import StandardRepoConverter


@dataclass
class ConfiguredSkyMap:
    """Struct containing information about a skymap that may appear in a Gen2
    repository.
    """

    name: str
    """Name of the skymap used in Gen3 data IDs.
    """

    sha1: bytes
    """Hash computed by `BaseSkyMap.getSha1`.
    """

    instance: BaseSkyMap
    """The skymap instance itself.
    """

    used: bool = False
    """Whether this skymap has been found in at least one repository being
    converted.
    """


@dataclass
class Rerun:
    """Specification for a Gen2 processing-output repository to convert.
    """

    path: str
    """Absolute or relative (to the root repository) path to the Gen2
    repository (`str`).
    """

    runName: str
    """Name of the `~lsst.daf.butler.CollectionType.RUN` collection datasets
    will be inserted into (`str`).
    """

    chainName: Optional[str]
    """Name of a `~lsst.daf.butler.CollectionType.CHAINED` collection that will
    combine this repository's datasets with those of its parent repositories
    (`str`, optional).
    """

    parents: List[str]
    """Collection names associated with parent repositories, used to define the
    chained collection (`list` [ `str` ]).

    Ignored if `chainName` is `None`. Runs used in the root repo are
    automatically included.
    """
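# A minimal illustrative sketch of a `Rerun` specification; the path and
# collection names below are assumptions for the example, not values used by
# this module:
#
#     rerun = Rerun(
#         path="rerun/coadd",              # Gen2 rerun, relative to the root repo
#         runName="shared/coadd",          # RUN collection to insert datasets into
#         chainName="shared/coadd/chain",  # optional CHAINED collection
#         parents=["shared/calib"],        # parent collections added to the chain
#     )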

class ConvertRepoSkyMapConfig(Config):
    """Sub-config used to hold the parameters of a SkyMap.

    Notes
    -----
    This config only needs to exist because we can't put a
    `~lsst.pex.config.RegistryField` directly inside a
    `~lsst.pex.config.ConfigDictField`.

    It needs to have its only field named "skyMap" for compatibility with the
    configuration of `lsst.pipe.tasks.MakeSkyMapTask`, which we want so we can
    use one config file in an obs package to configure both.

    This name leads to unfortunate repetition with the field named
    "skymap" that holds it - "skyMap[name].skyMap" - but that seems
    unavoidable.
    """
    skyMap = skyMapRegistry.makeField(
        doc="Type and parameters for the SkyMap itself.",
        default="dodeca",
    )
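# An illustrative sketch of declaring a skymap entry, e.g. in an obs package
# config override file, where ``config`` is a `ConvertRepoConfig` (defined
# below). The entry name "hypothetical_skymap" and the "discrete" skymap
# parameters are assumptions for the example:
#
#     config.skyMaps["hypothetical_skymap"] = ConvertRepoSkyMapConfig()
#     config.skyMaps["hypothetical_skymap"].skyMap.name = "discrete"
#     config.skyMaps["hypothetical_skymap"].skyMap["discrete"].raList = [150.0]
#     config.skyMaps["hypothetical_skymap"].skyMap["discrete"].decList = [2.2]
#     config.skyMaps["hypothetical_skymap"].skyMap["discrete"].radiusList = [2.0]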

class ConvertRepoConfig(Config):
    raws = ConfigurableField(
        "Configuration for subtask responsible for ingesting raws and adding "
        "visit and exposure dimension entries.",
        target=RawIngestTask,
    )
    skyMaps = ConfigDictField(
        "Mapping from Gen3 skymap name to the parameters used to construct a "
        "BaseSkyMap instance. This will be used to associate names with "
        "existing skymaps found in the Gen2 repo.",
        keytype=str,
        itemtype=ConvertRepoSkyMapConfig,
        default={}
    )
    rootSkyMapName = Field(
        "Name of a Gen3 skymap (an entry in ``self.skyMaps``) to assume for "
        "datasets in the root repository when no SkyMap is found there.",
        dtype=str,
        optional=True,
        default=None,
    )
    runs = DictField(
        "A mapping from dataset type name to the RUN collection they should "
        "be inserted into. This must include all datasets that can be found "
        "in the root repository; other repositories will use per-repository "
        "runs.",
        keytype=str,
        itemtype=str,
        default={
            "deepCoadd_skyMap": "skymaps",
            "brightObjectMask": "masks",
        }
    )
    storageClasses = DictField(
        "Mapping from dataset type name or Gen2 policy entry (e.g. 'python' "
        "or 'persistable') to the Gen3 StorageClass name.",
        keytype=str,
        itemtype=str,
        default={
            "bias": "ExposureF",
            "dark": "ExposureF",
            "flat": "ExposureF",
            "defects": "Defects",
            "BaseSkyMap": "SkyMap",
            "BaseCatalog": "Catalog",
            "BackgroundList": "Background",
            "raw": "Exposure",
            "MultilevelParquetTable": "DataFrame",
            "ParquetTable": "DataFrame",
            "SkyWcs": "Wcs",
        }
    )
    formatterClasses = DictField(
        "Mapping from dataset type name to formatter class. "
        "By default these are derived from the formatters listed in the"
        " Gen3 datastore configuration.",
        keytype=str,
        itemtype=str,
        default={}
    )
    targetHandlerClasses = DictField(
        "Mapping from dataset type name to target handler class.",
        keytype=str,
        itemtype=str,
        default={}
    )
    doRegisterInstrument = Field(
        "If True (default), add dimension records for the Instrument and its "
        "filters and detectors to the registry instead of assuming they are "
        "already present.",
        dtype=bool,
        default=True,
    )
    doWriteCuratedCalibrations = Field(
        "If True (default), ingest human-curated calibrations directly via "
        "the Instrument interface. Note that these calibrations are never "
        "converted from Gen2 repositories.",
        dtype=bool,
        default=True,
    )
    refCats = ListField(
        "The names of reference catalogs (subdirectories under ref_cats) to "
        "be converted.",
        dtype=str,
        default=[]
    )
    fileIgnorePatterns = ListField(
        "Filename globs that should be ignored instead of being treated as "
        "datasets.",
        dtype=str,
        default=["README.txt", "*~?", "butler.yaml", "gen3.sqlite3",
                 "registry.sqlite3", "calibRegistry.sqlite3", "_mapper",
                 "_parent", "repositoryCfg.yaml"]
    )
    rawDatasetType = Field(
        "Gen2 dataset type to use for raw data.",
        dtype=str,
        default="raw",
    )
    datasetIncludePatterns = ListField(
        "Glob-style patterns for dataset type names that should be converted.",
        dtype=str,
        default=["*"]
    )
    datasetIgnorePatterns = ListField(
        "Glob-style patterns for dataset type names that should not be "
        "converted despite matching a pattern in datasetIncludePatterns.",
        dtype=str,
        default=[]
    )
    ccdKey = Field(
        "Key used for the Gen2 equivalent of 'detector' in data IDs.",
        dtype=str,
        default="ccd",
    )
    relatedOnly = Field(
        "If True, only convert datasets that are related to the ingested "
        "visits. Ignored unless a list of visits is passed to run().",
        dtype=bool,
        default=False,
    )
    curatedCalibrations = ListField(
        "Dataset types that are handled by `Instrument.writeCuratedCalibrations()` "
        "and thus should not be converted using the standard calibration "
        "conversion system.",
        dtype=str,
        default=["camera",
                 "transmission_sensor",
                 "transmission_filter",
                 "transmission_optics",
                 "transmission_atmosphere",
                 "bfKernel"]
    )

    @property
    def transfer(self):
        return self.raws.transfer

    @transfer.setter
    def transfer(self, value):
        self.raws.transfer = value

    @property
    def instrument(self):
        return self.raws.instrument

    @instrument.setter
    def instrument(self, value):
        self.raws.instrument = value

    def setDefaults(self):
        self.transfer = None

    # TODO: check that there are no collection overrides for curated
    # calibrations, since we don't have a good way to utilize them.
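# An illustrative sketch of overriding this config from an obs package or a
# driver script; the instrument class path, transfer mode, dataset type names,
# reference catalog name, and collection names are assumptions for the
# example, not values defined in this module:
#
#     config = ConvertRepoConfig()
#     config.instrument = "lsst.obs.hypothetical.HypotheticalCam"
#     config.transfer = "symlink"
#     config.runs["brightObjectMask"] = "masks"
#     config.datasetIgnorePatterns.append("*_camera")
#     config.refCats.append("gaia_dr2")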

class ConvertRepoTask(Task):
    """A task that converts one or more related Gen2 data repositories to a
    single Gen3 data repository (with multiple collections).

    Parameters
    ----------
    config : `ConvertRepoConfig`
        Configuration for this task.
    butler3 : `lsst.daf.butler.Butler`
        Gen3 Butler instance that represents the data repository datasets will
        be ingested into. The collection and/or run associated with this
        Butler will be ignored in favor of collections/runs passed via config
        or to `run`.
    kwds
        Other keyword arguments are forwarded to the `Task` constructor.

    Notes
    -----
    Most of the work of converting repositories is delegated to instances of
    the `RepoConverter` hierarchy. The `ConvertRepoTask` instance itself holds
    only state that is relevant for all Gen2 repositories being ingested, while
    each `RepoConverter` instance holds only state relevant for the conversion
    of a single Gen2 repository. Both the task and the `RepoConverter`
    instances are single use; `ConvertRepoTask.run` and most `RepoConverter`
    methods may only be called once on a particular instance.
    """

    ConfigClass = ConvertRepoConfig

    _DefaultName = "convertRepo"

    def __init__(self, config=None, *, butler3: Butler3, **kwds):
        config.validate()  # Not a CmdlineTask nor PipelineTask, so have to validate the config here.
        super().__init__(config, **kwds)
        self.butler3 = butler3
        self.registry = self.butler3.registry
        self.universe = self.registry.dimensions
        if self.isDatasetTypeIncluded("raw"):
            self.makeSubtask("raws", butler=butler3)
            self.instrument = self.raws.instrument
        else:
            self.raws = None
            self.instrument = doImport(self.config.instrument)()
        self._configuredSkyMapsBySha1 = {}
        self._configuredSkyMapsByName = {}
        for name, config in self.config.skyMaps.items():
            instance = config.skyMap.apply()
            self._populateSkyMapDicts(name, instance)
        self._usedSkyPix = set()

    def _populateSkyMapDicts(self, name, instance):
        struct = ConfiguredSkyMap(name=name, sha1=instance.getSha1(), instance=instance)
        self._configuredSkyMapsBySha1[struct.sha1] = struct
        self._configuredSkyMapsByName[struct.name] = struct

    def isDatasetTypeIncluded(self, datasetTypeName: str):
        """Return `True` if configuration indicates that the given dataset
        type should be converted.

        This method is intended to be called primarily by the
        `RepoConverter` instances used internally by the task.

        Parameters
        ----------
        datasetTypeName : `str`
            Name of the dataset type.

        Returns
        -------
        included : `bool`
            Whether the dataset should be included in the conversion.
        """
        return (
            any(fnmatch.fnmatchcase(datasetTypeName, pattern)
                for pattern in self.config.datasetIncludePatterns)
            and not any(fnmatch.fnmatchcase(datasetTypeName, pattern)
                        for pattern in self.config.datasetIgnorePatterns)
        )
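    # Illustrative sketch of how the include/ignore patterns interact, given a
    # task constructed with this config (the pattern values below are
    # assumptions for the example, not defaults):
    #
    #     config.datasetIncludePatterns = ["*"]
    #     config.datasetIgnorePatterns = ["raw", "*_camera"]
    #     task.isDatasetTypeIncluded("calexp")  # True: matches "*", no ignore match
    #     task.isDatasetTypeIncluded("raw")     # False: explicitly ignored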

    def useSkyMap(self, skyMap: BaseSkyMap, skyMapName: str) -> str:
        """Indicate that a repository uses the given SkyMap.

        This method is intended to be called primarily by the
        `RepoConverter` instances used internally by the task.

        Parameters
        ----------
        skyMap : `lsst.skymap.BaseSkyMap`
            SkyMap instance being used, typically retrieved from a Gen2
            data repository.
        skyMapName : `str`
            The name of the Gen2 skymap, for error reporting.

        Returns
        -------
        name : `str`
            The name of the skymap in Gen3 data IDs.

        Raises
        ------
        LookupError
            Raised if the specified skymap cannot be found.
        """
        sha1 = skyMap.getSha1()
        if sha1 not in self._configuredSkyMapsBySha1:
            self._populateSkyMapDicts(skyMapName, skyMap)
        try:
            struct = self._configuredSkyMapsBySha1[sha1]
        except KeyError as err:
            msg = f"SkyMap '{skyMapName}' with sha1={sha1} not included in configuration."
            raise LookupError(msg) from err
        struct.used = True
        return struct.name

    def registerUsedSkyMaps(self, subset: Optional[ConversionSubset]):
        """Register all skymaps that have been marked as used.

        This method is intended to be called primarily by the
        `RepoConverter` instances used internally by the task.

        Parameters
        ----------
        subset : `ConversionSubset`, optional
            Object that will be used to filter converted datasets by data ID.
            If given, it will be updated with the tracts of each used skymap
            that overlap the visits in the subset.
        """
        for struct in self._configuredSkyMapsBySha1.values():
            if struct.used:
                struct.instance.register(struct.name, self.registry)
                if subset is not None and self.config.relatedOnly:
                    subset.addSkyMap(self.registry, struct.name)

    def useSkyPix(self, dimension: SkyPixDimension):
        """Indicate that a repository uses the given SkyPix dimension.

        This method is intended to be called primarily by the
        `RepoConverter` instances used internally by the task.

        Parameters
        ----------
        dimension : `lsst.daf.butler.SkyPixDimension`
            Dimension representing a pixelization of the sky.
        """
        self._usedSkyPix.add(dimension)

    def registerUsedSkyPix(self, subset: Optional[ConversionSubset]):
        """Register all skypix dimensions that have been marked as used.

        This method is intended to be called primarily by the
        `RepoConverter` instances used internally by the task.

        Parameters
        ----------
        subset : `ConversionSubset`, optional
            Object that will be used to filter converted datasets by data ID.
            If given, it will be updated with the pixelization IDs that
            overlap the visits in the subset.
        """
        if subset is not None and self.config.relatedOnly:
            for dimension in self._usedSkyPix:
                subset.addSkyPix(self.registry, dimension)

    def run(self, root: str, *,
            calibs: Optional[Dict[str, str]] = None,
            reruns: List[Rerun],
            visits: Optional[Iterable[int]] = None):
449 """Convert a group of related data repositories. 

450 

451 Parameters 

452 ---------- 

453 root : `str` 

454 Complete path to the root Gen2 data repository. This should be 

455 a data repository that includes a Gen2 registry and any raw files 

456 and/or reference catalogs. 

457 calibs : `dict` 

458 Dictionary mapping calibration repository path to the 

459 `~lsst.daf.butler.CollectionType.RUN` collection that converted 

460 datasets within it should be inserted into. 

461 reruns : `list` of `Rerun` 

462 Specifications for rerun (processing output) collections to 

463 convert. 

464 visits : iterable of `int`, optional 

465 The integer IDs of visits to convert. If not provided, all visits 

466 in the Gen2 root repository will be converted. 

467 """ 

468 if calibs is None: 

469 calibs = {} 

470 if visits is not None: 

471 subset = ConversionSubset(instrument=self.instrument.getName(), visits=frozenset(visits)) 

472 else: 

473 if self.config.relatedOnly: 

474 self.log.warn("config.relatedOnly is True but all visits are being ingested; " 

475 "no filtering will be done.") 

476 subset = None 

477 

478 # We can't wrap database writes sanely in transactions (yet) because we 

479 # keep initializing new Butler instances just so we can write into new 

480 # runs/collections, and transactions are managed at the Butler level. 

481 # DM-21246 should let us fix this, assuming we actually want to keep 

482 # the transaction open that long. 

483 if self.config.doRegisterInstrument: 

484 # Allow registration to fail on the assumption that this means 

485 # we are reusing a butler 

486 try: 

487 self.instrument.register(self.registry) 

488 except Exception: 

489 pass 

490 

491 # Make and prep converters for all Gen2 repos. This should not modify 

492 # the Registry database or filesystem at all, though it may query it. 

493 # The prep() calls here will be some of the slowest ones, because 

494 # that's when we walk the filesystem. 

495 converters = [] 

496 rootConverter = RootRepoConverter(task=self, root=root, subset=subset) 

497 rootConverter.prep() 

498 converters.append(rootConverter) 

499 

500 for calibRoot, run in calibs.items(): 

501 if not os.path.isabs(calibRoot): 

502 calibRoot = os.path.join(rootConverter.root, calibRoot) 

503 converter = CalibRepoConverter(task=self, root=calibRoot, run=run, 

504 mapper=rootConverter.mapper, 

505 subset=rootConverter.subset) 

506 converter.prep() 

507 converters.append(converter) 

508 

509 for spec in reruns: 

510 runRoot = spec.path 

511 if not os.path.isabs(runRoot): 

512 runRoot = os.path.join(rootConverter.root, runRoot) 

513 converter = StandardRepoConverter(task=self, root=runRoot, run=spec.runName, 

514 subset=rootConverter.subset) 

515 converter.prep() 

516 converters.append(converter) 

517 

518 # Actual database writes start here. We can't wrap these sanely in 

519 # transactions (yet) because we keep initializing new Butler instances 

520 # just so we can write into new runs/collections, and transactions 

521 # are managed at the Butler level (DM-21246 should let us fix this). 

522 

523 # Insert dimensions needed by any converters. These are only the 

524 # dimensions that a converter expects to be uniquely derived from the 

525 # Gen2 repository it is reponsible for - e.g. visits, exposures, and 

526 # calibration_labels. 

527 # 

528 # Note that we do not try to filter dimensions down to just those 

529 # related to the given visits, even if config.relatedOnly is True; we 

530 # need them in the Gen3 repo in order to be able to know which datasets 

531 # to convert, because Gen2 alone doesn't know enough about the 

532 # relationships between data IDs. 

533 for converter in converters: 

534 try: 

535 converter.insertDimensionData() 

536 except Exception: 

537 pass 

538 

539 # Insert dimensions that are potentially shared by all Gen2 

540 # repositories (and are hence managed directly by the Task, rather 

541 # than a converter instance). 

542 # This also finishes setting up the (shared) converter.subsets object 

543 # that is used to filter data IDs for config.relatedOnly. 

544 self.registerUsedSkyMaps(rootConverter.subset) 

545 self.registerUsedSkyPix(rootConverter.subset) 

546 

547 # Look for datasets, generally by scanning the filesystem. 

548 # This requires dimensions to have already been inserted so we can use 

549 # dimension information to identify related datasets. 

550 for converter in converters: 

551 converter.findDatasets() 

552 

553 # Expand data IDs. 

554 for converter in converters: 

555 converter.expandDataIds() 

556 

557 # Actually ingest datasets. 

558 for converter in converters: 

559 converter.ingest() 

560 

561 # Add chained collections for reruns. 

562 for spec in reruns: 

563 if spec.chainName is not None: 

564 self.butler3.registry.registerCollection(spec.chainName, type=CollectionType.CHAINED) 

565 chain = [spec.runName] 

566 chain.extend(spec.parents) 

567 chain.extend(rootConverter.getCollectionChain()) 

568 self.log.info("Defining %s from chain %s.", spec.chainName, chain) 

569 self.butler3.registry.setCollectionChain(spec.chainName, chain)
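# An illustrative end-to-end sketch of driving this task directly; the
# instrument class path, repository paths, visit IDs, and collection names
# below are assumptions for the example, not values defined in this module:
#
#     from lsst.daf.butler import Butler as Butler3
#
#     butler3 = Butler3("/path/to/gen3/repo", run="raw/hypothetical")
#     config = ConvertRepoConfig()
#     config.instrument = "lsst.obs.hypothetical.HypotheticalCam"
#     task = ConvertRepoTask(config=config, butler3=butler3)
#     task.run(
#         root="/path/to/gen2/root",
#         reruns=[Rerun(path="rerun/coadd", runName="shared/coadd",
#                       chainName="shared/coadd/chain", parents=[])],
#         calibs={"CALIB": "calib/hypothetical"},
#         visits=[903334, 903336],
#     )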