Hide keyboard shortcuts

Hot-keys on this page

r m x p   toggle line displays

j k   next/prev highlighted chunk

0   (zero) top of page

1   (one) first highlighted chunk

1# This file is part of obs_base. 

2# 

3# Developed for the LSST Data Management System. 

4# This product includes software developed by the LSST Project 

5# (https://www.lsst.org). 

6# See the COPYRIGHT file at the top-level directory of this distribution 

7# for details of code ownership. 

8# 

9# This program is free software: you can redistribute it and/or modify 

10# it under the terms of the GNU General Public License as published by 

11# the Free Software Foundation, either version 3 of the License, or 

12# (at your option) any later version. 

13# 

14# This program is distributed in the hope that it will be useful, 

15# but WITHOUT ANY WARRANTY; without even the implied warranty of 

16# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the 

17# GNU General Public License for more details. 

18# 

19# You should have received a copy of the GNU General Public License 

20# along with this program. If not, see <http://www.gnu.org/licenses/>. 

21from __future__ import annotations 

22 

23__all__ = ["ConvertRepoConfig", "ConvertRepoTask", "ConvertRepoSkyMapConfig", "Rerun"] 

24 

25import os 

26import fnmatch 

27from dataclasses import dataclass 

28from typing import Iterable, Optional, List, Dict 

29 

30from lsst.utils import doImport 

31from lsst.daf.butler import ( 

32 Butler as Butler3, 

33 CollectionType, 

34 SkyPixDimension 

35) 

36from lsst.pex.config import Config, ConfigurableField, ConfigDictField, DictField, ListField, Field 

37from lsst.pipe.base import Task 

38from lsst.skymap import skyMapRegistry, BaseSkyMap 

39 

40from ..ingest import RawIngestTask 

41from ..defineVisits import DefineVisitsTask 

42from .repoConverter import ConversionSubset 

43from .rootRepoConverter import RootRepoConverter 

44from .calibRepoConverter import CalibRepoConverter 

45from .standardRepoConverter import StandardRepoConverter 

46 

47 

@dataclass
class ConfiguredSkyMap:
    """Struct containing information about a skymap that may appear in a Gen2
    repository.
    """

    name: str
    """Name of the skymap used in Gen3 data IDs (`str`).
    """

    sha1: bytes
    """Hash computed by `BaseSkyMap.getSha1` (`bytes`).
    """

    instance: BaseSkyMap
    """The skymap object itself (`lsst.skymap.BaseSkyMap`).
    """

    used: bool = False
    """Whether this skymap has been found in at least one repository being
    converted (`bool`).
    """

70 

71 

@dataclass
class Rerun:
    """Description of one Gen2 processing-output ("rerun") repository that
    should be converted.
    """

    path: str
    """Location of the Gen2 repository (`str`).

    May be absolute, or relative to the root repository.
    """

    runName: str
    """The `~lsst.daf.butler.CollectionType.RUN` collection that receives the
    converted datasets (`str`).
    """

    chainName: Optional[str]
    """The `~lsst.daf.butler.CollectionType.CHAINED` collection that merges
    the datasets of this repository with those of its parents (`str`,
    optional).
    """

    parents: List[str]
    """Names of the collections belonging to parent repositories; these are
    the members of the chained collection (`list` [ `str` ]).

    Not used when `chainName` is `None`.  The runs used in the root repo do
    not need to be listed; they are added automatically.
    """

100 

101 

class ConvertRepoSkyMapConfig(Config):
    """Sub-config wrapping the type and parameters of a single SkyMap.

    Notes
    -----
    A `~lsst.pex.config.RegistryField` cannot be placed directly inside a
    `~lsst.pex.config.ConfigDictField`; this class exists only to work
    around that limitation.

    Its single field must be called "skyMap" so that it matches the
    configuration of `lsst.pipe.tasks.MakeSkyMapTask`; that way one config
    file in an obs package can configure both.

    The price is some repetition with the name of the dict field that holds
    these configs (``skyMaps`` in `ConvertRepoConfig`), i.e.
    ``skyMaps[name].skyMap``, but that seems unavoidable.
    """
    skyMap = skyMapRegistry.makeField(default="dodeca", doc="Type and parameters for the SkyMap itself.")

123 

124 

class ConvertRepoConfig(Config):
    """Configuration for `ConvertRepoTask`.

    Notes
    -----
    ``instrument`` has no default and is not optional, so it must always be
    set before the config is validated.
    """

    raws = ConfigurableField(
        "Configuration for subtask responsible for ingesting raws and adding "
        "exposure dimension entries.",
        target=RawIngestTask,
    )
    defineVisits = ConfigurableField(
        "Configuration for the subtask responsible for defining visits from "
        "exposures.",
        target=DefineVisitsTask,
    )
    skyMaps = ConfigDictField(
        "Mapping from Gen3 skymap name to the parameters used to construct a "
        "BaseSkyMap instance. This will be used to associate names with "
        "existing skymaps found in the Gen2 repo.",
        keytype=str,
        itemtype=ConvertRepoSkyMapConfig,
        default={}
    )
    rootSkyMapName = Field(
        "Name of a Gen3 skymap (an entry in ``self.skyMaps``) to assume for "
        "datasets in the root repository when no SkyMap is found there. ",
        dtype=str,
        optional=True,
        default=None,
    )
    runs = DictField(
        "A mapping from dataset type name to the RUN collection they should "
        "be inserted into. This must include all datasets that can be found "
        "in the root repository; other repositories will use per-repository "
        "runs.",
        keytype=str,
        itemtype=str,
        default={
            "deepCoadd_skyMap": "skymaps",
            "brightObjectMask": "masks",
        }
    )
    storageClasses = DictField(
        "Mapping from dataset type name or Gen2 policy entry (e.g. 'python' "
        "or 'persistable') to the Gen3 StorageClass name.",
        keytype=str,
        itemtype=str,
        default={
            "bias": "ExposureF",
            "dark": "ExposureF",
            "flat": "ExposureF",
            "defects": "Defects",
            "crosstalk": "CrosstalkCalib",
            "BaseSkyMap": "SkyMap",
            "BaseCatalog": "Catalog",
            "BackgroundList": "Background",
            "raw": "Exposure",
            "MultilevelParquetTable": "DataFrame",
            "ParquetTable": "DataFrame",
            "SkyWcs": "Wcs",
        }
    )
    formatterClasses = DictField(
        "Mapping from dataset type name to formatter class. "
        "By default these are derived from the formatters listed in the"
        " Gen3 datastore configuration.",
        keytype=str,
        itemtype=str,
        default={}
    )
    targetHandlerClasses = DictField(
        "Mapping from dataset type name to target handler class.",
        keytype=str,
        itemtype=str,
        default={}
    )
    doRegisterInstrument = Field(
        "If True (default), add dimension records for the Instrument and its "
        "filters and detectors to the registry instead of assuming they are "
        "already present.",
        dtype=bool,
        default=True,
    )
    doWriteCuratedCalibrations = Field(
        "If True (default), ingest human-curated calibrations directly via "
        "the Instrument interface. Note that these calibrations are never "
        "converted from Gen2 repositories.",
        dtype=bool,
        default=True,
    )
    refCats = ListField(
        "The names of reference catalogs (subdirectories under ref_cats) to "
        "be converted",
        dtype=str,
        default=[]
    )
    # NOTE(review): the "*~?" glob only matches names with exactly one
    # character after the "~"; confirm this is intended rather than "*~".
    fileIgnorePatterns = ListField(
        "Filename globs that should be ignored instead of being treated as "
        "datasets.",
        dtype=str,
        default=["README.txt", "*~?", "butler.yaml", "gen3.sqlite3",
                 "registry.sqlite3", "calibRegistry.sqlite3", "_mapper",
                 "_parent", "repositoryCfg.yaml"]
    )
    rawDatasetType = Field(
        "Gen2 dataset type to use for raw data.",
        dtype=str,
        default="raw",
    )
    datasetIncludePatterns = ListField(
        "Glob-style patterns for dataset type names that should be converted.",
        dtype=str,
        default=["*"]
    )
    datasetIgnorePatterns = ListField(
        "Glob-style patterns for dataset type names that should not be "
        "converted despite matching a pattern in datasetIncludePatterns.",
        dtype=str,
        default=[]
    )
    ccdKey = Field(
        "Key used for the Gen2 equivalent of 'detector' in data IDs.",
        dtype=str,
        default="ccd",
    )
    # The documentation previously claimed True was the default, which
    # contradicted ``default=False``; the default is unchanged and the text
    # now matches it.
    relatedOnly = Field(
        "If True, only convert datasets that are related to the "
        "ingested visits (default is False). Ignored unless a list of "
        "visits is passed to run().",
        dtype=bool,
        default=False,
    )
    curatedCalibrations = ListField(
        "Dataset types that are handled by `Instrument.writeCuratedCalibrations()` "
        "and thus should not be converted using the standard calibration "
        "conversion system.",
        dtype=str,
        default=["camera",
                 "transmission_sensor",
                 "transmission_filter",
                 "transmission_optics",
                 "transmission_atmosphere",
                 "bfKernel"]
    )
    instrument = Field(
        doc=("Fully-qualified Python name of the `Instrument` subclass for "
             "all converted datasets."),
        dtype=str,
        optional=False,
        default=None,
    )

    @property
    def transfer(self):
        """Alias for ``self.raws.transfer``, the ``transfer`` option of the
        raw-ingest subtask configuration.
        """
        return self.raws.transfer

    @transfer.setter
    def transfer(self, value):
        self.raws.transfer = value

    def setDefaults(self):
        """Set ``transfer`` (and hence ``raws.transfer``) to `None`."""
        self.transfer = None

    # TODO: check that there are no collection overrides for curated
    # calibrations, since we don't have a good way to utilize them.

286 

287 

class ConvertRepoTask(Task):
    """A task that converts one or more related Gen2 data repositories to a
    single Gen3 data repository (with multiple collections).

    Parameters
    ----------
    config: `ConvertRepoConfig`
        Configuration for this task.
    butler3: `lsst.daf.butler.Butler`
        A writeable Gen3 Butler instance that represents the data repository
        that datasets will be ingested into.  If the 'raw' dataset is
        configured to be included in the conversion, ``butler3.run`` should be
        set to the name of the collection raws should be ingested into, and
        ``butler3.collections`` should include a calibration collection from
        which the ``camera`` dataset can be loaded, unless a calibration repo
        is converted and ``doWriteCuratedCalibrations`` is `True`.
    **kwargs
        Other keyword arguments are forwarded to the `Task` constructor.

    Notes
    -----
    Most of the work of converting repositories is delegated to instances of
    the `RepoConverter` hierarchy.  The `ConvertRepoTask` instance itself holds
    only state that is relevant for all Gen2 repositories being ingested, while
    each `RepoConverter` instance holds only state relevant for the conversion
    of a single Gen2 repository.  Both the task and the `RepoConverter`
    instances are single use; `ConvertRepoTask.run` and most `RepoConverter`
    methods may only be called once on a particular instance.
    """

    ConfigClass = ConvertRepoConfig

    _DefaultName = "convertRepo"

    def __init__(self, config=None, *, butler3: Butler3, **kwargs):
        # The signature allows ``config=None``; build a default config in
        # that case so validation reports a configuration problem instead of
        # failing with an AttributeError on `None`.
        if config is None:
            config = self.ConfigClass()
        # Not a CmdlineTask nor PipelineTask, so have to validate the config
        # here.
        config.validate()
        super().__init__(config, **kwargs)
        self.butler3 = butler3
        self.registry = self.butler3.registry
        self.universe = self.registry.dimensions
        if self.isDatasetTypeIncluded("raw"):
            self.makeSubtask("raws", butler=butler3)
            self.makeSubtask("defineVisits", butler=butler3)
        else:
            self.raws = None
            self.defineVisits = None
        self.instrument = doImport(self.config.instrument)()
        self._configuredSkyMapsBySha1 = {}
        self._configuredSkyMapsByName = {}
        # Use a distinct loop variable so the ``config`` argument is not
        # shadowed.
        for name, skyMapConfig in self.config.skyMaps.items():
            instance = skyMapConfig.skyMap.apply()
            self._populateSkyMapDicts(name, instance)
        self._usedSkyPix = set()
        self.translatorFactory = self.instrument.makeDataIdTranslatorFactory()
        self.translatorFactory.log = self.log.getChild("translators")

    def _populateSkyMapDicts(self, name, instance):
        """Record a skymap in both the by-hash and by-name lookup tables.

        Parameters
        ----------
        name : `str`
            Name to associate with the skymap.
        instance : `lsst.skymap.BaseSkyMap`
            The skymap object; its ``getSha1()`` result is used as the key of
            the by-hash table.
        """
        struct = ConfiguredSkyMap(name=name, sha1=instance.getSha1(), instance=instance)
        self._configuredSkyMapsBySha1[struct.sha1] = struct
        self._configuredSkyMapsByName[struct.name] = struct

    def isDatasetTypeIncluded(self, datasetTypeName: str) -> bool:
        """Return `True` if configuration indicates that the given dataset type
        should be converted.

        This method is intended to be called primarily by the
        `RepoConverter` instances used internally by the task.

        Parameters
        ----------
        datasetTypeName: str
            Name of the dataset type.

        Returns
        -------
        included : `bool`
            Whether the dataset should be included in the conversion.
        """
        return (
            any(fnmatch.fnmatchcase(datasetTypeName, pattern)
                for pattern in self.config.datasetIncludePatterns)
            and not any(fnmatch.fnmatchcase(datasetTypeName, pattern)
                        for pattern in self.config.datasetIgnorePatterns)
        )

    def useSkyMap(self, skyMap: BaseSkyMap, skyMapName: str) -> str:
        """Indicate that a repository uses the given SkyMap.

        This method is intended to be called primarily by the
        `RepoConverter` instances used internally by the task.

        Parameters
        ----------
        skyMap : `lsst.skymap.BaseSkyMap`
            SkyMap instance being used, typically retrieved from a Gen2
            data repository.
        skyMapName : `str`
            The name of the gen2 skymap.  It becomes the Gen3 name when no
            skymap with the same hash is already known to the task.

        Returns
        -------
        name : `str`
            The name of the skymap in Gen3 data IDs.

        Notes
        -----
        A skymap whose hash is not already known (from configuration or an
        earlier call) is recorded under ``skyMapName`` rather than rejected,
        so this method does not raise for unknown skymaps.
        """
        sha1 = skyMap.getSha1()
        if sha1 not in self._configuredSkyMapsBySha1:
            self._populateSkyMapDicts(skyMapName, skyMap)
        # The entry is guaranteed to exist at this point, so no error
        # handling is needed around the lookup.
        struct = self._configuredSkyMapsBySha1[sha1]
        struct.used = True
        return struct.name

    def registerUsedSkyMaps(self, subset: Optional[ConversionSubset]):
        """Register all skymaps that have been marked as used.

        This method is intended to be called primarily by the
        `RepoConverter` instances used internally by the task.

        Parameters
        ----------
        subset : `ConversionSubset`, optional
            Object that will be used to filter converted datasets by data ID.
            If given (and ``config.relatedOnly`` is `True`), it will be
            updated with the tracts of this skymap that overlap the visits in
            the subset.
        """
        for struct in self._configuredSkyMapsBySha1.values():
            if struct.used:
                struct.instance.register(struct.name, self.registry)
                if subset is not None and self.config.relatedOnly:
                    subset.addSkyMap(self.registry, struct.name)

    def useSkyPix(self, dimension: SkyPixDimension):
        """Indicate that a repository uses the given SkyPix dimension.

        This method is intended to be called primarily by the
        `RepoConverter` instances used internally by the task.

        Parameters
        ----------
        dimension : `lsst.daf.butler.SkyPixDimension`
            Dimension representing a pixelization of the sky.
        """
        self._usedSkyPix.add(dimension)

    def registerUsedSkyPix(self, subset: Optional[ConversionSubset]):
        """Add all SkyPix dimensions that have been marked as used to the
        conversion subset.

        This method is intended to be called primarily by the
        `RepoConverter` instances used internally by the task.

        Parameters
        ----------
        subset : `ConversionSubset`, optional
            Object that will be used to filter converted datasets by data ID.
            If given (and ``config.relatedOnly`` is `True`), it will be
            updated with the pixelization IDs that overlap the visits in the
            subset.
        """
        if subset is not None and self.config.relatedOnly:
            for dimension in self._usedSkyPix:
                subset.addSkyPix(self.registry, dimension)

    def run(self, root: str, *,
            calibs: Optional[Dict[str, str]] = None,
            reruns: List[Rerun],
            visits: Optional[Iterable[int]] = None):
        """Convert a group of related data repositories.

        Parameters
        ----------
        root : `str`
            Complete path to the root Gen2 data repository.  This should be
            a data repository that includes a Gen2 registry and any raw files
            and/or reference catalogs.
        calibs : `dict`, optional
            Dictionary mapping calibration repository path to the
            `~lsst.daf.butler.CollectionType.RUN` collection that converted
            datasets within it should be inserted into.  Relative paths are
            interpreted relative to the root repository.  `None` (default)
            is treated as an empty mapping.
        reruns : `list` of `Rerun`
            Specifications for rerun (processing output) collections to
            convert.  This keyword argument is required; pass an empty list
            if there are none.
        visits : iterable of `int`, optional
            The integer IDs of visits to convert.  If not provided, all visits
            in the Gen2 root repository will be converted.
        """
        if calibs is None:
            calibs = {}
        if visits is not None:
            subset = ConversionSubset(instrument=self.instrument.getName(), visits=frozenset(visits))
        else:
            if self.config.relatedOnly:
                self.log.warn("config.relatedOnly is True but all visits are being ingested; "
                              "no filtering will be done.")
            subset = None

        # Make converters for all Gen2 repos.
        converters = []
        rootConverter = RootRepoConverter(task=self, root=root, subset=subset)
        converters.append(rootConverter)
        for calibRoot, run in calibs.items():
            if not os.path.isabs(calibRoot):
                calibRoot = os.path.join(rootConverter.root, calibRoot)
            converter = CalibRepoConverter(task=self, root=calibRoot, run=run,
                                           mapper=rootConverter.mapper,
                                           subset=rootConverter.subset)
            converters.append(converter)
        for spec in reruns:
            runRoot = spec.path
            if not os.path.isabs(runRoot):
                runRoot = os.path.join(rootConverter.root, runRoot)
            converter = StandardRepoConverter(task=self, root=runRoot, run=spec.runName,
                                              subset=rootConverter.subset)
            converters.append(converter)

        # Register the instrument if we're configured to do so.
        if self.config.doRegisterInstrument:
            # Allow registration to fail on the assumption that this means
            # we are reusing a butler, but report the failure instead of
            # hiding it so that genuine problems remain diagnosable.
            try:
                self.instrument.register(self.registry)
            except Exception as err:
                self.log.warn("Instrument registration failed (%s); assuming the instrument "
                              "is already registered in this repository.", err)

        # Run raw ingest (does nothing if we weren't configured to convert the
        # 'raw' dataset type).
        rootConverter.runRawIngest()

        # Write curated calibrations to all calibration repositories.
        # Add new collections to the list of collections the butler was
        # initialized to pass to DefineVisitsTask, to deal with the (likely)
        # case the only 'camera' dataset in the repo will be one we're adding
        # here.
        if self.config.doWriteCuratedCalibrations:
            for run in calibs.values():
                butler3 = Butler3(butler=self.butler3, run=run)
                self.instrument.writeCuratedCalibrations(butler3)

        # Define visits (also does nothing if we weren't configured to convert
        # the 'raw' dataset type).
        rootConverter.runDefineVisits()

        # Walk Gen2 repos to find datasets to convert.
        for converter in converters:
            converter.prep()

        # Insert dimensions needed by any converters.  In practice this is just
        # calibration_labels right now, because exposures and visits (and
        # things related to them) are handled by RawIngestTask and
        # DefineVisitsTask earlier and skymaps are handled later.
        #
        # Note that we do not try to filter dimensions down to just those
        # related to the given visits, even if config.relatedOnly is True; we
        # need them in the Gen3 repo in order to be able to know which datasets
        # to convert, because Gen2 alone doesn't know enough about the
        # relationships between data IDs.
        for converter in converters:
            converter.insertDimensionData()

        # Insert dimensions that are potentially shared by all Gen2
        # repositories (and are hence managed directly by the Task, rather
        # than a converter instance).
        # This also finishes setting up the (shared) converter.subsets object
        # that is used to filter data IDs for config.relatedOnly.
        self.registerUsedSkyMaps(rootConverter.subset)
        self.registerUsedSkyPix(rootConverter.subset)

        # Look for datasets, generally by scanning the filesystem.
        # This requires dimensions to have already been inserted so we can use
        # dimension information to identify related datasets.
        for converter in converters:
            converter.findDatasets()

        # Expand data IDs.
        for converter in converters:
            converter.expandDataIds()

        # Actually ingest datasets.
        for converter in converters:
            converter.ingest()

        # Add chained collections for reruns.
        for spec in reruns:
            if spec.chainName is not None:
                self.butler3.registry.registerCollection(spec.chainName, type=CollectionType.CHAINED)
                # Chain order: this rerun's own run, then its declared
                # parents, then the collections of the root repository.
                chain = [spec.runName]
                chain.extend(spec.parents)
                chain.extend(rootConverter.getCollectionChain())
                self.log.info("Defining %s from chain %s.", spec.chainName, chain)
                self.butler3.registry.setCollectionChain(spec.chainName, chain)