Hide keyboard shortcuts

Hot-keys on this page

r m x p   toggle line displays

j k   next/prev highlighted chunk

0   (zero) top of page

1   (one) first highlighted chunk

1# This file is part of obs_base. 

2# 

3# Developed for the LSST Data Management System. 

4# This product includes software developed by the LSST Project 

5# (https://www.lsst.org). 

6# See the COPYRIGHT file at the top-level directory of this distribution 

7# for details of code ownership. 

8# 

9# This program is free software: you can redistribute it and/or modify 

10# it under the terms of the GNU General Public License as published by 

11# the Free Software Foundation, either version 3 of the License, or 

12# (at your option) any later version. 

13# 

14# This program is distributed in the hope that it will be useful, 

15# but WITHOUT ANY WARRANTY; without even the implied warranty of 

16# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the 

17# GNU General Public License for more details. 

18# 

19# You should have received a copy of the GNU General Public License 

20# along with this program. If not, see <http://www.gnu.org/licenses/>. 

21from __future__ import annotations 

22 

# Public names exported by this module; `ConfiguredSkyMap` is deliberately
# not listed here.
__all__ = [
    "ConvertRepoConfig",
    "ConvertRepoTask",
    "ConvertRepoSkyMapConfig",
    "Rerun",
]

24 

25import os 

26import fnmatch 

27from dataclasses import dataclass 

28from typing import Iterable, Optional, List, Dict 

29 

30from lsst.utils import doImport 

31from lsst.daf.butler import ( 

32 Butler as Butler3, 

33 CollectionType, 

34 SkyPixDimension 

35) 

36from lsst.pex.config import Config, ConfigurableField, ConfigDictField, DictField, ListField, Field 

37from lsst.pipe.base import Task 

38from lsst.skymap import skyMapRegistry, BaseSkyMap 

39 

40from ..ingest import RawIngestTask 

41from ..defineVisits import DefineVisitsTask 

42from .repoConverter import ConversionSubset 

43from .rootRepoConverter import RootRepoConverter 

44from .calibRepoConverter import CalibRepoConverter 

45from .standardRepoConverter import StandardRepoConverter 

46 

47 

@dataclass
class ConfiguredSkyMap:
    """Struct containing information about a skymap that may appear in a Gen2
    repository.
    """

    name: str
    """Name of the skymap used in Gen3 data IDs (`str`).
    """

    sha1: bytes
    """Hash computed by `BaseSkyMap.getSha1` (`bytes`).
    """

    instance: BaseSkyMap
    """The skymap object itself (`lsst.skymap.BaseSkyMap`).
    """

    used: bool = False
    """Whether this skymap has been found in at least one repository being
    converted (`bool`).
    """

70 

71 

@dataclass
class Rerun:
    """Description of one Gen2 processing-output ("rerun") repository that
    should be converted.
    """

    path: str
    """Location of the Gen2 repository (`str`).

    May be absolute, or relative to the root repository.
    """

    runName: str
    """The `~lsst.daf.butler.CollectionType.RUN` collection that receives the
    converted datasets (`str`).
    """

    chainName: Optional[str]
    """A `~lsst.daf.butler.CollectionType.CHAINED` collection that merges the
    datasets of this repository with those of its parents (`str`, optional).
    """

    parents: List[str]
    """Names of the collections belonging to parent repositories; these are
    the members of the chained collection (`list` [ `str` ]).

    Has no effect when `chainName` is `None`. Runs used in the root repo are
    automatically included.
    """

100 

101 

class ConvertRepoSkyMapConfig(Config):
    """Sub-config that holds the parameters of a single SkyMap.

    Notes
    -----
    The sole reason for this class is that a
    `~lsst.pex.config.RegistryField` cannot be placed directly inside a
    `~lsst.pex.config.ConfigDictField`.

    Its one field must be called "skyMap" so that it stays compatible with
    the configuration of `lsst.pipe.tasks.MakeSkyMapTask`; that lets a single
    config file in an obs package configure both.

    The price is some awkward repetition with the "skyMaps" field that
    holds instances of this class - "skyMaps[name].skyMap" - but that seems
    unavoidable.
    """
    skyMap = skyMapRegistry.makeField(
        default="dodeca",
        doc="Type and parameters for the SkyMap itself.",
    )

123 

124 

class ConvertRepoConfig(Config):
    """Configuration for `ConvertRepoTask`.

    Notes
    -----
    ``instrument`` has no usable default (``optional=False`` with a default
    of `None`), so it must be set explicitly before the config is used.
    """

    raws = ConfigurableField(
        "Configuration for subtask responsible for ingesting raws and adding "
        "exposure dimension entries.",
        target=RawIngestTask,
    )
    defineVisits = ConfigurableField(
        "Configuration for the subtask responsible for defining visits from "
        "exposures.",
        target=DefineVisitsTask,
    )
    skyMaps = ConfigDictField(
        "Mapping from Gen3 skymap name to the parameters used to construct a "
        "BaseSkyMap instance. This will be used to associate names with "
        "existing skymaps found in the Gen2 repo.",
        keytype=str,
        itemtype=ConvertRepoSkyMapConfig,
        default={}
    )
    rootSkyMapName = Field(
        "Name of a Gen3 skymap (an entry in ``self.skyMaps``) to assume for "
        "datasets in the root repository when no SkyMap is found there. ",
        dtype=str,
        optional=True,
        default=None,
    )
    runs = DictField(
        "A mapping from dataset type name to the RUN collection they should "
        "be inserted into. This must include all datasets that can be found "
        "in the root repository; other repositories will use per-repository "
        "runs.",
        keytype=str,
        itemtype=str,
        default={
            "deepCoadd_skyMap": "skymaps",
            "brightObjectMask": "masks",
        }
    )
    storageClasses = DictField(
        "Mapping from dataset type name or Gen2 policy entry (e.g. 'python' "
        "or 'persistable') to the Gen3 StorageClass name.",
        keytype=str,
        itemtype=str,
        default={
            "bias": "ExposureF",
            "dark": "ExposureF",
            "flat": "ExposureF",
            "defects": "Defects",
            "BaseSkyMap": "SkyMap",
            "BaseCatalog": "Catalog",
            "BackgroundList": "Background",
            "raw": "Exposure",
            "MultilevelParquetTable": "DataFrame",
            "ParquetTable": "DataFrame",
            "SkyWcs": "Wcs",
        }
    )
    formatterClasses = DictField(
        "Mapping from dataset type name to formatter class. "
        "By default these are derived from the formatters listed in the"
        " Gen3 datastore configuration.",
        keytype=str,
        itemtype=str,
        default={}
    )
    targetHandlerClasses = DictField(
        "Mapping from dataset type name to target handler class.",
        keytype=str,
        itemtype=str,
        default={}
    )
    doRegisterInstrument = Field(
        "If True (default), add dimension records for the Instrument and its "
        "filters and detectors to the registry instead of assuming they are "
        "already present.",
        dtype=bool,
        default=True,
    )
    doWriteCuratedCalibrations = Field(
        "If True (default), ingest human-curated calibrations directly via "
        "the Instrument interface. Note that these calibrations are never "
        "converted from Gen2 repositories.",
        dtype=bool,
        default=True,
    )
    refCats = ListField(
        "The names of reference catalogs (subdirectories under ref_cats) to "
        "be converted",
        dtype=str,
        default=[]
    )
    fileIgnorePatterns = ListField(
        "Filename globs that should be ignored instead of being treated as "
        "datasets.",
        dtype=str,
        default=["README.txt", "*~?", "butler.yaml", "gen3.sqlite3",
                 "registry.sqlite3", "calibRegistry.sqlite3", "_mapper",
                 "_parent", "repositoryCfg.yaml"]
    )
    rawDatasetType = Field(
        "Gen2 dataset type to use for raw data.",
        dtype=str,
        default="raw",
    )
    datasetIncludePatterns = ListField(
        "Glob-style patterns for dataset type names that should be converted.",
        dtype=str,
        default=["*"]
    )
    datasetIgnorePatterns = ListField(
        "Glob-style patterns for dataset type names that should not be "
        "converted despite matching a pattern in datasetIncludePatterns.",
        dtype=str,
        default=[]
    )
    ccdKey = Field(
        "Key used for the Gen2 equivalent of 'detector' in data IDs.",
        dtype=str,
        default="ccd",
    )
    # The documentation previously claimed True was the default; the actual
    # default has always been False, so the text was corrected rather than
    # the behavior.
    relatedOnly = Field(
        "If True, only convert datasets that are related to the "
        "ingested visits (default is False). Ignored unless a list of "
        "visits is passed to run().",
        dtype=bool,
        default=False,
    )
    curatedCalibrations = ListField(
        "Dataset types that are handled by `Instrument.writeCuratedCalibrations()` "
        "and thus should not be converted using the standard calibration "
        "conversion system.",
        dtype=str,
        default=["camera",
                 "transmission_sensor",
                 "transmission_filter",
                 "transmission_optics",
                 "transmission_atmosphere",
                 "bfKernel"]
    )
    instrument = Field(
        doc=("Fully-qualified Python name of the `Instrument` subclass for "
             "all converted datasets."),
        dtype=str,
        optional=False,
        default=None,
    )

    @property
    def transfer(self):
        """Convenience alias for ``self.raws.transfer``.

        Reading or assigning this attribute reads or assigns the ``transfer``
        option of the raw-ingest subtask configuration.
        """
        return self.raws.transfer

    @transfer.setter
    def transfer(self, value):
        self.raws.transfer = value

    def setDefaults(self):
        """Apply defaults that cannot be expressed as field defaults.

        Sets ``transfer`` (and hence ``raws.transfer``) to `None`.
        """
        # Chain to the base class so inherited defaults are applied first.
        super().setDefaults()
        self.transfer = None

    # TODO: check that there are no collection overrides for curated
    # calibrations, since we don't have a good way to utilize them.

285 

286 

class ConvertRepoTask(Task):
    """A task that converts one or more related Gen2 data repositories to a
    single Gen3 data repository (with multiple collections).

    Parameters
    ----------
    config : `ConvertRepoConfig`
        Configuration for this task. If `None`, a default-constructed
        `ConvertRepoConfig` is used (and validated).
    butler3 : `lsst.daf.butler.Butler`
        A writeable Gen3 Butler instance that represents the data repository
        that datasets will be ingested into. If the 'raw' dataset is
        configured to be included in the conversion, ``butler3.run`` should be
        set to the name of the collection raws should be ingested into, and
        ``butler3.collections`` should include a calibration collection from
        which the ``camera`` dataset can be loaded, unless a calibration repo
        is converted and ``doWriteCuratedCalibrations`` is `True`.
    **kwargs
        Other keyword arguments are forwarded to the `Task` constructor.

    Notes
    -----
    Most of the work of converting repositories is delegated to instances of
    the `RepoConverter` hierarchy. The `ConvertRepoTask` instance itself holds
    only state that is relevant for all Gen2 repositories being ingested, while
    each `RepoConverter` instance holds only state relevant for the conversion
    of a single Gen2 repository. Both the task and the `RepoConverter`
    instances are single use; `ConvertRepoTask.run` and most `RepoConverter`
    methods may only be called once on a particular instance.
    """

    ConfigClass = ConvertRepoConfig

    _DefaultName = "convertRepo"

    def __init__(self, config=None, *, butler3: Butler3, **kwargs):
        if config is None:
            # The signature advertises ``config=None``; build a default config
            # so validation reports a configuration problem instead of
            # failing with AttributeError on `None`.
            config = self.ConfigClass()
        config.validate()  # Not a CmdlineTask nor PipelineTask, so have to validate the config here.
        super().__init__(config, **kwargs)
        self.butler3 = butler3
        self.registry = self.butler3.registry
        self.universe = self.registry.dimensions
        if self.isDatasetTypeIncluded("raw"):
            self.makeSubtask("raws", butler=butler3)
            self.makeSubtask("defineVisits", butler=butler3)
        else:
            self.raws = None
            self.defineVisits = None
        self.instrument = doImport(self.config.instrument)()
        self._configuredSkyMapsBySha1 = {}
        self._configuredSkyMapsByName = {}
        # Use a distinct loop variable so the ``config`` argument is not
        # shadowed.
        for name, skyMapConfig in self.config.skyMaps.items():
            instance = skyMapConfig.skyMap.apply()
            self._populateSkyMapDicts(name, instance)
        self._usedSkyPix = set()
        self.translatorFactory = self.instrument.makeDataIdTranslatorFactory()
        self.translatorFactory.log = self.log.getChild("translators")

    def _populateSkyMapDicts(self, name, instance):
        """Record a skymap in the by-name and by-hash lookup tables.

        Parameters
        ----------
        name : `str`
            Name to associate with the skymap.
        instance : `lsst.skymap.BaseSkyMap`
            The skymap object.

        Returns
        -------
        struct : `ConfiguredSkyMap`
            The newly-created record that was added to both tables.
        """
        struct = ConfiguredSkyMap(name=name, sha1=instance.getSha1(), instance=instance)
        self._configuredSkyMapsBySha1[struct.sha1] = struct
        self._configuredSkyMapsByName[struct.name] = struct
        return struct

    def isDatasetTypeIncluded(self, datasetTypeName: str) -> bool:
        """Return `True` if configuration indicates that the given dataset type
        should be converted.

        This method is intended to be called primarily by the
        `RepoConverter` instances used internally by the task.

        Parameters
        ----------
        datasetTypeName : `str`
            Name of the dataset type.

        Returns
        -------
        included : `bool`
            Whether the dataset should be included in the conversion: it
            must match at least one of ``config.datasetIncludePatterns`` and
            none of ``config.datasetIgnorePatterns`` (case-sensitive globs).
        """
        return (
            any(fnmatch.fnmatchcase(datasetTypeName, pattern)
                for pattern in self.config.datasetIncludePatterns)
            and not any(fnmatch.fnmatchcase(datasetTypeName, pattern)
                        for pattern in self.config.datasetIgnorePatterns)
        )

    def useSkyMap(self, skyMap: BaseSkyMap, skyMapName: str) -> str:
        """Indicate that a repository uses the given SkyMap.

        This method is intended to be called primarily by the
        `RepoConverter` instances used internally by the task.

        Parameters
        ----------
        skyMap : `lsst.skymap.BaseSkyMap`
            SkyMap instance being used, typically retrieved from a Gen2
            data repository.
        skyMapName : `str`
            The name of the gen2 skymap. It is used as the Gen3 name when the
            skymap's hash does not match any skymap that is already known.

        Returns
        -------
        name : `str`
            The name of the skymap in Gen3 data IDs.

        Notes
        -----
        Skymaps are matched by the hash returned from ``skyMap.getSha1()``.
        A skymap whose hash is not already known is added under
        ``skyMapName`` rather than being treated as an error.
        """
        sha1 = skyMap.getSha1()
        struct = self._configuredSkyMapsBySha1.get(sha1)
        if struct is None:
            # NOTE(review): this replaces any existing by-name entry that
            # already uses ``skyMapName`` with a different hash - confirm
            # whether such a collision should be reported.
            struct = self._populateSkyMapDicts(skyMapName, skyMap)
        struct.used = True
        return struct.name

    def registerUsedSkyMaps(self, subset: Optional[ConversionSubset]):
        """Register all skymaps that have been marked as used.

        This method is intended to be called primarily by the
        `RepoConverter` instances used internally by the task.

        Parameters
        ----------
        subset : `ConversionSubset`, optional
            Object that will be used to filter converted datasets by data ID.
            If given (and ``config.relatedOnly`` is `True`), it will be
            updated with the tracts of this skymap that overlap the visits in
            the subset.
        """
        for struct in self._configuredSkyMapsBySha1.values():
            if struct.used:
                struct.instance.register(struct.name, self.registry)
                if subset is not None and self.config.relatedOnly:
                    subset.addSkyMap(self.registry, struct.name)

    def useSkyPix(self, dimension: SkyPixDimension):
        """Indicate that a repository uses the given SkyPix dimension.

        This method is intended to be called primarily by the
        `RepoConverter` instances used internally by the task.

        Parameters
        ----------
        dimension : `lsst.daf.butler.SkyPixDimension`
            Dimension representing a pixelization of the sky.
        """
        self._usedSkyPix.add(dimension)

    def registerUsedSkyPix(self, subset: Optional[ConversionSubset]):
        """Add all skypix dimensions that have been marked as used to the
        given subset.

        This method is intended to be called primarily by the
        `RepoConverter` instances used internally by the task.

        Parameters
        ----------
        subset : `ConversionSubset`, optional
            Object that will be used to filter converted datasets by data ID.
            If given (and ``config.relatedOnly`` is `True`), it will be
            updated with the pixelization IDs that overlap the visits in the
            subset; otherwise this method does nothing.
        """
        if subset is not None and self.config.relatedOnly:
            for dimension in self._usedSkyPix:
                subset.addSkyPix(self.registry, dimension)

    def run(self, root: str, *,
            calibs: Optional[Dict[str, str]] = None,
            reruns: List[Rerun],
            visits: Optional[Iterable[int]] = None):
        """Convert a group of related data repositories.

        Parameters
        ----------
        root : `str`
            Complete path to the root Gen2 data repository. This should be
            a data repository that includes a Gen2 registry and any raw files
            and/or reference catalogs.
        calibs : `dict`, optional
            Dictionary mapping calibration repository path to the
            `~lsst.daf.butler.CollectionType.RUN` collection that converted
            datasets within it should be inserted into. Relative paths are
            interpreted relative to the root repository. `None` (default) is
            equivalent to an empty dictionary.
        reruns : `list` of `Rerun`
            Specifications for rerun (processing output) collections to
            convert. Relative paths are interpreted relative to the root
            repository.
        visits : iterable of `int`, optional
            The integer IDs of visits to convert. If not provided, all visits
            in the Gen2 root repository will be converted.
        """
        if calibs is None:
            calibs = {}
        if visits is not None:
            subset = ConversionSubset(instrument=self.instrument.getName(), visits=frozenset(visits))
        else:
            if self.config.relatedOnly:
                self.log.warn("config.relatedOnly is True but all visits are being ingested; "
                              "no filtering will be done.")
            subset = None

        # Make converters for all Gen2 repos.
        converters = []
        rootConverter = RootRepoConverter(task=self, root=root, subset=subset)
        converters.append(rootConverter)
        for calibRoot, run in calibs.items():
            if not os.path.isabs(calibRoot):
                calibRoot = os.path.join(rootConverter.root, calibRoot)
            converter = CalibRepoConverter(task=self, root=calibRoot, run=run,
                                           mapper=rootConverter.mapper,
                                           subset=rootConverter.subset)
            converters.append(converter)
        for spec in reruns:
            runRoot = spec.path
            if not os.path.isabs(runRoot):
                runRoot = os.path.join(rootConverter.root, runRoot)
            converter = StandardRepoConverter(task=self, root=runRoot, run=spec.runName,
                                              subset=rootConverter.subset)
            converters.append(converter)

        # Register the instrument if we're configured to do so.
        if self.config.doRegisterInstrument:
            # Allow registration to fail on the assumption that this means
            # we are reusing a butler, but leave a trace in the log so that
            # unrelated failures are not completely invisible.
            try:
                self.instrument.register(self.registry)
            except Exception as err:
                self.log.info("Instrument registration failed (%s); assuming the instrument "
                              "is already registered.", err)

        # Run raw ingest (does nothing if we weren't configured to convert the
        # 'raw' dataset type).
        rootConverter.runRawIngest()

        # Write curated calibrations to all calibration repositories.
        # Add new collections to the list of collections the butler was
        # initialized to pass to DefineVisitsTask, to deal with the (likely)
        # case the only 'camera' dataset in the repo will be one we're adding
        # here.
        if self.config.doWriteCuratedCalibrations:
            for run in calibs.values():
                butler3 = Butler3(butler=self.butler3, run=run)
                self.instrument.writeCuratedCalibrations(butler3)

        # Define visits (also does nothing if we weren't configured to convert
        # the 'raw' dataset type).
        rootConverter.runDefineVisits()

        # Walk Gen2 repos to find datasets to convert.
        for converter in converters:
            converter.prep()

        # Insert dimensions needed by any converters. In practice this is just
        # calibration_labels right now, because exposures and visits (and
        # things related to them) are handled by RawIngestTask and
        # DefineVisitsTask earlier and skymaps are handled later.
        #
        # Note that we do not try to filter dimensions down to just those
        # related to the given visits, even if config.relatedOnly is True; we
        # need them in the Gen3 repo in order to be able to know which datasets
        # to convert, because Gen2 alone doesn't know enough about the
        # relationships between data IDs.
        for converter in converters:
            converter.insertDimensionData()

        # Insert dimensions that are potentially shared by all Gen2
        # repositories (and are hence managed directly by the Task, rather
        # than a converter instance).
        # This also finishes setting up the (shared) converter.subsets object
        # that is used to filter data IDs for config.relatedOnly.
        self.registerUsedSkyMaps(rootConverter.subset)
        self.registerUsedSkyPix(rootConverter.subset)

        # Look for datasets, generally by scanning the filesystem.
        # This requires dimensions to have already been inserted so we can use
        # dimension information to identify related datasets.
        for converter in converters:
            converter.findDatasets()

        # Expand data IDs.
        for converter in converters:
            converter.expandDataIds()

        # Actually ingest datasets.
        for converter in converters:
            converter.ingest()

        # Add chained collections for reruns. Each chain lists the rerun's own
        # run first, then its declared parents, then the root repo's
        # collections.
        for spec in reruns:
            if spec.chainName is not None:
                self.butler3.registry.registerCollection(spec.chainName, type=CollectionType.CHAINED)
                chain = [spec.runName]
                chain.extend(spec.parents)
                chain.extend(rootConverter.getCollectionChain())
                self.log.info("Defining %s from chain %s.", spec.chainName, chain)
                self.butler3.registry.setCollectionChain(spec.chainName, chain)

577 self.butler3.registry.registerCollection(spec.chainName, type=CollectionType.CHAINED) 

578 chain = [spec.runName] 

579 chain.extend(spec.parents) 

580 chain.extend(rootConverter.getCollectionChain()) 

581 self.log.info("Defining %s from chain %s.", spec.chainName, chain) 

582 self.butler3.registry.setCollectionChain(spec.chainName, chain)