Hide keyboard shortcuts

Hot-keys on this page

r m x p   toggle line displays

j k   next/prev highlighted chunk

0   (zero) top of page

1   (one) first highlighted chunk

1# This file is part of obs_base. 

2# 

3# Developed for the LSST Data Management System. 

4# This product includes software developed by the LSST Project 

5# (https://www.lsst.org). 

6# See the COPYRIGHT file at the top-level directory of this distribution 

7# for details of code ownership. 

8# 

9# This program is free software: you can redistribute it and/or modify 

10# it under the terms of the GNU General Public License as published by 

11# the Free Software Foundation, either version 3 of the License, or 

12# (at your option) any later version. 

13# 

14# This program is distributed in the hope that it will be useful, 

15# but WITHOUT ANY WARRANTY; without even the implied warranty of 

16# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the 

17# GNU General Public License for more details. 

18# 

19# You should have received a copy of the GNU General Public License 

20# along with this program. If not, see <http://www.gnu.org/licenses/>. 

21from __future__ import annotations 

22 

23__all__ = ["ConvertRepoConfig", "ConvertRepoTask", "ConvertRepoSkyMapConfig", "Rerun"] 

24 

25import os 

26import fnmatch 

27from dataclasses import dataclass 

28from typing import Iterable, Optional, List, Dict 

29 

30from lsst.daf.butler import ( 

31 Butler as Butler3, 

32 CollectionType, 

33 SkyPixDimension 

34) 

35from lsst.pex.config import Config, ConfigurableField, ConfigDictField, DictField, ListField, Field 

36from lsst.pipe.base import Task 

37from lsst.skymap import skyMapRegistry, BaseSkyMap 

38 

39from ..ingest import RawIngestTask 

40from ..defineVisits import DefineVisitsTask 

41from .repoConverter import ConversionSubset 

42from .rootRepoConverter import RootRepoConverter 

43from .calibRepoConverter import CalibRepoConverter 

44from .standardRepoConverter import StandardRepoConverter 

45from .._instrument import Instrument 

46 

47 

@dataclass
class ConfiguredSkyMap:
    """Struct containing information about a skymap that may appear in a Gen2
    repository.
    """

    name: str
    """Name of the skymap used in Gen3 data IDs.
    """

    sha1: bytes
    """Hash computed by `BaseSkyMap.getSha1`.
    """

    instance: BaseSkyMap
    """The skymap object itself (`lsst.skymap.BaseSkyMap`).
    """

    used: bool = False
    """Whether this skymap has been found in at least one repository being
    converted.
    """

70 

71 

@dataclass
class Rerun:
    """Description of a single Gen2 processing-output repository (a "rerun")
    that should be converted.
    """

    path: str
    """Location of the Gen2 repository (`str`).

    May be absolute, or relative to the root repository.
    """

    runName: str
    """Name of the `~lsst.daf.butler.CollectionType.RUN` collection that the
    converted datasets are inserted into (`str`).
    """

    chainName: Optional[str]
    """Name of a `~lsst.daf.butler.CollectionType.CHAINED` collection that
    combines the datasets of this repository with those of its parent
    repositories (`str`, optional).
    """

    parents: List[str]
    """Names of the collections associated with parent repositories; these are
    used when defining the chained collection (`list` [ `str` ]).

    Ignored if `chainName` is `None`. Runs used in the root repo are
    automatically included.
    """

100 

101 

class ConvertRepoSkyMapConfig(Config):
    """Sub-config that holds the type and parameters of one SkyMap.

    Notes
    -----
    The only reason this config exists is that a
    `~lsst.pex.config.RegistryField` cannot be placed directly inside a
    `~lsst.pex.config.ConfigDictField`.

    Its single field must be called "skyMap" so that it is compatible with
    the configuration of `lsst.pipe.tasks.MakeSkyMapTask`; that lets one
    config file in an obs package configure both.

    The price is some repetition when the field is reached through the
    dictionary field that holds these configs ("skyMaps[name].skyMap"), which
    seems unavoidable.
    """

    skyMap = skyMapRegistry.makeField(doc="Type and parameters for the SkyMap itself.", default="dodeca")

123 

124 

class ConvertRepoConfig(Config):
    """Configuration for `ConvertRepoTask`.

    Notes
    -----
    The ``transfer`` attribute is not a config field of its own; it is a
    property that reads and writes ``self.raws.transfer``.
    """

    raws = ConfigurableField(
        "Configuration for subtask responsible for ingesting raws and adding "
        "exposure dimension entries.",
        target=RawIngestTask,
    )
    defineVisits = ConfigurableField(
        "Configuration for the subtask responsible for defining visits from "
        "exposures.",
        target=DefineVisitsTask,
    )
    skyMaps = ConfigDictField(
        "Mapping from Gen3 skymap name to the parameters used to construct a "
        "BaseSkyMap instance. This will be used to associate names with "
        "existing skymaps found in the Gen2 repo.",
        keytype=str,
        itemtype=ConvertRepoSkyMapConfig,
        default={}
    )
    rootSkyMapName = Field(
        "Name of a Gen3 skymap (an entry in ``self.skyMaps``) to assume for "
        "datasets in the root repository when no SkyMap is found there. ",
        dtype=str,
        optional=True,
        default=None,
    )
    runs = DictField(
        "A mapping from dataset type name to the RUN collection they should "
        "be inserted into. This must include all datasets that can be found "
        "in the root repository; other repositories will use per-repository "
        "runs.",
        keytype=str,
        itemtype=str,
        default={
            "deepCoadd_skyMap": "skymaps",
            "brightObjectMask": "masks",
        }
    )
    storageClasses = DictField(
        "Mapping from dataset type name or Gen2 policy entry (e.g. 'python' "
        "or 'persistable') to the Gen3 StorageClass name.",
        keytype=str,
        itemtype=str,
        default={
            "bias": "ExposureF",
            "dark": "ExposureF",
            "flat": "ExposureF",
            "defects": "Defects",
            "crosstalk": "CrosstalkCalib",
            "BaseSkyMap": "SkyMap",
            "BaseCatalog": "Catalog",
            "BackgroundList": "Background",
            "raw": "Exposure",
            "MultilevelParquetTable": "DataFrame",
            "ParquetTable": "DataFrame",
            "SkyWcs": "Wcs",
        }
    )
    formatterClasses = DictField(
        "Mapping from dataset type name to formatter class. "
        "By default these are derived from the formatters listed in the"
        " Gen3 datastore configuration.",
        keytype=str,
        itemtype=str,
        default={}
    )
    targetHandlerClasses = DictField(
        "Mapping from dataset type name to target handler class.",
        keytype=str,
        itemtype=str,
        default={}
    )
    doRegisterInstrument = Field(
        "If True (default), add dimension records for the Instrument and its "
        "filters and detectors to the registry instead of assuming they are "
        "already present.",
        dtype=bool,
        default=True,
    )
    doWriteCuratedCalibrations = Field(
        "If True (default), ingest human-curated calibrations directly via "
        "the Instrument interface. Note that these calibrations are never "
        "converted from Gen2 repositories.",
        dtype=bool,
        default=True,
    )
    refCats = ListField(
        "The names of reference catalogs (subdirectories under ref_cats) to "
        "be converted",
        dtype=str,
        default=[]
    )
    fileIgnorePatterns = ListField(
        "Filename globs that should be ignored instead of being treated as "
        "datasets.",
        dtype=str,
        default=["README.txt", "*~?", "butler.yaml", "gen3.sqlite3",
                 "registry.sqlite3", "calibRegistry.sqlite3", "_mapper",
                 "_parent", "repositoryCfg.yaml"]
    )
    rawDatasetType = Field(
        "Gen2 dataset type to use for raw data.",
        dtype=str,
        default="raw",
    )
    datasetIncludePatterns = ListField(
        "Glob-style patterns for dataset type names that should be converted.",
        dtype=str,
        default=["*"]
    )
    datasetIgnorePatterns = ListField(
        "Glob-style patterns for dataset type names that should not be "
        "converted despite matching a pattern in datasetIncludePatterns.",
        dtype=str,
        default=[]
    )
    ccdKey = Field(
        "Key used for the Gen2 equivalent of 'detector' in data IDs.",
        dtype=str,
        default="ccd",
    )
    # The documentation previously claimed True was the default, which
    # contradicted ``default=False`` below; the default itself is unchanged.
    relatedOnly = Field(
        "If True, only convert datasets that are related to the "
        "ingested visits (default is False). Ignored unless a list of "
        "visits is passed to run().",
        dtype=bool,
        default=False,
    )

    @property
    def transfer(self):
        """Transfer mode; a proxy for ``self.raws.transfer``.
        """
        return self.raws.transfer

    @transfer.setter
    def transfer(self, value):
        self.raws.transfer = value

    def setDefaults(self):
        """Set ``transfer`` (i.e. ``raws.transfer``) to `None`.
        """
        self.transfer = None

    # TODO: check that there are no collection overrides for curated
    # calibrations, since we don't have a good way to utilize them.

267 

268 

class ConvertRepoTask(Task):
    """A task that converts one or more related Gen2 data repositories to a
    single Gen3 data repository (with multiple collections).

    Parameters
    ----------
    config: `ConvertRepoConfig`
        Configuration for this task. If `None`, a default-constructed
        `ConvertRepoConfig` is used.
    butler3: `lsst.daf.butler.Butler`
        A writeable Gen3 Butler instance that represents the data repository
        that datasets will be ingested into. If the 'raw' dataset is
        configured to be included in the conversion, ``butler3.run`` should be
        set to the name of the collection raws should be ingested into, and
        ``butler3.collections`` should include a calibration collection from
        which the ``camera`` dataset can be loaded, unless a calibration repo
        is converted and ``doWriteCuratedCalibrations`` is `True`.
    instrument : `Instrument`
        The instrument associated with the repositories being converted. It
        is used to register dimension records, write curated calibrations and
        create the data ID translator factory.
    **kwargs
        Other keyword arguments are forwarded to the `Task` constructor.

    Notes
    -----
    Most of the work of converting repositories is delegated to instances of
    the `RepoConverter` hierarchy. The `ConvertRepoTask` instance itself holds
    only state that is relevant for all Gen2 repositories being ingested, while
    each `RepoConverter` instance holds only state relevant for the conversion
    of a single Gen2 repository. Both the task and the `RepoConverter`
    instances are single use; `ConvertRepoTask.run` and most `RepoConverter`
    methods may only be called once on a particular instance.
    """

    ConfigClass = ConvertRepoConfig

    _DefaultName = "convertRepo"

    def __init__(self, config=None, *, butler3: Butler3, instrument: Instrument, **kwargs):
        # ``config`` is declared optional, so build the default here instead
        # of failing on ``None.validate()``.
        if config is None:
            config = self.ConfigClass()
        # Not a CmdlineTask nor PipelineTask, so have to validate the config
        # here.
        config.validate()
        super().__init__(config, **kwargs)
        self.butler3 = butler3
        self.registry = self.butler3.registry
        self.universe = self.registry.dimensions
        if self.isDatasetTypeIncluded("raw"):
            self.makeSubtask("raws", butler=butler3)
            self.makeSubtask("defineVisits", butler=butler3)
        else:
            self.raws = None
            self.defineVisits = None
        self.instrument = instrument
        self._configuredSkyMapsBySha1 = {}
        self._configuredSkyMapsByName = {}
        # Use a distinct loop variable so the ``config`` argument is not
        # shadowed.
        for name, skyMapConfig in self.config.skyMaps.items():
            instance = skyMapConfig.skyMap.apply()
            self._populateSkyMapDicts(name, instance)
        self._usedSkyPix = set()
        self.translatorFactory = self.instrument.makeDataIdTranslatorFactory()
        self.translatorFactory.log = self.log.getChild("translators")

    def _populateSkyMapDicts(self, name, instance):
        """Record a skymap in the by-SHA1 and by-name lookup dictionaries.

        Parameters
        ----------
        name : `str`
            Name to associate with the skymap.
        instance : `lsst.skymap.BaseSkyMap`
            The skymap to record.
        """
        struct = ConfiguredSkyMap(name=name, sha1=instance.getSha1(), instance=instance)
        self._configuredSkyMapsBySha1[struct.sha1] = struct
        self._configuredSkyMapsByName[struct.name] = struct

    def isDatasetTypeIncluded(self, datasetTypeName: str) -> bool:
        """Return `True` if configuration indicates that the given dataset type
        should be converted.

        This method is intended to be called primarily by the
        `RepoConverter` instances used internally by the task.

        Parameters
        ----------
        datasetTypeName: str
            Name of the dataset type.

        Returns
        -------
        included : `bool`
            Whether the dataset should be included in the conversion.
        """
        return (
            any(fnmatch.fnmatchcase(datasetTypeName, pattern)
                for pattern in self.config.datasetIncludePatterns)
            and not any(fnmatch.fnmatchcase(datasetTypeName, pattern)
                        for pattern in self.config.datasetIgnorePatterns)
        )

    def useSkyMap(self, skyMap: BaseSkyMap, skyMapName: str) -> str:
        """Indicate that a repository uses the given SkyMap.

        This method is intended to be called primarily by the
        `RepoConverter` instances used internally by the task.

        Parameters
        ----------
        skyMap : `lsst.skymap.BaseSkyMap`
            SkyMap instance being used, typically retrieved from a Gen2
            data repository.
        skyMapName : `str`
            The name of the gen2 skymap. It is used as the skymap's name if
            no already-known skymap has the same SHA1.

        Returns
        -------
        name : `str`
            The name of the skymap in Gen3 data IDs.
        """
        sha1 = skyMap.getSha1()
        if sha1 not in self._configuredSkyMapsBySha1:
            # Not a configured skymap: remember it under its Gen2 name.
            self._populateSkyMapDicts(skyMapName, skyMap)
        # The entry is present at this point, either from configuration or
        # from the call just above, so no missing-key handling is needed.
        struct = self._configuredSkyMapsBySha1[sha1]
        struct.used = True
        return struct.name

    def registerUsedSkyMaps(self, subset: Optional[ConversionSubset]):
        """Register all skymaps that have been marked as used.

        This method is intended to be called primarily by the
        `RepoConverter` instances used internally by the task.

        Parameters
        ----------
        subset : `ConversionSubset`, optional
            Object that will be used to filter converted datasets by data ID.
            If given (and ``config.relatedOnly`` is `True`), it will be
            updated with the tracts of this skymap that overlap the visits in
            the subset.
        """
        for struct in self._configuredSkyMapsBySha1.values():
            if struct.used:
                struct.instance.register(struct.name, self.registry)
                if subset is not None and self.config.relatedOnly:
                    subset.addSkyMap(self.registry, struct.name)

    def useSkyPix(self, dimension: SkyPixDimension):
        """Indicate that a repository uses the given SkyPix dimension.

        This method is intended to be called primarily by the
        `RepoConverter` instances used internally by the task.

        Parameters
        ----------
        dimension : `lsst.daf.butler.SkyPixDimension`
            Dimension representing a pixelization of the sky.
        """
        self._usedSkyPix.add(dimension)

    def registerUsedSkyPix(self, subset: Optional[ConversionSubset]):
        """Add all SkyPix dimensions that have been marked as used to the
        given subset.

        This method is intended to be called primarily by the
        `RepoConverter` instances used internally by the task.

        Parameters
        ----------
        subset : `ConversionSubset`, optional
            Object that will be used to filter converted datasets by data ID.
            If given (and ``config.relatedOnly`` is `True`), it will be
            updated with the pixelization IDs that overlap the visits in the
            subset.
        """
        if subset is not None and self.config.relatedOnly:
            for dimension in self._usedSkyPix:
                subset.addSkyPix(self.registry, dimension)

    def run(self, root: str, *,
            calibs: Optional[Dict[str, str]] = None,
            reruns: List[Rerun],
            visits: Optional[Iterable[int]] = None):
        """Convert a group of related data repositories.

        Parameters
        ----------
        root : `str`
            Complete path to the root Gen2 data repository. This should be
            a data repository that includes a Gen2 registry and any raw files
            and/or reference catalogs.
        calibs : `dict`, optional
            Dictionary mapping calibration repository path to the
            `~lsst.daf.butler.CollectionType.CALIBRATION` collection that
            converted datasets within it should be certified into. `None`
            (the default) is treated as an empty dictionary.
        reruns : `list` of `Rerun`
            Specifications for rerun (processing output) collections to
            convert.
        visits : iterable of `int`, optional
            The integer IDs of visits to convert. If not provided, all visits
            in the Gen2 root repository will be converted.
        """
        if calibs is None:
            calibs = {}
        if visits is not None:
            subset = ConversionSubset(instrument=self.instrument.getName(), visits=frozenset(visits))
        else:
            if self.config.relatedOnly:
                self.log.warn("config.relatedOnly is True but all visits are being ingested; "
                              "no filtering will be done.")
            subset = None

        # Make converters for all Gen2 repos.
        converters = []
        rootConverter = RootRepoConverter(task=self, root=root, subset=subset, instrument=self.instrument)
        converters.append(rootConverter)
        for calibRoot, collection in calibs.items():
            if not os.path.isabs(calibRoot):
                calibRoot = os.path.join(rootConverter.root, calibRoot)
            converter = CalibRepoConverter(task=self, root=calibRoot, collection=collection,
                                           instrument=self.instrument,
                                           mapper=rootConverter.mapper,
                                           subset=rootConverter.subset)
            converters.append(converter)
        rerunConverters = {}
        for spec in reruns:
            runRoot = spec.path
            if not os.path.isabs(runRoot):
                runRoot = os.path.join(rootConverter.root, runRoot)
            converter = StandardRepoConverter(task=self, root=runRoot, run=spec.runName,
                                              instrument=self.instrument, subset=rootConverter.subset)
            converters.append(converter)
            rerunConverters[spec.runName] = converter

        # Register the instrument if we're configured to do so.
        if self.config.doRegisterInstrument:
            # Allow registration to fail on the assumption that this means
            # we are reusing a butler.  The failure is still logged so that
            # a genuine problem is not completely hidden.
            try:
                self.instrument.register(self.registry)
            except Exception as err:
                self.log.debug("Instrument registration failed (%s); assuming it is already "
                               "registered.", err)

        # Run raw ingest (does nothing if we weren't configured to convert the
        # 'raw' dataset type).
        rootConverter.runRawIngest()

        # Write curated calibrations to all calibration runs and
        # also in the default collection.
        # Add new collections to the list of collections the butler was
        # initialized to pass to DefineVisitsTask, to deal with the (likely)
        # case the only 'camera' dataset in the repo will be one we're adding
        # here.
        if self.config.doWriteCuratedCalibrations:
            butler3 = Butler3(butler=self.butler3)
            # Write curated calibrations to any new calibration collections we
            # created by converting a Gen2 calibration repo.
            calibCollections = set()
            for collection in calibs.values():
                self.instrument.writeCuratedCalibrations(butler3, collection=collection)
                calibCollections.add(collection)
            # Ensure that we have the curated calibrations even if there
            # is no calibration conversion. It's possible that the default
            # calib collection will have been specified (in fact the
            # butler convert script enforces that behavior for now) so
            # we check for the default situation
            # Assume we know the default rather than letting
            # writeCuratedCalibrations default itself
            defaultCalibCollection = self.instrument.makeCollectionName("calib")
            if defaultCalibCollection not in calibCollections:
                self.instrument.writeCuratedCalibrations(butler3, collection=defaultCalibCollection)

        # Define visits (also does nothing if we weren't configured to convert
        # the 'raw' dataset type).
        rootConverter.runDefineVisits()

        # Walk Gen2 repos to find datasets to convert.
        for converter in converters:
            converter.prep()

        # Insert dimensions that are potentially shared by all Gen2
        # repositories (and are hence managed directly by the Task, rather
        # than a converter instance).
        # This also finishes setting up the (shared) converter.subsets object
        # that is used to filter data IDs for config.relatedOnly.
        self.registerUsedSkyMaps(rootConverter.subset)
        self.registerUsedSkyPix(rootConverter.subset)

        # Look for datasets, generally by scanning the filesystem.
        # This requires dimensions to have already been inserted so we can use
        # dimension information to identify related datasets.
        for converter in converters:
            converter.findDatasets()

        # Expand data IDs.
        for converter in converters:
            converter.expandDataIds()

        # Actually ingest datasets.
        for converter in converters:
            converter.ingest()

        # Perform any post-ingest processing.
        for converter in converters:
            converter.finish()

        # Add chained collections for reruns.
        for spec in reruns:
            if spec.chainName is not None:
                self.butler3.registry.registerCollection(spec.chainName, type=CollectionType.CHAINED)
                chain = [spec.runName]
                chain.extend(rerunConverters[spec.runName].getCollectionChain())
                for parent in spec.parents:
                    # Append the parent collection being iterated over;
                    # `Rerun` has no ``parent`` attribute.
                    chain.append(parent)
                    parentConverter = rerunConverters.get(parent)
                    if parentConverter is not None:
                        chain.extend(parentConverter.getCollectionChain())
                chain.extend(rootConverter.getCollectionChain())
                self.log.info("Defining %s from chain %s.", spec.chainName, chain)
                self.butler3.registry.setCollectionChain(spec.chainName, chain)

578 self.butler3.registry.setCollectionChain(spec.chainName, chain)