Hide keyboard shortcuts

Hot-keys on this page

r m x p   toggle line displays

j k   next/prev highlighted chunk

0   (zero) top of page

1   (one) first highlighted chunk

# This file is part of obs_base.
#
# Developed for the LSST Data Management System.
# This product includes software developed by the LSST Project
# (https://www.lsst.org).
# See the COPYRIGHT file at the top-level directory of this distribution
# for details of code ownership.
#
# This program is free software: you can redistribute it and/or modify
# it under the terms of the GNU General Public License as published by
# the Free Software Foundation, either version 3 of the License, or
# (at your option) any later version.
#
# This program is distributed in the hope that it will be useful,
# but WITHOUT ANY WARRANTY; without even the implied warranty of
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
# GNU General Public License for more details.
#
# You should have received a copy of the GNU General Public License
# along with this program.  If not, see <http://www.gnu.org/licenses/>.

from __future__ import annotations

# Public names exported by this module.
__all__ = ["ConvertRepoConfig", "ConvertRepoTask", "ConvertRepoSkyMapConfig", "Rerun"]

import os
import fnmatch
from dataclasses import dataclass
from multiprocessing import Pool
from typing import Iterable, Optional, List, Dict

from lsst.daf.butler import (
    Butler as Butler3,
    CollectionType,
    SkyPixDimension
)
from lsst.pex.config import Config, ConfigurableField, ConfigDictField, DictField, ListField, Field
from lsst.pipe.base import Task
from lsst.skymap import skyMapRegistry, BaseSkyMap

from ..ingest import RawIngestTask
from ..defineVisits import DefineVisitsTask
from .repoConverter import ConversionSubset
from .rootRepoConverter import RootRepoConverter
from .calibRepoConverter import CalibRepoConverter
from .standardRepoConverter import StandardRepoConverter
from .._instrument import Instrument

47 

48 

@dataclass
class ConfiguredSkyMap:
    """Struct containing information about a skymap that may appear in a Gen2
    repository.
    """

    name: str
    """Name of the skymap used in Gen3 data IDs (`str`).
    """

    sha1: bytes
    """Hash computed by `BaseSkyMap.getSha1` (`bytes`).
    """

    instance: BaseSkyMap
    """The skymap object itself (`lsst.skymap.BaseSkyMap`).
    """

    used: bool = False
    """Whether this skymap has been found in at least one repository being
    converted (`bool`).
    """

71 

72 

@dataclass
class Rerun:
    """Specification for a Gen2 processing-output repository to convert.
    """

    path: str
    """Absolute or relative (to the root repository) path to the Gen2
    repository (`str`).
    """

    runName: str
    """Name of the `~lsst.daf.butler.CollectionType.RUN` collection datasets
    will be inserted into (`str`).
    """

    chainName: Optional[str]
    """Name of a `~lsst.daf.butler.CollectionType.CHAINED` collection that will
    combine this repository's datasets with those of its parent repositories
    (`str`, optional).
    """

    parents: List[str]
    """Collection names associated with parent repositories, used to define the
    chained collection (`list` [ `str` ]).

    Ignored if `chainName` is `None`.  Runs used in the root repo are
    automatically included.
    """

101 

102 

class ConvertRepoSkyMapConfig(Config):
    """Sub-config used to hold the parameters of a SkyMap.

    Notes
    -----
    This config only needs to exist because we can't put a
    `~lsst.pex.config.RegistryField` directly inside a
    `~lsst.pex.config.ConfigDictField`.

    It needs to have its only field named "skyMap" for compatibility with the
    configuration of `lsst.pipe.tasks.MakeSkyMapTask`, which we want so we can
    use one config file in an obs package to configure both.

    This name leads to unfortunate repetition with the field named
    "skyMaps" that holds it - "skyMaps[name].skyMap" - but that seems
    unavoidable.
    """
    skyMap = skyMapRegistry.makeField(
        doc="Type and parameters for the SkyMap itself.",
        default="dodeca",
    )

124 

125 

class ConvertRepoConfig(Config):
    """Configuration for `ConvertRepoTask`.

    Notes
    -----
    The ``transfer`` property is a convenience alias for ``raws.transfer``;
    `setDefaults` resets it to `None`.
    """

    raws = ConfigurableField(
        "Configuration for subtask responsible for ingesting raws and adding "
        "exposure dimension entries.",
        target=RawIngestTask,
    )
    defineVisits = ConfigurableField(
        "Configuration for the subtask responsible for defining visits from "
        "exposures.",
        target=DefineVisitsTask,
    )
    skyMaps = ConfigDictField(
        "Mapping from Gen3 skymap name to the parameters used to construct a "
        "BaseSkyMap instance. This will be used to associate names with "
        "existing skymaps found in the Gen2 repo.",
        keytype=str,
        itemtype=ConvertRepoSkyMapConfig,
        default={}
    )
    rootSkyMapName = Field(
        "Name of a Gen3 skymap (an entry in ``self.skyMaps``) to assume for "
        "datasets in the root repository when no SkyMap is found there. ",
        dtype=str,
        optional=True,
        default=None,
    )
    runs = DictField(
        "A mapping from dataset type name to the RUN collection they should "
        "be inserted into. This must include all datasets that can be found "
        "in the root repository; other repositories will use per-repository "
        "runs.",
        keytype=str,
        itemtype=str,
        default={
            "deepCoadd_skyMap": "skymaps",
        }
    )
    runsForced = DictField(
        "Like ``runs``, but is used even when the dataset is present in a "
        "non-root repository (i.e. rerun), overriding the non-root "
        "repository's main collection.",
        keytype=str,
        itemtype=str,
        default={
            "brightObjectMask": "masks",
        }
    )
    storageClasses = DictField(
        "Mapping from dataset type name or Gen2 policy entry (e.g. 'python' "
        "or 'persistable') to the Gen3 StorageClass name.",
        keytype=str,
        itemtype=str,
        default={
            "bias": "ExposureF",
            "dark": "ExposureF",
            "flat": "ExposureF",
            "defects": "Defects",
            "crosstalk": "CrosstalkCalib",
            "BaseSkyMap": "SkyMap",
            "BaseCatalog": "Catalog",
            "BackgroundList": "Background",
            "raw": "Exposure",
            "MultilevelParquetTable": "DataFrame",
            "ParquetTable": "DataFrame",
            "SkyWcs": "Wcs",
        }
    )
    formatterClasses = DictField(
        "Mapping from dataset type name to formatter class. "
        "By default these are derived from the formatters listed in the"
        " Gen3 datastore configuration.",
        keytype=str,
        itemtype=str,
        default={}
    )
    targetHandlerClasses = DictField(
        "Mapping from dataset type name to target handler class.",
        keytype=str,
        itemtype=str,
        default={}
    )
    doRegisterInstrument = Field(
        "If True (default), add dimension records for the Instrument and its "
        "filters and detectors to the registry instead of assuming they are "
        "already present.",
        dtype=bool,
        default=True,
    )
    doWriteCuratedCalibrations = Field(
        "If True (default), ingest human-curated calibrations directly via "
        "the Instrument interface. Note that these calibrations are never "
        "converted from Gen2 repositories.",
        dtype=bool,
        default=True,
    )
    refCats = ListField(
        "The names of reference catalogs (subdirectories under ref_cats) to "
        "be converted",
        dtype=str,
        default=[]
    )
    fileIgnorePatterns = ListField(
        "Filename globs that should be ignored instead of being treated as "
        "datasets.",
        dtype=str,
        default=["README.txt", "*~?", "butler.yaml", "gen3.sqlite3",
                 "registry.sqlite3", "calibRegistry.sqlite3", "_mapper",
                 "_parent", "repositoryCfg.yaml"]
    )
    rawDatasetType = Field(
        "Gen2 dataset type to use for raw data.",
        dtype=str,
        default="raw",
    )
    datasetIncludePatterns = ListField(
        "Glob-style patterns for dataset type names that should be converted.",
        dtype=str,
        default=["*"]
    )
    datasetIgnorePatterns = ListField(
        "Glob-style patterns for dataset type names that should not be "
        "converted despite matching a pattern in datasetIncludePatterns.",
        dtype=str,
        default=[]
    )
    ccdKey = Field(
        "Key used for the Gen2 equivalent of 'detector' in data IDs.",
        dtype=str,
        default="ccd",
    )
    # The documentation previously claimed True was the default; the actual
    # default has always been False, so the text is corrected rather than the
    # value (changing the value would alter which datasets get converted).
    relatedOnly = Field(
        "If True, only convert datasets that are related to the "
        "ingested visits (default is False). Ignored unless a list of "
        "visits is passed to run().",
        dtype=bool,
        default=False,
    )

    @property
    def transfer(self):
        """Transfer mode, read from and written to ``self.raws.transfer``.
        """
        return self.raws.transfer

    @transfer.setter
    def transfer(self, value):
        self.raws.transfer = value

    def setDefaults(self):
        """Reset the transfer mode of the raw-ingest subtask to `None`.
        """
        self.transfer = None

    # TODO: check that there are no collection overrides for curated
    # calibrations, since we don't have a good way to utilize them.

277 

278 

class ConvertRepoTask(Task):
    """A task that converts one or more related Gen2 data repositories to a
    single Gen3 data repository (with multiple collections).

    Parameters
    ----------
    config : `ConvertRepoConfig`, optional
        Configuration for this task.  If `None`, the configuration chosen by
        the `Task` base class is used.
    butler3 : `lsst.daf.butler.Butler`
        A writeable Gen3 Butler instance that represents the data repository
        that datasets will be ingested into.  If the 'raw' dataset is
        configured to be included in the conversion, ``butler3.run`` should be
        set to the name of the collection raws should be ingested into, and
        ``butler3.collections`` should include a calibration collection from
        which the ``camera`` dataset can be loaded, unless a calibration repo
        is converted and ``doWriteCuratedCalibrations`` is `True`.
    instrument : `lsst.obs.base.Instrument`
        The Gen3 instrument that should be used for this conversion.
    **kwargs
        Other keyword arguments are forwarded to the `Task` constructor.

    Notes
    -----
    Most of the work of converting repositories is delegated to instances of
    the `RepoConverter` hierarchy.  The `ConvertRepoTask` instance itself holds
    only state that is relevant for all Gen2 repositories being ingested, while
    each `RepoConverter` instance holds only state relevant for the conversion
    of a single Gen2 repository.  Both the task and the `RepoConverter`
    instances are single use; `ConvertRepoTask.run` and most `RepoConverter`
    methods may only be called once on a particular instance.
    """

    ConfigClass = ConvertRepoConfig

    _DefaultName = "convertRepo"

    def __init__(self, config=None, *, butler3: Butler3, instrument: Instrument, **kwargs):
        super().__init__(config, **kwargs)
        # Not a CmdlineTask nor PipelineTask, so have to validate the config
        # here.  Validate ``self.config`` (as set by the base class) rather
        # than the ``config`` argument, because the argument may be `None`.
        self.config.validate()
        self.butler3 = butler3
        self.registry = self.butler3.registry
        self.universe = self.registry.dimensions
        if self.isDatasetTypeIncluded("raw"):
            self.makeSubtask("raws", butler=butler3)
            self.makeSubtask("defineVisits", butler=butler3)
        else:
            self.raws = None
            self.defineVisits = None
        self.instrument = instrument
        self._configuredSkyMapsBySha1 = {}
        self._configuredSkyMapsByName = {}
        # Use a distinct loop name so the ``config`` argument is not shadowed.
        for name, skyMapConfig in self.config.skyMaps.items():
            instance = skyMapConfig.skyMap.apply()
            self._populateSkyMapDicts(name, instance)
        self._usedSkyPix = set()
        self.translatorFactory = self.instrument.makeDataIdTranslatorFactory()
        self.translatorFactory.log = self.log.getChild("translators")

    def _reduce_kwargs(self):
        # Add extra parameters to pickle
        return dict(**super()._reduce_kwargs(), butler3=self.butler3, instrument=self.instrument)

    def _populateSkyMapDicts(self, name, instance):
        """Record a skymap in both lookup tables (by hash and by name).

        Parameters
        ----------
        name : `str`
            Name to associate with the skymap.
        instance : `lsst.skymap.BaseSkyMap`
            SkyMap instance; its ``getSha1()`` result is used as the hash key.
        """
        struct = ConfiguredSkyMap(name=name, sha1=instance.getSha1(), instance=instance)
        self._configuredSkyMapsBySha1[struct.sha1] = struct
        self._configuredSkyMapsByName[struct.name] = struct

    def isDatasetTypeIncluded(self, datasetTypeName: str) -> bool:
        """Return `True` if configuration indicates that the given dataset type
        should be converted.

        This method is intended to be called primarily by the
        `RepoConverter` instances used internally by the task.

        Parameters
        ----------
        datasetTypeName : `str`
            Name of the dataset type.

        Returns
        -------
        included : `bool`
            Whether the dataset should be included in the conversion: it
            matches at least one of ``config.datasetIncludePatterns`` and
            none of ``config.datasetIgnorePatterns`` (case-sensitive globs).
        """
        return (
            any(fnmatch.fnmatchcase(datasetTypeName, pattern)
                for pattern in self.config.datasetIncludePatterns)
            and not any(fnmatch.fnmatchcase(datasetTypeName, pattern)
                        for pattern in self.config.datasetIgnorePatterns)
        )

    def useSkyMap(self, skyMap: BaseSkyMap, skyMapName: str) -> str:
        """Indicate that a repository uses the given SkyMap.

        This method is intended to be called primarily by the
        `RepoConverter` instances used internally by the task.

        Parameters
        ----------
        skyMap : `lsst.skymap.BaseSkyMap`
            SkyMap instance being used, typically retrieved from a Gen2
            data repository.
        skyMapName : `str`
            The name of the gen2 skymap.  Used as the Gen3 name when no
            skymap with the same hash is already known.

        Returns
        -------
        name : `str`
            The name of the skymap in Gen3 data IDs.

        Notes
        -----
        A skymap whose hash is not already known is recorded on the fly under
        ``skyMapName``, so an unconfigured skymap is not an error.
        """
        sha1 = skyMap.getSha1()
        if sha1 not in self._configuredSkyMapsBySha1:
            self._populateSkyMapDicts(skyMapName, skyMap)
        # The entry is guaranteed to exist at this point, so no lookup-failure
        # handling is needed.
        struct = self._configuredSkyMapsBySha1[sha1]
        struct.used = True
        return struct.name

    def registerUsedSkyMaps(self, subset: Optional[ConversionSubset]):
        """Register all skymaps that have been marked as used.

        This method is intended to be called primarily by the
        `RepoConverter` instances used internally by the task.

        Parameters
        ----------
        subset : `ConversionSubset`, optional
            Object that will be used to filter converted datasets by data ID.
            If given (and ``config.relatedOnly`` is `True`), it will be
            updated with the tracts of this skymap that overlap the visits in
            the subset.
        """
        for struct in self._configuredSkyMapsBySha1.values():
            if struct.used:
                struct.instance.register(struct.name, self.registry)
                if subset is not None and self.config.relatedOnly:
                    subset.addSkyMap(self.registry, struct.name)

    def useSkyPix(self, dimension: SkyPixDimension):
        """Indicate that a repository uses the given SkyPix dimension.

        This method is intended to be called primarily by the
        `RepoConverter` instances used internally by the task.

        Parameters
        ----------
        dimension : `lsst.daf.butler.SkyPixDimension`
            Dimension representing a pixelization of the sky.
        """
        self._usedSkyPix.add(dimension)

    def registerUsedSkyPix(self, subset: Optional[ConversionSubset]):
        """Add all SkyPix dimensions that have been marked as used to the
        given conversion subset.

        This method is intended to be called primarily by the
        `RepoConverter` instances used internally by the task.

        Parameters
        ----------
        subset : `ConversionSubset`, optional
            Object that will be used to filter converted datasets by data ID.
            If given (and ``config.relatedOnly`` is `True`), it will be
            updated with the pixelization IDs that overlap the visits in the
            subset.  Otherwise this method does nothing.
        """
        if subset is not None and self.config.relatedOnly:
            for dimension in self._usedSkyPix:
                subset.addSkyPix(self.registry, dimension)

    def run(self, root: str, *,
            calibs: Optional[Dict[str, str]] = None,
            reruns: List[Rerun],
            visits: Optional[Iterable[int]] = None,
            pool: Optional[Pool] = None,
            processes: int = 1):
        """Convert a group of related data repositories.

        Parameters
        ----------
        root : `str`
            Complete path to the root Gen2 data repository.  This should be
            a data repository that includes a Gen2 registry and any raw files
            and/or reference catalogs.
        calibs : `dict`, optional
            Dictionary mapping calibration repository path to the
            `~lsst.daf.butler.CollectionType.CALIBRATION` collection that
            converted datasets within it should be certified into.  `None`
            (default) is treated as an empty mapping.
        reruns : `list` of `Rerun`
            Specifications for rerun (processing output) collections to
            convert.
        visits : iterable of `int`, optional
            The integer IDs of visits to convert.  If not provided, all visits
            in the Gen2 root repository will be converted.
        pool : `multiprocessing.Pool`, optional
            If not `None`, a process pool with which to parallelize some
            operations.  A pool passed in by the caller is never shut down
            here.
        processes : `int`, optional
            The number of processes to use for conversion.  Only consulted
            when ``pool`` is `None`; a value greater than one makes this
            method create its own pool, which it shuts down before returning.
        """
        # Only a pool created here is ours to shut down; a caller-supplied
        # pool stays under the caller's control.
        ownsPool = pool is None and processes > 1
        if ownsPool:
            pool = Pool(processes)
        if calibs is None:
            calibs = {}
        try:
            self._runConversion(root, calibs=calibs, reruns=reruns, visits=visits, pool=pool)
        except BaseException:
            if ownsPool:
                # Do not wait for queued work after a failure.
                pool.terminate()
            raise
        else:
            if ownsPool:
                pool.close()
        finally:
            if ownsPool:
                pool.join()

    def _runConversion(self, root: str, *,
                       calibs: Dict[str, str],
                       reruns: List[Rerun],
                       visits: Optional[Iterable[int]],
                       pool: Optional[Pool]):
        """Perform the conversion steps of `run` with a resolved pool and
        calibration mapping.

        Parameters are the same as for `run`, except that ``calibs`` must be
        a `dict` and ``pool`` is used as-is (`None` means no parallelism).
        """
        if visits is not None:
            subset = ConversionSubset(instrument=self.instrument.getName(), visits=frozenset(visits))
        else:
            if self.config.relatedOnly:
                self.log.warn("config.relatedOnly is True but all visits are being ingested; "
                              "no filtering will be done.")
            subset = None

        # Make converters for all Gen2 repos.
        converters = []
        rootConverter = RootRepoConverter(task=self, root=root, subset=subset, instrument=self.instrument)
        converters.append(rootConverter)
        for calibRoot, collection in calibs.items():
            if not os.path.isabs(calibRoot):
                calibRoot = os.path.join(rootConverter.root, calibRoot)
            converter = CalibRepoConverter(task=self, root=calibRoot, collection=collection,
                                           instrument=self.instrument,
                                           mapper=rootConverter.mapper,
                                           subset=rootConverter.subset)
            converters.append(converter)
        rerunConverters = {}
        for spec in reruns:
            runRoot = spec.path
            if not os.path.isabs(runRoot):
                runRoot = os.path.join(rootConverter.root, runRoot)
            converter = StandardRepoConverter(task=self, root=runRoot, run=spec.runName,
                                              instrument=self.instrument, subset=rootConverter.subset)
            converters.append(converter)
            rerunConverters[spec.runName] = converter

        # Register the instrument if we're configured to do so.
        if self.config.doRegisterInstrument:
            self.instrument.register(self.registry)

        # Run raw ingest (does nothing if we weren't configured to convert the
        # 'raw' dataset type).
        rootConverter.runRawIngest(pool=pool)

        # Write curated calibrations to all calibration runs and
        # also in the default collection.
        # Add new collections to the list of collections the butler was
        # initialized to pass to DefineVisitsTask, to deal with the (likely)
        # case the only 'camera' dataset in the repo will be one we're adding
        # here.
        if self.config.doWriteCuratedCalibrations:
            butler3 = Butler3(butler=self.butler3)
            # Write curated calibrations to any new calibration collections we
            # created by converting a Gen2 calibration repo.
            calibCollections = set()
            for collection in calibs.values():
                self.instrument.writeCuratedCalibrations(butler3, collection=collection)
                calibCollections.add(collection)
            # Ensure that we have the curated calibrations even if there
            # is no calibration conversion. It's possible that the default
            # calib collection will have been specified (in fact the
            # butler convert script enforces that behavior for now) so
            # we check for the default situation
            # Assume we know the default rather than letting
            # writeCuratedCalibrations default itself
            defaultCalibCollection = self.instrument.makeCollectionName("calib")
            if defaultCalibCollection not in calibCollections:
                self.instrument.writeCuratedCalibrations(butler3, collection=defaultCalibCollection)

        # Define visits (also does nothing if we weren't configured to convert
        # the 'raw' dataset type).
        rootConverter.runDefineVisits(pool=pool)

        # Walk Gen2 repos to find datasets to convert.
        for converter in converters:
            converter.prep()

        # Insert dimensions that are potentially shared by all Gen2
        # repositories (and are hence managed directly by the Task, rather
        # than a converter instance).
        # This also finishes setting up the (shared) converter.subsets object
        # that is used to filter data IDs for config.relatedOnly.
        self.registerUsedSkyMaps(rootConverter.subset)
        self.registerUsedSkyPix(rootConverter.subset)

        # Look for datasets, generally by scanning the filesystem.
        # This requires dimensions to have already been inserted so we can use
        # dimension information to identify related datasets.
        for converter in converters:
            converter.findDatasets()

        # Expand data IDs.
        for converter in converters:
            converter.expandDataIds()

        # Actually ingest datasets.
        for converter in converters:
            converter.ingest()

        # Perform any post-ingest processing.
        for converter in converters:
            converter.finish()

        # Add chained collections for reruns.
        for spec in reruns:
            if spec.chainName is not None:
                self.butler3.registry.registerCollection(spec.chainName, type=CollectionType.CHAINED)
                chain = [spec.runName]
                chain.extend(rerunConverters[spec.runName].getCollectionChain())
                for parent in spec.parents:
                    chain.append(parent)
                    parentConverter = rerunConverters.get(parent)
                    if parentConverter is not None:
                        chain.extend(parentConverter.getCollectionChain())
                chain.extend(rootConverter.getCollectionChain())
                self.log.info("Defining %s from chain %s.", spec.chainName, chain)
                self.butler3.registry.setCollectionChain(spec.chainName, chain)