
# This file is part of obs_base.
#
# Developed for the LSST Data Management System.
# This product includes software developed by the LSST Project
# (https://www.lsst.org).
# See the COPYRIGHT file at the top-level directory of this distribution
# for details of code ownership.
#
# This program is free software: you can redistribute it and/or modify
# it under the terms of the GNU General Public License as published by
# the Free Software Foundation, either version 3 of the License, or
# (at your option) any later version.
#
# This program is distributed in the hope that it will be useful,
# but WITHOUT ANY WARRANTY; without even the implied warranty of
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
# GNU General Public License for more details.
#
# You should have received a copy of the GNU General Public License
# along with this program. If not, see <http://www.gnu.org/licenses/>.
from __future__ import annotations

__all__ = ["ConvertRepoConfig", "ConvertRepoTask", "ConvertRepoSkyMapConfig", "Rerun"]

import os
import fnmatch
from dataclasses import dataclass
from multiprocessing import Pool
from typing import Iterable, Optional, List, Dict

from lsst.daf.butler import (
    Butler as Butler3,
    CollectionType,
    SkyPixDimension
)
from lsst.pex.config import Config, ConfigurableField, ConfigDictField, DictField, ListField, Field
from lsst.pipe.base import Task
from lsst.skymap import skyMapRegistry, BaseSkyMap

from ..ingest import RawIngestTask
from ..defineVisits import DefineVisitsTask
from .repoConverter import ConversionSubset
from .rootRepoConverter import RootRepoConverter
from .calibRepoConverter import CalibRepoConverter
from .standardRepoConverter import StandardRepoConverter
from .._instrument import Instrument



@dataclass
class ConfiguredSkyMap:
    """Struct containing information about a skymap that may appear in a Gen2
    repository.
    """

    name: str
    """Name of the skymap used in Gen3 data IDs.
    """

    sha1: bytes
    """Hash computed by `BaseSkyMap.getSha1`.
    """


    instance: BaseSkyMap
    """The skymap instance itself (`lsst.skymap.BaseSkyMap`).
    """


    used: bool = False
    """Whether this skymap has been found in at least one repository being
    converted.
    """


@dataclass
class Rerun:
    """Specification for a Gen2 processing-output repository to convert.
    """

    path: str
    """Absolute or relative (to the root repository) path to the Gen2
    repository (`str`).
    """

    runName: str
    """Name of the `~lsst.daf.butler.CollectionType.RUN` collection datasets
    will be inserted into (`str`).
    """

    chainName: Optional[str]
    """Name of a `~lsst.daf.butler.CollectionType.CHAINED` collection that will
    combine this repository's datasets with those of its parent repositories
    (`str`, optional).
    """

    parents: List[str]
    """Collection names associated with parent repositories, used to define the
    chained collection (`list` [ `str` ]).

    Ignored if `chainName` is `None`. Runs used in the root repo are
    automatically included.
    """


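# An example `Rerun` specification (the paths and collection names here are
# hypothetical, chosen only to illustrate how the fields relate):
#
#     Rerun(path="rerun/coaddProcessing", runName="shared/coaddProcessing",
#           chainName="shared/coaddProcessing/chain",
#           parents=["shared/singleFrameProcessing"])
#
# The chained collection, if requested, searches the run itself, then the
# named parents, then the collections produced from the root repository.
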

class ConvertRepoSkyMapConfig(Config):
    """Sub-config used to hold the parameters of a SkyMap.

    Notes
    -----
    This config only needs to exist because we can't put a
    `~lsst.pex.config.RegistryField` directly inside a
    `~lsst.pex.config.ConfigDictField`.

    It needs to have its only field named "skyMap" for compatibility with the
    configuration of `lsst.pipe.tasks.MakeSkyMapTask`, which we want so we can
    use one config file in an obs package to configure both.

    This name leads to unfortunate repetition with the "skyMaps" field that
    holds it - "skyMaps[name].skyMap" - but that seems unavoidable.
    """
    skyMap = skyMapRegistry.makeField(
        doc="Type and parameters for the SkyMap itself.",
        default="dodeca",
    )



class ConvertRepoConfig(Config):
    raws = ConfigurableField(
        "Configuration for subtask responsible for ingesting raws and adding "
        "exposure dimension entries.",
        target=RawIngestTask,
    )
    defineVisits = ConfigurableField(
        "Configuration for the subtask responsible for defining visits from "
        "exposures.",
        target=DefineVisitsTask,
    )
    skyMaps = ConfigDictField(
        "Mapping from Gen3 skymap name to the parameters used to construct a "
        "BaseSkyMap instance. This will be used to associate names with "
        "existing skymaps found in the Gen2 repo.",
        keytype=str,
        itemtype=ConvertRepoSkyMapConfig,
        default={}
    )
    rootSkyMapName = Field(
        "Name of a Gen3 skymap (an entry in ``self.skyMaps``) to assume for "
        "datasets in the root repository when no SkyMap is found there.",
        dtype=str,
        optional=True,
        default=None,
    )
    runs = DictField(
        "A mapping from dataset type name to the RUN collection they should "
        "be inserted into. This must include all datasets that can be found "
        "in the root repository; other repositories will use per-repository "
        "runs.",
        keytype=str,
        itemtype=str,
        default={
            "deepCoadd_skyMap": "skymaps",
            "brightObjectMask": "masks",
        }
    )
    storageClasses = DictField(
        "Mapping from dataset type name or Gen2 policy entry (e.g. 'python' "
        "or 'persistable') to the Gen3 StorageClass name.",
        keytype=str,
        itemtype=str,
        default={
            "bias": "ExposureF",
            "dark": "ExposureF",
            "flat": "ExposureF",
            "defects": "Defects",
            "crosstalk": "CrosstalkCalib",
            "BaseSkyMap": "SkyMap",
            "BaseCatalog": "Catalog",
            "BackgroundList": "Background",
            "raw": "Exposure",
            "MultilevelParquetTable": "DataFrame",
            "ParquetTable": "DataFrame",
            "SkyWcs": "Wcs",
        }
    )
    formatterClasses = DictField(
        "Mapping from dataset type name to formatter class. "
        "By default these are derived from the formatters listed in the "
        "Gen3 datastore configuration.",
        keytype=str,
        itemtype=str,
        default={}
    )
    targetHandlerClasses = DictField(
        "Mapping from dataset type name to target handler class.",
        keytype=str,
        itemtype=str,
        default={}
    )
    doRegisterInstrument = Field(
        "If True (default), add dimension records for the Instrument and its "
        "filters and detectors to the registry instead of assuming they are "
        "already present.",
        dtype=bool,
        default=True,
    )
    doWriteCuratedCalibrations = Field(
        "If True (default), ingest human-curated calibrations directly via "
        "the Instrument interface. Note that these calibrations are never "
        "converted from Gen2 repositories.",
        dtype=bool,
        default=True,
    )
    refCats = ListField(
        "The names of reference catalogs (subdirectories under ref_cats) to "
        "be converted.",
        dtype=str,
        default=[]
    )
    fileIgnorePatterns = ListField(
        "Filename globs that should be ignored instead of being treated as "
        "datasets.",
        dtype=str,
        default=["README.txt", "*~?", "butler.yaml", "gen3.sqlite3",
                 "registry.sqlite3", "calibRegistry.sqlite3", "_mapper",
                 "_parent", "repositoryCfg.yaml"]
    )
    rawDatasetType = Field(
        "Gen2 dataset type to use for raw data.",
        dtype=str,
        default="raw",
    )
    datasetIncludePatterns = ListField(
        "Glob-style patterns for dataset type names that should be converted.",
        dtype=str,
        default=["*"]
    )
    datasetIgnorePatterns = ListField(
        "Glob-style patterns for dataset type names that should not be "
        "converted despite matching a pattern in datasetIncludePatterns.",
        dtype=str,
        default=[]
    )
    ccdKey = Field(
        "Key used for the Gen2 equivalent of 'detector' in data IDs.",
        dtype=str,
        default="ccd",
    )

    relatedOnly = Field(
        "If True, only convert datasets that are related to the ingested "
        "visits. Ignored unless a list of visits is passed to run().",
        dtype=bool,
        default=False,
    )


    @property
    def transfer(self):
        return self.raws.transfer

    @transfer.setter
    def transfer(self, value):
        self.raws.transfer = value

    def setDefaults(self):
        self.transfer = None

    # TODO: check that there are no collection overrides for curated
    # calibrations, since we don't have a good way to utilize them.


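# A hypothetical config override snippet for this task (the reference catalog
# name, dataset type, RUN collection, and transfer mode below are illustrative
# only, not defaults):
#
#     config.refCats = ["ps1_pv3_3pi_20170110"]
#     config.runs["deepCoadd"] = "shared/coadds"
#     config.datasetIgnorePatterns.append("*_camera")
#     config.transfer = "symlink"
#
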

class ConvertRepoTask(Task):
    """A task that converts one or more related Gen2 data repositories to a
    single Gen3 data repository (with multiple collections).

    Parameters
    ----------
    config : `ConvertRepoConfig`
        Configuration for this task.
    butler3 : `lsst.daf.butler.Butler`
        A writeable Gen3 Butler instance that represents the data repository
        that datasets will be ingested into. If the 'raw' dataset is
        configured to be included in the conversion, ``butler3.run`` should be
        set to the name of the collection raws should be ingested into, and
        ``butler3.collections`` should include a calibration collection from
        which the ``camera`` dataset can be loaded, unless a calibration repo
        is converted and ``doWriteCuratedCalibrations`` is `True`.
    instrument : `lsst.obs.base.Instrument`
        The Gen3 instrument that should be used for this conversion.
    **kwargs
        Other keyword arguments are forwarded to the `Task` constructor.

    Notes
    -----
    Most of the work of converting repositories is delegated to instances of
    the `RepoConverter` hierarchy. The `ConvertRepoTask` instance itself holds
    only state that is relevant for all Gen2 repositories being ingested, while
    each `RepoConverter` instance holds only state relevant for the conversion
    of a single Gen2 repository. Both the task and the `RepoConverter`
    instances are single use; `ConvertRepoTask.run` and most `RepoConverter`
    methods may only be called once on a particular instance.
    """


    ConfigClass = ConvertRepoConfig

    _DefaultName = "convertRepo"

    def __init__(self, config=None, *, butler3: Butler3, instrument: Instrument, **kwargs):
        config.validate()  # Not a CmdlineTask nor PipelineTask, so have to validate the config here.
        super().__init__(config, **kwargs)
        self.butler3 = butler3
        self.registry = self.butler3.registry
        self.universe = self.registry.dimensions
        if self.isDatasetTypeIncluded("raw"):
            self.makeSubtask("raws", butler=butler3)
            self.makeSubtask("defineVisits", butler=butler3)
        else:
            self.raws = None
            self.defineVisits = None
        self.instrument = instrument
        self._configuredSkyMapsBySha1 = {}
        self._configuredSkyMapsByName = {}
        for name, config in self.config.skyMaps.items():
            instance = config.skyMap.apply()
            self._populateSkyMapDicts(name, instance)
        self._usedSkyPix = set()
        self.translatorFactory = self.instrument.makeDataIdTranslatorFactory()
        self.translatorFactory.log = self.log.getChild("translators")


    def _reduce_kwargs(self):
        # Add extra parameters to pickle
        return dict(**super()._reduce_kwargs(), butler3=self.butler3, instrument=self.instrument)

    def _populateSkyMapDicts(self, name, instance):
        struct = ConfiguredSkyMap(name=name, sha1=instance.getSha1(), instance=instance)
        self._configuredSkyMapsBySha1[struct.sha1] = struct
        self._configuredSkyMapsByName[struct.name] = struct


    def isDatasetTypeIncluded(self, datasetTypeName: str):
        """Return `True` if configuration indicates that the given dataset type
        should be converted.

        This method is intended to be called primarily by the
        `RepoConverter` instances used internally by the task.

        Parameters
        ----------
        datasetTypeName : `str`
            Name of the dataset type.

        Returns
        -------
        included : `bool`
            Whether the dataset should be included in the conversion.
        """

        return (
            any(fnmatch.fnmatchcase(datasetTypeName, pattern)
                for pattern in self.config.datasetIncludePatterns)
            and not any(fnmatch.fnmatchcase(datasetTypeName, pattern)
                        for pattern in self.config.datasetIgnorePatterns)
        )


    def useSkyMap(self, skyMap: BaseSkyMap, skyMapName: str) -> str:
        """Indicate that a repository uses the given SkyMap.

        This method is intended to be called primarily by the
        `RepoConverter` instances used internally by the task.

        Parameters
        ----------
        skyMap : `lsst.skymap.BaseSkyMap`
            SkyMap instance being used, typically retrieved from a Gen2
            data repository.
        skyMapName : `str`
            The name of the Gen2 skymap, for error reporting.

        Returns
        -------
        name : `str`
            The name of the skymap in Gen3 data IDs.

        Raises
        ------
        LookupError
            Raised if the specified skymap cannot be found.
        """

        sha1 = skyMap.getSha1()
        if sha1 not in self._configuredSkyMapsBySha1:
            self._populateSkyMapDicts(skyMapName, skyMap)
        try:
            struct = self._configuredSkyMapsBySha1[sha1]
        except KeyError as err:
            msg = f"SkyMap '{skyMapName}' with sha1={sha1} not included in configuration."
            raise LookupError(msg) from err
        struct.used = True
        return struct.name


    def registerUsedSkyMaps(self, subset: Optional[ConversionSubset]):
        """Register all skymaps that have been marked as used.

        This method is intended to be called primarily by the
        `RepoConverter` instances used internally by the task.

        Parameters
        ----------
        subset : `ConversionSubset`, optional
            Object that will be used to filter converted datasets by data ID.
            If given, it will be updated with the tracts of this skymap that
            overlap the visits in the subset.
        """
        for struct in self._configuredSkyMapsBySha1.values():
            if struct.used:
                struct.instance.register(struct.name, self.registry)
                if subset is not None and self.config.relatedOnly:
                    subset.addSkyMap(self.registry, struct.name)


    def useSkyPix(self, dimension: SkyPixDimension):
        """Indicate that a repository uses the given SkyPix dimension.

        This method is intended to be called primarily by the
        `RepoConverter` instances used internally by the task.

        Parameters
        ----------
        dimension : `lsst.daf.butler.SkyPixDimension`
            Dimension representing a pixelization of the sky.
        """
        self._usedSkyPix.add(dimension)


    def registerUsedSkyPix(self, subset: Optional[ConversionSubset]):
        """Register all SkyPix dimensions that have been marked as used.

        This method is intended to be called primarily by the
        `RepoConverter` instances used internally by the task.

        Parameters
        ----------
        subset : `ConversionSubset`, optional
            Object that will be used to filter converted datasets by data ID.
            If given, it will be updated with the pixelization IDs that
            overlap the visits in the subset.
        """
        if subset is not None and self.config.relatedOnly:
            for dimension in self._usedSkyPix:
                subset.addSkyPix(self.registry, dimension)


    def run(self, root: str, *,
            calibs: Optional[Dict[str, str]] = None,
            reruns: List[Rerun],
            visits: Optional[Iterable[int]] = None,
            pool: Optional[Pool] = None,
            processes: int = 1):
        """Convert a group of related data repositories.

        Parameters
        ----------
        root : `str`
            Complete path to the root Gen2 data repository. This should be
            a data repository that includes a Gen2 registry and any raw files
            and/or reference catalogs.
        calibs : `dict`, optional
            Dictionary mapping calibration repository path to the
            `~lsst.daf.butler.CollectionType.CALIBRATION` collection that
            converted datasets within it should be certified into.
        reruns : `list` of `Rerun`
            Specifications for rerun (processing output) collections to
            convert.
        visits : iterable of `int`, optional
            The integer IDs of visits to convert. If not provided, all visits
            in the Gen2 root repository will be converted.
        pool : `multiprocessing.Pool`, optional
            If not `None`, a process pool with which to parallelize some
            operations.
        processes : `int`, optional
            The number of processes to use for conversion.
        """

        if pool is None and processes > 1:
            pool = Pool(processes)
        if calibs is None:
            calibs = {}
        if visits is not None:
            subset = ConversionSubset(instrument=self.instrument.getName(), visits=frozenset(visits))
        else:
            if self.config.relatedOnly:
                self.log.warn("config.relatedOnly is True but all visits are being ingested; "
                              "no filtering will be done.")
            subset = None

        # Make converters for all Gen2 repos.
        converters = []
        rootConverter = RootRepoConverter(task=self, root=root, subset=subset, instrument=self.instrument)
        converters.append(rootConverter)
        for calibRoot, collection in calibs.items():
            if not os.path.isabs(calibRoot):
                calibRoot = os.path.join(rootConverter.root, calibRoot)
            converter = CalibRepoConverter(task=self, root=calibRoot, collection=collection,
                                           instrument=self.instrument,
                                           mapper=rootConverter.mapper,
                                           subset=rootConverter.subset)
            converters.append(converter)
        rerunConverters = {}
        for spec in reruns:
            runRoot = spec.path
            if not os.path.isabs(runRoot):
                runRoot = os.path.join(rootConverter.root, runRoot)
            converter = StandardRepoConverter(task=self, root=runRoot, run=spec.runName,
                                              instrument=self.instrument, subset=rootConverter.subset)
            converters.append(converter)
            rerunConverters[spec.runName] = converter

        # Register the instrument if we're configured to do so.
        if self.config.doRegisterInstrument:
            self.instrument.register(self.registry)

        # Run raw ingest (does nothing if we weren't configured to convert the
        # 'raw' dataset type).
        rootConverter.runRawIngest(pool=pool)


        # Write curated calibrations to all calibration runs and
        # also to the default collection.
        # Add new collections to the list of collections the butler was
        # initialized with to pass to DefineVisitsTask, to deal with the
        # (likely) case that the only 'camera' dataset in the repo will be one
        # we're adding here.
        if self.config.doWriteCuratedCalibrations:
            butler3 = Butler3(butler=self.butler3)
            # Write curated calibrations to any new calibration collections we
            # created by converting a Gen2 calibration repo.
            calibCollections = set()
            for collection in calibs.values():
                self.instrument.writeCuratedCalibrations(butler3, collection=collection)
                calibCollections.add(collection)
            # Ensure that we have the curated calibrations even if there
            # is no calibration conversion. It's possible that the default
            # calib collection will have been specified (in fact the
            # butler convert script enforces that behavior for now), so
            # we check for the default situation.
            # Assume we know the default rather than letting
            # writeCuratedCalibrations default itself.
            defaultCalibCollection = self.instrument.makeCollectionName("calib")
            if defaultCalibCollection not in calibCollections:
                self.instrument.writeCuratedCalibrations(butler3, collection=defaultCalibCollection)

        # Define visits (also does nothing if we weren't configured to convert
        # the 'raw' dataset type).
        rootConverter.runDefineVisits(pool=pool)


        # Walk Gen2 repos to find datasets to convert.
        for converter in converters:
            converter.prep()


        # Insert dimensions that are potentially shared by all Gen2
        # repositories (and are hence managed directly by the Task, rather
        # than a converter instance).
        # This also finishes setting up the (shared) converter.subset object
        # that is used to filter data IDs for config.relatedOnly.
        self.registerUsedSkyMaps(rootConverter.subset)
        self.registerUsedSkyPix(rootConverter.subset)

        # Look for datasets, generally by scanning the filesystem.
        # This requires dimensions to have already been inserted so we can use
        # dimension information to identify related datasets.
        for converter in converters:
            converter.findDatasets()

        # Expand data IDs.
        for converter in converters:
            converter.expandDataIds()

        # Actually ingest datasets.
        for converter in converters:
            converter.ingest()

        # Perform any post-ingest processing.
        for converter in converters:
            converter.finish()

        # Add chained collections for reruns.
        for spec in reruns:
            if spec.chainName is not None:
                self.butler3.registry.registerCollection(spec.chainName, type=CollectionType.CHAINED)
                chain = [spec.runName]
                chain.extend(rerunConverters[spec.runName].getCollectionChain())

                for parent in spec.parents:
                    chain.append(parent)
                    parentConverter = rerunConverters.get(parent)
                    if parentConverter is not None:
                        chain.extend(parentConverter.getCollectionChain())
                chain.extend(rootConverter.getCollectionChain())
                self.log.info("Defining %s from chain %s.", spec.chainName, chain)
                self.butler3.registry.setCollectionChain(spec.chainName, chain)
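

# A minimal driver sketch for ConvertRepoTask (the paths, collection names,
# and instrument choice are hypothetical; in practice the ``butler convert``
# script mentioned above drives this class):
#
#     from lsst.daf.butler import Butler
#     from lsst.obs.subaru import HyperSuprimeCam  # any concrete Instrument
#
#     instrument = HyperSuprimeCam()
#     butler3 = Butler("/path/to/gen3/repo", run="HSC/raw/all", writeable=True)
#     task = ConvertRepoTask(config=ConvertRepoConfig(), butler3=butler3,
#                            instrument=instrument)
#     task.run(root="/path/to/gen2/repo",
#              calibs={"CALIB": "HSC/calib"},
#              reruns=[],
#              processes=4)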