Hide keyboard shortcuts

Hot-keys on this page

r m x p   toggle line displays

j k   next/prev highlighted chunk

0   (zero) top of page

1   (one) first highlighted chunk

1# This file is part of obs_base. 

2# 

3# Developed for the LSST Data Management System. 

4# This product includes software developed by the LSST Project 

5# (https://www.lsst.org). 

6# See the COPYRIGHT file at the top-level directory of this distribution 

7# for details of code ownership. 

8# 

9# This program is free software: you can redistribute it and/or modify 

10# it under the terms of the GNU General Public License as published by 

11# the Free Software Foundation, either version 3 of the License, or 

12# (at your option) any later version. 

13# 

14# This program is distributed in the hope that it will be useful, 

15# but WITHOUT ANY WARRANTY; without even the implied warranty of 

16# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the 

17# GNU General Public License for more details. 

18# 

19# You should have received a copy of the GNU General Public License 

20# along with this program. If not, see <http://www.gnu.org/licenses/>. 

21from __future__ import annotations 

22 

23__all__ = ["ConvertRepoConfig", "ConvertRepoTask", "ConvertRepoSkyMapConfig"] 

24 

25import os 

26import fnmatch 

27from dataclasses import dataclass 

28from typing import Iterable, Optional, List, Dict 

29 

30from lsst.utils import doImport 

31from lsst.daf.butler import ( 

32 Butler as Butler3, 

33 SkyPixDimension 

34) 

35from lsst.pex.config import Config, ConfigurableField, ConfigDictField, DictField, ListField, Field 

36from lsst.pipe.base import Task 

37from lsst.skymap import skyMapRegistry, BaseSkyMap 

38 

39from ..ingest import RawIngestTask 

40from .repoConverter import ConversionSubset 

41from .rootRepoConverter import RootRepoConverter 

42from .calibRepoConverter import CalibRepoConverter 

43from .standardRepoConverter import StandardRepoConverter 

44 

45 

@dataclass
class ConfiguredSkyMap:
    """Struct containing information about a skymap that may appear in a Gen2
    repository.
    """

    name: str
    """Name of the skymap used in Gen3 data IDs.
    """

    sha1: bytes
    """Hash computed by `BaseSkyMap.getSha1`.
    """

    instance: BaseSkyMap
    """The `~lsst.skymap.BaseSkyMap` instance this struct describes.
    """

    used: bool = False
    """Whether this skymap has been found in at least one repository being
    converted.
    """

68 

69 

class ConvertRepoSkyMapConfig(Config):
    """Sub-config used to hold the parameters of a SkyMap.

    Notes
    -----
    This config only needs to exist because we can't put a
    `~lsst.pex.config.RegistryField` directly inside a
    `~lsst.pex.config.ConfigDictField`.

    It needs to have its only field named "skyMap" for compatibility with the
    configuration of `lsst.pipe.tasks.MakeSkyMapTask`, which we want so we can
    use one config file in an obs package to configure both.

    This name leads to unfortunate repetition with the field named
    "skyMaps" that holds it - "skyMaps[name].skyMap" - but that seems
    unavoidable.
    """
    skyMap = skyMapRegistry.makeField(
        doc="Type and parameters for the SkyMap itself.",
        default="dodeca",
    )

91 

92 

class ConvertRepoConfig(Config):
    """Configuration for `ConvertRepoTask`.

    Notes
    -----
    The ``transfer`` and ``instrument`` properties are not independent
    fields; they read from and write to the corresponding fields of the
    ``raws`` subtask configuration.
    """

    raws = ConfigurableField(
        "Configuration for subtask responsible for ingesting raws and adding "
        "visit and exposure dimension entries.",
        target=RawIngestTask,
    )
    skyMaps = ConfigDictField(
        "Mapping from Gen3 skymap name to the parameters used to construct a "
        "BaseSkyMap instance. This will be used to associate names with "
        "existing skymaps found in the Gen2 repo.",
        keytype=str,
        itemtype=ConvertRepoSkyMapConfig,
        default={}
    )
    rootSkyMapName = Field(
        "Name of a Gen3 skymap (an entry in ``self.skyMaps``) to assume for "
        "datasets in the root repository when no SkyMap is found there. ",
        dtype=str,
        optional=True,
        default=None,
    )
    collections = DictField(
        "Special collections (values) for certain dataset types (keys). "
        "These are used in addition to rerun collections for datasets in "
        "reruns. The 'raw' dataset must have an entry here if it is to be "
        "converted.",
        keytype=str,
        itemtype=str,
        default={
            "deepCoadd_skyMap": "skymaps",
            "brightObjectMask": "masks",
        }
    )
    storageClasses = DictField(
        "Mapping from dataset type name or Gen2 policy entry (e.g. 'python' "
        "or 'persistable') to the Gen3 StorageClass name.",
        keytype=str,
        itemtype=str,
        default={
            "bias": "ExposureF",
            "dark": "ExposureF",
            "flat": "ExposureF",
            "defects": "Defects",
            "BaseSkyMap": "SkyMap",
            "BaseCatalog": "Catalog",
            "BackgroundList": "Background",
            "raw": "Exposure",
            "MultilevelParquetTable": "DataFrame",
            "ParquetTable": "DataFrame",
            "SkyWcs": "Wcs",
        }
    )
    formatterClasses = DictField(
        "Mapping from dataset type name to formatter class. "
        "By default these are derived from the formatters listed in the"
        " Gen3 datastore configuration.",
        keytype=str,
        itemtype=str,
        default={}
    )
    targetHandlerClasses = DictField(
        "Mapping from dataset type name to target handler class.",
        keytype=str,
        itemtype=str,
        default={}
    )
    doRegisterInstrument = Field(
        "If True (default), add dimension records for the Instrument and its "
        "filters and detectors to the registry instead of assuming they are "
        "already present.",
        dtype=bool,
        default=True,
    )
    doWriteCuratedCalibrations = Field(
        "If True (default), ingest human-curated calibrations directly via "
        "the Instrument interface. Note that these calibrations are never "
        "converted from Gen2 repositories.",
        dtype=bool,
        default=True,
    )
    refCats = ListField(
        "The names of reference catalogs (subdirectories under ref_cats) to "
        "be converted",
        dtype=str,
        default=[]
    )
    # NOTE(review): if these patterns are matched with fnmatch-style globbing,
    # "*~?" requires exactly one character after "~" and so would not match
    # editor backup files such as "foo~" - confirm whether "*~" was intended.
    fileIgnorePatterns = ListField(
        "Filename globs that should be ignored instead of being treated as "
        "datasets.",
        dtype=str,
        default=["README.txt", "*~?", "butler.yaml", "gen3.sqlite3",
                 "registry.sqlite3", "calibRegistry.sqlite3", "_mapper",
                 "_parent", "repositoryCfg.yaml"]
    )
    datasetIncludePatterns = ListField(
        "Glob-style patterns for dataset type names that should be converted.",
        dtype=str,
        default=["*"]
    )
    datasetIgnorePatterns = ListField(
        "Glob-style patterns for dataset type names that should not be "
        "converted despite matching a pattern in datasetIncludePatterns.",
        dtype=str,
        default=[]
    )
    ccdKey = Field(
        "Key used for the Gen2 equivalent of 'detector' in data IDs.",
        dtype=str,
        default="ccd",
    )
    # The documentation previously claimed True was the default; the actual
    # default has always been False, so the text (not the value) is corrected.
    relatedOnly = Field(
        "If True, only convert datasets that are related to the "
        "ingested visits (default is False). Ignored unless a list of "
        "visits is passed to run().",
        dtype=bool,
        default=False,
    )
    curatedCalibrations = ListField(
        "Dataset types that are handled by `Instrument.writeCuratedCalibrations()` "
        "and thus should not be converted using the standard calibration "
        "conversion system.",
        dtype=str,
        default=["camera",
                 "transmission_sensor",
                 "transmission_filter",
                 "transmission_optics",
                 "transmission_atmosphere",
                 "bfKernel"]
    )

    @property
    def transfer(self):
        """Transfer setting, forwarded from ``self.raws.transfer``."""
        return self.raws.transfer

    @transfer.setter
    def transfer(self, value):
        self.raws.transfer = value

    @property
    def instrument(self):
        """Instrument setting, forwarded from ``self.raws.instrument``."""
        return self.raws.instrument

    @instrument.setter
    def instrument(self, value):
        self.raws.instrument = value

    def setDefaults(self):
        """Override defaults: set ``transfer`` (i.e. ``raws.transfer``) to
        `None`.
        """
        self.transfer = None

    # TODO: check that there are no collection overrides for curated
    # calibrations, since we don't have a good way to utilize them.

244 

245 

class ConvertRepoTask(Task):
    """A task that converts one or more related Gen2 data repositories to a
    single Gen3 data repository (with multiple collections).

    Parameters
    ----------
    config: `ConvertRepoConfig`, optional
        Configuration for this task.  If `None`, a default-constructed
        ``ConfigClass`` instance is used (and validated).
    butler3: `lsst.daf.butler.Butler`
        Gen3 Butler instance that represents the data repository datasets will
        be ingested into.  The collection and/or run associated with this
        Butler will be ignored in favor of collections/runs passed via config
        or to `run`.
    kwds
        Other keyword arguments are forwarded to the `Task` constructor.

    Notes
    -----
    Most of the work of converting repositories is delegated to instances of
    the `RepoConverter` hierarchy.  The `ConvertRepoTask` instance itself holds
    only state that is relevant for all Gen2 repositories being ingested, while
    each `RepoConverter` instance holds only state relevant for the conversion
    of a single Gen2 repository.  Both the task and the `RepoConverter`
    instances are single use; `ConvertRepoTask.run` and most `RepoConverter`
    methods may only be called once on a particular instance.
    """

    ConfigClass = ConvertRepoConfig

    _DefaultName = "convertRepo"

    def __init__(self, config=None, *, butler3: Butler3, **kwds):
        if config is None:
            # The signature advertises ``config=None``; build the default
            # config here so the validation below does not fail with an
            # AttributeError on `None`.
            config = self.ConfigClass()
        config.validate()  # Not a CmdlineTask nor PipelineTask, so have to validate the config here.
        super().__init__(config, **kwds)
        self.butler3 = butler3
        self.registry = self.butler3.registry
        self.universe = self.registry.dimensions
        if self.isDatasetTypeIncluded("raw"):
            # The raw-ingest subtask provides the instrument instance.
            self.makeSubtask("raws", butler=butler3)
            self.instrument = self.raws.instrument
        else:
            # No raws will be converted, so construct the instrument directly
            # from its configured (importable) name.
            self.raws = None
            self.instrument = doImport(self.config.instrument)()
        self._configuredSkyMapsBySha1 = {}
        self._configuredSkyMapsByName = {}
        # Use a distinct loop variable so the ``config`` argument is not
        # shadowed.
        for name, skyMapConfig in self.config.skyMaps.items():
            instance = skyMapConfig.skyMap.apply()
            self._populateSkyMapDicts(name, instance)
        self._usedSkyPix = set()

    def _populateSkyMapDicts(self, name, instance):
        """Record a skymap in both the by-hash and by-name lookup tables.

        Parameters
        ----------
        name : `str`
            Name to associate with the skymap.
        instance : `lsst.skymap.BaseSkyMap`
            SkyMap instance; its ``getSha1()`` result is used as the hash key.

        Notes
        -----
        An existing entry with the same hash or the same name is replaced.
        """
        struct = ConfiguredSkyMap(name=name, sha1=instance.getSha1(), instance=instance)
        self._configuredSkyMapsBySha1[struct.sha1] = struct
        self._configuredSkyMapsByName[struct.name] = struct

    def isDatasetTypeIncluded(self, datasetTypeName: str) -> bool:
        """Return `True` if configuration indicates that the given dataset type
        should be converted.

        This method is intended to be called primarily by the
        `RepoConverter` instances used internally by the task.

        Parameters
        ----------
        datasetTypeName: str
            Name of the dataset type.

        Returns
        -------
        included : `bool`
            Whether the dataset should be included in the conversion: `True`
            if the name matches (case-sensitively) at least one pattern in
            ``config.datasetIncludePatterns`` and no pattern in
            ``config.datasetIgnorePatterns``.
        """
        return (
            any(fnmatch.fnmatchcase(datasetTypeName, pattern)
                for pattern in self.config.datasetIncludePatterns)
            and not any(fnmatch.fnmatchcase(datasetTypeName, pattern)
                        for pattern in self.config.datasetIgnorePatterns)
        )

    def useSkyMap(self, skyMap: BaseSkyMap, skyMapName: str) -> str:
        """Indicate that a repository uses the given SkyMap.

        This method is intended to be called primarily by the
        `RepoConverter` instances used internally by the task.

        Parameters
        ----------
        skyMap : `lsst.skymap.BaseSkyMap`
            SkyMap instance being used, typically retrieved from a Gen2
            data repository.
        skyMapName : `str`
            The name of the gen2 skymap.  Used for error reporting, and as
            the Gen3 name if no already-known skymap has the same hash.

        Returns
        -------
        name : `str`
            The name of the skymap in Gen3 data IDs.

        Raises
        ------
        LookupError
            Raised if the skymap is still not found by hash after being
            added under ``skyMapName``; this can only happen if successive
            ``skyMap.getSha1()`` calls return different values.
        """
        sha1 = skyMap.getSha1()
        if sha1 not in self._configuredSkyMapsBySha1:
            # Skymap was not listed in the configuration; add it on the fly
            # under its Gen2 name.
            self._populateSkyMapDicts(skyMapName, skyMap)
        try:
            struct = self._configuredSkyMapsBySha1[sha1]
        except KeyError as err:
            msg = f"SkyMap '{skyMapName}' with sha1={sha1} not included in configuration."
            raise LookupError(msg) from err
        struct.used = True
        return struct.name

    def registerUsedSkyMaps(self, subset: Optional[ConversionSubset]):
        """Register all skymaps that have been marked as used.

        This method is intended to be called primarily by the
        `RepoConverter` instances used internally by the task.

        Parameters
        ----------
        subset : `ConversionSubset`, optional
            Object that will be used to filter converted datasets by data ID.
            If given (and ``config.relatedOnly`` is `True`), it will be
            updated with the tracts of this skymap that overlap the visits in
            the subset.
        """
        for struct in self._configuredSkyMapsBySha1.values():
            if struct.used:
                struct.instance.register(struct.name, self.registry)
                if subset is not None and self.config.relatedOnly:
                    subset.addSkyMap(self.registry, struct.name)

    def useSkyPix(self, dimension: SkyPixDimension):
        """Indicate that a repository uses the given SkyPix dimension.

        This method is intended to be called primarily by the
        `RepoConverter` instances used internally by the task.

        Parameters
        ----------
        dimension : `lsst.daf.butler.SkyPixDimension`
            Dimension representing a pixelization of the sky.
        """
        self._usedSkyPix.add(dimension)

    def registerUsedSkyPix(self, subset: Optional[ConversionSubset]):
        """Add all SkyPix dimensions that have been marked as used to the
        given conversion subset.

        This method is intended to be called primarily by the
        `RepoConverter` instances used internally by the task.

        Parameters
        ----------
        subset : `ConversionSubset`, optional
            Object that will be used to filter converted datasets by data ID.
            If given (and ``config.relatedOnly`` is `True`), it will be
            updated with the pixelization IDs that overlap the visits in the
            subset.  Otherwise this method does nothing.
        """
        if subset is not None and self.config.relatedOnly:
            for dimension in self._usedSkyPix:
                subset.addSkyPix(self.registry, dimension)

    def run(self, root: str, collections: List[str], *,
            calibs: Optional[Dict[str, List[str]]] = None,
            reruns: Optional[Dict[str, List[str]]] = None,
            visits: Optional[Iterable[int]] = None):
        """Convert a group of related data repositories.

        Parameters
        ----------
        root : `str`
            Complete path to the root Gen2 data repository.  This should be
            a data repository that includes a Gen2 registry and any raw files
            and/or reference catalogs.
        collections : `list` of `str`
            Gen3 collections that datasets from the root repository should be
            associated with.  This should include any rerun collection that
            these datasets should also be considered to be part of; because of
            structural difference between Gen2 parent/child relationships and
            Gen3 collections, these cannot be reliably inferred.
        calibs : `dict`, optional
            Dictionary mapping calibration repository path to the collections
            that the repository's datasets should be associated with.  The
            path may be relative to ``root`` or absolute.  Collections should
            include child repository collections as appropriate (see
            documentation for ``collections``).  `None` (default) is treated
            as an empty dictionary.
        reruns : `dict`, optional
            Dictionary mapping rerun repository path to the collections that
            the repository's datasets should be associated with.  The path may
            be relative to ``root`` or absolute.  Collections should include
            child repository collections as appropriate (see documentation for
            ``collections``).  `None` (default) is treated as an empty
            dictionary.
        visits : iterable of `int`, optional
            The integer IDs of visits to convert.  If not provided, all visits
            in the Gen2 root repository will be converted.
        """

        if calibs is None:
            calibs = {}
        if reruns is None:
            reruns = {}
        if visits is not None:
            subset = ConversionSubset(instrument=self.instrument.getName(), visits=frozenset(visits))
        else:
            if self.config.relatedOnly:
                self.log.warn("config.relatedOnly is True but all visits are being ingested; "
                              "no filtering will be done.")
            subset = None

        # We can't wrap database writes sanely in transactions (yet) because we
        # keep initializing new Butler instances just so we can write into new
        # runs/collections, and transactions are managed at the Butler level.
        # DM-21246 should let us fix this, assuming we actually want to keep
        # the transaction open that long.
        if self.config.doRegisterInstrument:
            self.instrument.register(self.registry)

        # Make and prep converters for all Gen2 repos.  This should not modify
        # the Registry database or filesystem at all, though it may query it.
        # The prep() calls here will be some of the slowest ones, because
        # that's when we walk the filesystem.
        converters = []
        rootConverter = RootRepoConverter(task=self, root=root, collections=collections, subset=subset)
        rootConverter.prep()
        converters.append(rootConverter)

        # Loop variables are deliberately named differently from the ``root``
        # and ``collections`` arguments so those are not rebound.
        for calibRoot, calibCollections in calibs.items():
            if not os.path.isabs(calibRoot):
                calibRoot = os.path.join(rootConverter.root, calibRoot)
            converter = CalibRepoConverter(task=self, root=calibRoot, collections=calibCollections,
                                           mapper=rootConverter.mapper,
                                           subset=rootConverter.subset)
            converter.prep()
            converters.append(converter)

        for rerunRoot, rerunCollections in reruns.items():
            if not os.path.isabs(rerunRoot):
                rerunRoot = os.path.join(rootConverter.root, rerunRoot)
            converter = StandardRepoConverter(task=self, root=rerunRoot, collections=rerunCollections,
                                              subset=rootConverter.subset)
            converter.prep()
            converters.append(converter)

        # Database writes by the converters start here; as noted above, they
        # are not wrapped in a transaction.

        # Insert dimensions needed by any converters.  These are only the
        # dimensions that a converter expects to be uniquely derived from the
        # Gen2 repository it is responsible for - e.g. visits, exposures, and
        # calibration_labels.
        #
        # Note that we do not try to filter dimensions down to just those
        # related to the given visits, even if config.relatedOnly is True; we
        # need them in the Gen3 repo in order to be able to know which datasets
        # to convert, because Gen2 alone doesn't know enough about the
        # relationships between data IDs.
        for converter in converters:
            converter.insertDimensionData()

        # Insert dimensions that are potentially shared by all Gen2
        # repositories (and are hence managed directly by the Task, rather
        # than a converter instance).
        # This also finishes setting up the (shared) ConversionSubset object
        # that is used to filter data IDs for config.relatedOnly.
        self.registerUsedSkyMaps(rootConverter.subset)
        self.registerUsedSkyPix(rootConverter.subset)

        # Look for datasets, generally by scanning the filesystem.
        # This requires dimensions to have already been inserted so we can use
        # dimension information to identify related datasets.
        for converter in converters:
            converter.findDatasets()

        # Expand data IDs.
        for converter in converters:
            converter.expandDataIds()

        # Actually ingest datasets.
        for converter in converters:
            converter.ingest()