# This file is part of obs_base.
#
# Developed for the LSST Data Management System.
# This product includes software developed by the LSST Project
# (https://www.lsst.org).
# See the COPYRIGHT file at the top-level directory of this distribution
# for details of code ownership.
#
# This program is free software: you can redistribute it and/or modify
# it under the terms of the GNU General Public License as published by
# the Free Software Foundation, either version 3 of the License, or
# (at your option) any later version.
#
# This program is distributed in the hope that it will be useful,
# but WITHOUT ANY WARRANTY; without even the implied warranty of
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
# GNU General Public License for more details.
#
# You should have received a copy of the GNU General Public License
# along with this program. If not, see <http://www.gnu.org/licenses/>.

from __future__ import annotations

__all__ = ["ConvertRepoConfig", "ConvertRepoTask", "ConvertRepoSkyMapConfig"]

import os
import fnmatch
from dataclasses import dataclass
from typing import Iterable, Optional, List, Dict

from lsst.utils import doImport
from lsst.daf.butler import (
    Butler as Butler3,
    SkyPixDimension
)
from lsst.pex.config import Config, ConfigurableField, ConfigDictField, DictField, ListField, Field
from lsst.pipe.base import Task
from lsst.skymap import skyMapRegistry, BaseSkyMap

from ..ingest import RawIngestTask
from .repoConverter import ConversionSubset
from .rootRepoConverter import RootRepoConverter
from .calibRepoConverter import CalibRepoConverter
from .standardRepoConverter import StandardRepoConverter

@dataclass
class ConfiguredSkyMap:
    """Struct containing information about a skymap that may appear in a Gen2
    repository.
    """

    name: str
    """Name of the skymap used in Gen3 data IDs.
    """

    sha1: bytes
    """Hash computed by `BaseSkyMap.getSha1`.
    """

    instance: BaseSkyMap
    """The skymap instance itself.
    """

    used: bool = False
    """Whether this skymap has been found in at least one repository being
    converted.
    """


class ConvertRepoSkyMapConfig(Config):
    """Sub-config used to hold the parameters of a SkyMap.

    Notes
    -----
    This config only needs to exist because we can't put a
    `~lsst.pex.config.RegistryField` directly inside a
    `~lsst.pex.config.ConfigDictField`.

    It needs to have its only field named "skyMap" for compatibility with the
    configuration of `lsst.pipe.tasks.MakeSkyMapTask`, which we want so we can
    use one config file in an obs package to configure both.

    This name leads to unfortunate repetition with the field named
    "skyMaps" that holds it - "skyMaps[name].skyMap" - but that seems
    unavoidable.
    """
    skyMap = skyMapRegistry.makeField(
        doc="Type and parameters for the SkyMap itself.",
        default="dodeca",
    )

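# A hedged illustration (not part of the original module) of the
# "skyMaps[name].skyMap" nesting described above: in a config override for
# ConvertRepoConfig (defined below), a named skymap might be registered
# roughly like this.  The name "discrete_example" and the "discrete" skymap
# parameters are illustrative assumptions, not defaults from this file.
#
#     config.skyMaps["discrete_example"] = ConvertRepoSkyMapConfig()
#     config.skyMaps["discrete_example"].skyMap.name = "discrete"
#     config.skyMaps["discrete_example"].skyMap["discrete"].raList = [150.0]
#     config.skyMaps["discrete_example"].skyMap["discrete"].decList = [2.2]
#     config.skyMaps["discrete_example"].skyMap["discrete"].radiusList = [0.5]
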

class ConvertRepoConfig(Config):
    raws = ConfigurableField(
        "Configuration for subtask responsible for ingesting raws and adding "
        "visit and exposure dimension entries.",
        target=RawIngestTask,
    )
    skyMaps = ConfigDictField(
        "Mapping from Gen3 skymap name to the parameters used to construct a "
        "BaseSkyMap instance. This will be used to associate names with "
        "existing skymaps found in the Gen2 repo.",
        keytype=str,
        itemtype=ConvertRepoSkyMapConfig,
        default={}
    )
    rootSkyMapName = Field(
        "Name of a Gen3 skymap (an entry in ``self.skyMaps``) to assume for "
        "datasets in the root repository when no SkyMap is found there.",
        dtype=str,
        optional=True,
        default=None,
    )
    collections = DictField(
        "Special collections (values) for certain dataset types (keys). "
        "These are used in addition to rerun collections for datasets in "
        "reruns. The 'raw' dataset must have an entry here if it is to be "
        "converted.",
        keytype=str,
        itemtype=str,
        default={
            "deepCoadd_skyMap": "skymaps",
            "brightObjectMask": "masks",
        }
    )
    storageClasses = DictField(
        "Mapping from dataset type name or Gen2 policy entry (e.g. 'python' "
        "or 'persistable') to the Gen3 StorageClass name.",
        keytype=str,
        itemtype=str,
        default={
            "BaseSkyMap": "SkyMap",
            "BaseCatalog": "Catalog",
            "BackgroundList": "Background",
            "raw": "Exposure",
            "MultilevelParquetTable": "DataFrame",
            "ParquetTable": "DataFrame",
            "SkyWcs": "Wcs",
        }
    )
    doRegisterInstrument = Field(
        "If True (default), add dimension records for the Instrument and its "
        "filters and detectors to the registry instead of assuming they are "
        "already present.",
        dtype=bool,
        default=True,
    )
    doWriteCuratedCalibrations = Field(
        "If True (default), ingest human-curated calibrations directly via "
        "the Instrument interface. Note that these calibrations are never "
        "converted from Gen2 repositories.",
        dtype=bool,
        default=True,
    )
    refCats = ListField(
        "The names of reference catalogs (subdirectories under ref_cats) to "
        "be converted.",
        dtype=str,
        default=[]
    )
    fileIgnorePatterns = ListField(
        "Filename globs that should be ignored instead of being treated as "
        "datasets.",
        dtype=str,
        default=["README.txt", "*~?", "butler.yaml", "gen3.sqlite3",
                 "registry.sqlite3", "calibRegistry.sqlite3", "_mapper",
                 "_parent", "repositoryCfg.yaml"]
    )
    datasetIncludePatterns = ListField(
        "Glob-style patterns for dataset type names that should be converted.",
        dtype=str,
        default=["*"]
    )
    datasetIgnorePatterns = ListField(
        "Glob-style patterns for dataset type names that should not be "
        "converted despite matching a pattern in datasetIncludePatterns.",
        dtype=str,
        default=[]
    )
    ccdKey = Field(
        "Key used for the Gen2 equivalent of 'detector' in data IDs.",
        dtype=str,
        default="ccd",
    )
    relatedOnly = Field(
        "If True, only convert datasets that are related to the ingested "
        "visits. Ignored unless a list of visits is passed to run().",
        dtype=bool,
        default=False,
    )

    @property
    def transfer(self):
        return self.raws.transfer

    @transfer.setter
    def transfer(self, value):
        self.raws.transfer = value

    @property
    def instrument(self):
        return self.raws.instrument

    @instrument.setter
    def instrument(self, value):
        self.raws.instrument = value

    def setDefaults(self):
        self.transfer = None

    # TODO: check that there are no collection overrides for curated
    # calibrations, since we don't have a good way to utilize them.

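# A hedged sketch (not from the original source) of how a conversion script
# might override some of the fields above before constructing the task.  The
# dataset type, collection, and reference catalog names and the "symlink"
# transfer mode are illustrative assumptions rather than values required by
# this module.
#
#     config = ConvertRepoConfig()
#     config.transfer = "symlink"
#     config.collections["raw"] = "raw/example"
#     config.storageClasses["deepCoadd_obj"] = "DataFrame"
#     config.refCats = ["gaia_dr2"]
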

class ConvertRepoTask(Task):
    """A task that converts one or more related Gen2 data repositories to a
    single Gen3 data repository (with multiple collections).

    Parameters
    ----------
    config : `ConvertRepoConfig`
        Configuration for this task.
    butler3 : `lsst.daf.butler.Butler`
        Gen3 Butler instance that represents the data repository datasets will
        be ingested into. The collection and/or run associated with this
        Butler will be ignored in favor of collections/runs passed via config
        or to `run`.
    kwds
        Other keyword arguments are forwarded to the `Task` constructor.

    Notes
    -----
    Most of the work of converting repositories is delegated to instances of
    the `RepoConverter` hierarchy. The `ConvertRepoTask` instance itself
    holds only state that is relevant for all Gen2 repositories being
    ingested, while each `RepoConverter` instance holds only state relevant
    for the conversion of a single Gen2 repository. Both the task and the
    `RepoConverter` instances are single use; `ConvertRepoTask.run` and most
    `RepoConverter` methods may only be called once on a particular instance.
    """

    ConfigClass = ConvertRepoConfig

    _DefaultName = "convertRepo"

    def __init__(self, config=None, *, butler3: Butler3, **kwds):
        super().__init__(config, **kwds)
        self.butler3 = butler3
        self.registry = self.butler3.registry
        self.universe = self.registry.dimensions
        if self.isDatasetTypeIncluded("raw"):
            self.makeSubtask("raws", butler=butler3)
            self.instrument = self.raws.instrument
        else:
            self.raws = None
            self.instrument = doImport(self.config.instrument)()
        self._configuredSkyMapsBySha1 = {}
        self._configuredSkyMapsByName = {}
        for name, config in self.config.skyMaps.items():
            instance = config.skyMap.apply()
            struct = ConfiguredSkyMap(name=name, sha1=instance.getSha1(), instance=instance)
            self._configuredSkyMapsBySha1[struct.sha1] = struct
            self._configuredSkyMapsByName[struct.name] = struct
        self._usedSkyPix = set()

    def isDatasetTypeIncluded(self, datasetTypeName: str):
        """Return `True` if configuration indicates that the given dataset
        type should be converted.

        This method is intended to be called primarily by the
        `RepoConverter` instances used internally by the task.

        Parameters
        ----------
        datasetTypeName : `str`
            Name of the dataset type.

        Returns
        -------
        included : `bool`
            Whether the dataset should be included in the conversion.
        """
        return (
            any(fnmatch.fnmatchcase(datasetTypeName, pattern)
                for pattern in self.config.datasetIncludePatterns)
            and not any(fnmatch.fnmatchcase(datasetTypeName, pattern)
                        for pattern in self.config.datasetIgnorePatterns)
        )

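    # A hedged illustration (not part of the original class) of how the
    # include/ignore patterns combine in isDatasetTypeIncluded; the pattern
    # values below are assumed for the example only:
    #
    #     config.datasetIncludePatterns = ["deepCoadd*", "calexp"]
    #     config.datasetIgnorePatterns = ["deepCoadd_skyMap"]
    #     # after constructing the task with this config:
    #     task.isDatasetTypeIncluded("deepCoadd_calexp")  # True
    #     task.isDatasetTypeIncluded("deepCoadd_skyMap")  # False (ignored)
    #     task.isDatasetTypeIncluded("src")                # False (not included)
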

    def useSkyMap(self, skyMap: BaseSkyMap) -> str:
        """Indicate that a repository uses the given SkyMap.

        This method is intended to be called primarily by the
        `RepoConverter` instances used internally by the task.

        Parameters
        ----------
        skyMap : `lsst.skymap.BaseSkyMap`
            SkyMap instance being used, typically retrieved from a Gen2
            data repository.

        Returns
        -------
        name : `str`
            The name of the skymap in Gen3 data IDs.
        """
        sha1 = skyMap.getSha1()
        try:
            struct = self._configuredSkyMapsBySha1[sha1]
        except KeyError as err:
            raise LookupError(f"SkyMap with sha1={sha1} not included in configuration.") from err
        struct.used = True
        return struct.name

    def registerUsedSkyMaps(self, subset: Optional[ConversionSubset]):
        """Register all skymaps that have been marked as used.

        This method is intended to be called primarily by the
        `RepoConverter` instances used internally by the task.

        Parameters
        ----------
        subset : `ConversionSubset`, optional
            Object that will be used to filter converted datasets by data ID.
            If given, it will be updated with the tracts of this skymap that
            overlap the visits in the subset.
        """
        for struct in self._configuredSkyMapsBySha1.values():
            if struct.used:
                struct.instance.register(struct.name, self.registry)
                if subset is not None and self.config.relatedOnly:
                    subset.addSkyMap(self.registry, struct.name)

    def useSkyPix(self, dimension: SkyPixDimension):
        """Indicate that a repository uses the given SkyPix dimension.

        This method is intended to be called primarily by the
        `RepoConverter` instances used internally by the task.

        Parameters
        ----------
        dimension : `lsst.daf.butler.SkyPixDimension`
            Dimension representing a pixelization of the sky.
        """
        self._usedSkyPix.add(dimension)

    def registerUsedSkyPix(self, subset: Optional[ConversionSubset]):
        """Register all skypix dimensions that have been marked as used.

        This method is intended to be called primarily by the
        `RepoConverter` instances used internally by the task.

        Parameters
        ----------
        subset : `ConversionSubset`, optional
            Object that will be used to filter converted datasets by data ID.
            If given, it will be updated with the pixelization IDs that
            overlap the visits in the subset.
        """
        if subset is not None and self.config.relatedOnly:
            for dimension in self._usedSkyPix:
                subset.addSkyPix(self.registry, dimension)


    def run(self, root: str, collections: List[str], *,
            calibs: Optional[Dict[str, List[str]]] = None,
            reruns: Optional[Dict[str, List[str]]] = None,
            visits: Optional[Iterable[int]] = None):
        """Convert a group of related data repositories.

        Parameters
        ----------
        root : `str`
            Complete path to the root Gen2 data repository. This should be
            a data repository that includes a Gen2 registry and any raw files
            and/or reference catalogs.
        collections : `list` of `str`
            Gen3 collections that datasets from the root repository should be
            associated with. This should include any rerun collection that
            these datasets should also be considered to be part of; because of
            structural differences between Gen2 parent/child relationships and
            Gen3 collections, these cannot be reliably inferred.
        calibs : `dict`, optional
            Dictionary mapping calibration repository path to the collections
            that the repository's datasets should be associated with. The path
            may be relative to ``root`` or absolute. Collections should
            include child repository collections as appropriate (see
            documentation for ``collections``).
        reruns : `dict`, optional
            Dictionary mapping rerun repository path to the collections that
            the repository's datasets should be associated with. The path may
            be relative to ``root`` or absolute. Collections should include
            child repository collections as appropriate (see documentation for
            ``collections``).
        visits : iterable of `int`, optional
            The integer IDs of visits to convert. If not provided, all visits
            in the Gen2 root repository will be converted.
        """

        if calibs is None:
            calibs = {}
        if reruns is None:
            reruns = {}
        if visits is not None:
            subset = ConversionSubset(instrument=self.instrument.getName(), visits=frozenset(visits))
        else:
            if self.config.relatedOnly:
                self.log.warn("config.relatedOnly is True but all visits are being ingested; "
                              "no filtering will be done.")
            subset = None

        # We can't wrap database writes sanely in transactions (yet) because
        # we keep initializing new Butler instances just so we can write into
        # new runs/collections, and transactions are managed at the Butler
        # level. DM-21246 should let us fix this, assuming we actually want
        # to keep the transaction open that long.
        if self.config.doRegisterInstrument:
            self.instrument.register(self.registry)

        # Make and prep converters for all Gen2 repos. This should not modify
        # the Registry database or filesystem at all, though it may query it.
        # The prep() calls here will be some of the slowest ones, because
        # that's when we walk the filesystem.
        converters = []
        rootConverter = RootRepoConverter(task=self, root=root, collections=collections, subset=subset)
        rootConverter.prep()
        converters.append(rootConverter)

        for root, collections in calibs.items():
            if not os.path.isabs(root):
                root = os.path.join(rootConverter.root, root)
            converter = CalibRepoConverter(task=self, root=root, collections=collections,
                                           mapper=rootConverter.mapper,
                                           subset=rootConverter.subset)
            converter.prep()
            converters.append(converter)

        for root, collections in reruns.items():
            if not os.path.isabs(root):
                root = os.path.join(rootConverter.root, root)
            converter = StandardRepoConverter(task=self, root=root, collections=collections,
                                              subset=rootConverter.subset)
            converter.prep()
            converters.append(converter)

        # Actual database writes start here. We can't wrap these sanely in
        # transactions (yet) because we keep initializing new Butler instances
        # just so we can write into new runs/collections, and transactions
        # are managed at the Butler level (DM-21246 should let us fix this).

        # Insert dimensions needed by any converters. These are only the
        # dimensions that a converter expects to be uniquely derived from the
        # Gen2 repository it is responsible for - e.g. visits, exposures, and
        # calibration_labels.
        #
        # Note that we do not try to filter dimensions down to just those
        # related to the given visits, even if config.relatedOnly is True; we
        # need them in the Gen3 repo in order to be able to know which
        # datasets to convert, because Gen2 alone doesn't know enough about
        # the relationships between data IDs.
        for converter in converters:
            converter.insertDimensionData()

        # Insert dimensions that are potentially shared by all Gen2
        # repositories (and are hence managed directly by the Task, rather
        # than a converter instance).
        # This also finishes setting up the (shared) converter.subset object
        # that is used to filter data IDs for config.relatedOnly.
        self.registerUsedSkyMaps(rootConverter.subset)
        self.registerUsedSkyPix(rootConverter.subset)

        # Look for datasets, generally by scanning the filesystem. This
        # requires dimensions to have already been inserted so we can use
        # dimension information to identify related datasets.
        for converter in converters:
            converter.findDatasets()

        # Expand data IDs.
        for converter in converters:
            converter.expandDataIds()

        # Actually ingest datasets.
        for converter in converters:
            converter.ingest()
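
# A hedged end-to-end sketch (not part of the original module) of how this
# task might be driven from a script. The repository paths, instrument name,
# visit numbers, and collection names below are illustrative assumptions,
# not values defined in this file.
#
#     from lsst.daf.butler import Butler
#     from lsst.obs.base.gen2to3 import ConvertRepoConfig, ConvertRepoTask
#
#     butler3 = Butler("/path/to/gen3/repo", run="shared/example")
#     config = ConvertRepoConfig()
#     config.instrument = "lsst.obs.example.ExampleInstrument"  # hypothetical
#     task = ConvertRepoTask(config=config, butler3=butler3)
#     task.run(
#         root="/path/to/gen2/root",
#         collections=["shared/example"],
#         calibs={"CALIB": ["calib/example"]},
#         reruns={"rerun/my-rerun": ["shared/example", "rerun/my-rerun"]},
#         visits=[903334, 903336],
#     )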