# This file is part of obs_base.
#
# Developed for the LSST Data Management System.
# This product includes software developed by the LSST Project
# (https://www.lsst.org).
# See the COPYRIGHT file at the top-level directory of this distribution
# for details of code ownership.
#
# This program is free software: you can redistribute it and/or modify
# it under the terms of the GNU General Public License as published by
# the Free Software Foundation, either version 3 of the License, or
# (at your option) any later version.
#
# This program is distributed in the hope that it will be useful,
# but WITHOUT ANY WARRANTY; without even the implied warranty of
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
# GNU General Public License for more details.
#
# You should have received a copy of the GNU General Public License
# along with this program.  If not, see <http://www.gnu.org/licenses/>.
from __future__ import annotations

__all__ = ["RepoConverter"]

from dataclasses import dataclass
from collections import defaultdict
from abc import ABC, abstractmethod
import fnmatch
import re
from typing import (
    Dict,
    Iterator,
    List,
    MutableMapping,
    Optional,
    Set,
    Tuple,
    Union,
    TYPE_CHECKING,
)

from lsst.daf.butler import DataCoordinate, FileDataset, DatasetType
from lsst.sphgeom import RangeSet, Region
from .repoWalker import RepoWalker

if TYPE_CHECKING:
    from ..mapping import Mapping as CameraMapperMapping  # disambiguate from collections.abc.Mapping
    from .convertRepo import ConvertRepoTask
    from lsst.daf.butler import StorageClass, Registry, SkyPixDimension


@dataclass
class ConversionSubset:
    """A helper class for `ConvertRepoTask` and `RepoConverter` that maintains
    lists of related data ID values that should be included in the conversion.

    Parameters
    ----------
    instrument : `str`
        Instrument name used in Gen3 data IDs.
    visits : `set` of `int`
        Visit IDs that define the filter.
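
    Examples
    --------
    A minimal usage sketch; ``registry`` is assumed to be an existing
    `~lsst.daf.butler.Registry`, and the instrument, skymap, and visit values
    below are purely illustrative::

        subset = ConversionSubset(instrument="HSC", visits={903334, 903336})
        subset.addSkyMap(registry, "hsc_rings_v1")
        for dataId in candidateDataIds:
            if subset.isRelated(dataId):
                ...  # include this data ID in the conversion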

    """

    def __init__(self, instrument: str, visits: Set[int]):
        self.instrument = instrument
        self.visits = visits
        self.regions = None
        self.tracts = {}
        self.skypix = {}

    def addSkyMap(self, registry: Registry, name: str):
        """Populate the included tract IDs for the given skymap from those that
        overlap the visits the `ConversionSubset` was initialized with.

        Parameters
        ----------
        registry : `lsst.daf.butler.Registry`
            Registry that can be queried for visit/tract overlaps.
        name : `str`
            SkyMap name used in Gen3 data IDs.
        """
        tracts = set()
        self.tracts[name] = tracts
        for visit in self.visits:
            for dataId in registry.queryDimensions(["tract"], expand=False,
                                                   dataId={"skymap": name,
                                                           "instrument": self.instrument,
                                                           "visit": visit}):
                tracts.add(dataId["tract"])

    def addSkyPix(self, registry: Registry, dimension: SkyPixDimension):
        """Populate the included skypix IDs for the given dimension from those
        that overlap the visits the `ConversionSubset` was initialized with.

        Parameters
        ----------
        registry : `lsst.daf.butler.Registry`
            Registry that can be queried for visit regions.
        dimension : `lsst.daf.butler.SkyPixDimension`
            SkyPix dimension whose pixel ID ranges should be added to the
            subset.
        """
        if self.regions is None:
            self.regions = []
            for visit in self.visits:
                dataId = registry.expandDataId(instrument=self.instrument, visit=visit)
                self.regions.append(dataId.region)
        ranges = RangeSet()
        for region in self.regions:
            ranges = ranges.union(dimension.pixelization.envelope(region))
        self.skypix[dimension] = ranges

    def isRelated(self, dataId: DataCoordinate) -> bool:
        """Test whether the given data ID is related to this subset and hence
        should be included in a repository conversion.

        Parameters
        ----------
        dataId : `lsst.daf.butler.DataCoordinate`
            Data ID to test.

        Returns
        -------
        related : `bool`
            `True` if this data ID should be included in a repository
            conversion.

        Notes
        -----
        More formally, this tests that the given data ID is not unrelated;
        if a data ID does not involve tracts, visits, or skypix dimensions,
        we always include it.
        """
        if self.visits is None:
            # We're not filtering at all.
            return True
        if "visit" in dataId.graph and dataId["visit"] not in self.visits:
            return False
        if "tract" in dataId.graph and dataId["tract"] not in self.tracts[dataId["skymap"]]:
            return False
        for dimension, ranges in self.skypix.items():
            if dimension in dataId.graph and not ranges.intersects(dataId[dimension]):
                return False
        return True

    # Class attributes that will be shadowed by public instance attributes;
    # defined here only for documentation purposes.

    instrument: str
    """The name of the instrument, as used in Gen3 data IDs (`str`).
    """

    visits: Set[int]
    """The set of visit IDs that should be included in the conversion (`set`
    of `int`).
    """

    regions: Optional[List[Region]]
    """Regions for all visits (`list` of `lsst.sphgeom.Region`).

    Set to `None` before it has been initialized.  Any code that attempts to
    use it when it is `None` has a logic bug.
    """

    tracts: Dict[str, Set[int]]
    """Tracts that should be included in the conversion, grouped by skymap
    name (`dict` mapping `str` to `set` of `int`).
    """

    skypix: Dict[SkyPixDimension, RangeSet]
    """SkyPix ranges that should be included in the conversion, grouped by
    dimension (`dict` mapping `SkyPixDimension` to `lsst.sphgeom.RangeSet`).
    """


class RepoConverter(ABC):
    """An abstract base class for objects that help `ConvertRepoTask` convert
    datasets from a single Gen2 repository.

    Parameters
    ----------
    task : `ConvertRepoTask`
        Task instance that is using this helper object.
    root : `str`
        Root of the Gen2 repo being converted.
    collections : `list` of `str`
        Gen3 collections with which all converted datasets should be
        associated.
    subset : `ConversionSubset`, optional
        Helper object that implements a filter that restricts the data IDs
        that are converted.

    Notes
    -----
    `RepoConverter` defines the only public API users of its subclasses should
    use (`prep`, `insertDimensionData`, and `ingest`).  These delegate to
    several abstract methods that subclasses must implement.  In some cases,
    subclasses may reimplement the public methods as well, but are expected to
    delegate to ``super()`` either at the beginning or end of their own
    implementation.
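
    Examples
    --------
    The public methods are intended to be called in order.  A rough sketch
    for a concrete subclass (``MyRepoConverter`` and the paths below are
    hypothetical; ``task`` is an existing `ConvertRepoTask`)::

        converter = MyRepoConverter(task=task, root="/path/to/gen2/repo",
                                    collections=["shared/example"])
        converter.prep()
        converter.findDatasets()
        converter.insertDimensionData()
        converter.expandDataIds()
        converter.ingest()

    In practice these calls are made by `ConvertRepoTask` itself.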

    """

    def __init__(self, *, task: ConvertRepoTask, root: str, collections: List[str],
                 subset: Optional[ConversionSubset] = None):
        self.task = task
        self.root = root
        self.subset = subset
        self._collections = list(collections)
        self._repoWalker = None  # Created in prep
        self._fileDatasets: MutableMapping[DatasetType, List[FileDataset]] = defaultdict(list)

    @abstractmethod
    def isDatasetTypeSpecial(self, datasetTypeName: str) -> bool:
        """Test whether the given dataset is handled specially by this
        converter and hence should be ignored by generic base-class logic that
        searches for dataset types to convert.

        Parameters
        ----------
        datasetTypeName : `str`
            Name of the dataset type to test.

        Returns
        -------
        special : `bool`
            `True` if the dataset type is special.
        """
        raise NotImplementedError()

    @abstractmethod
    def iterMappings(self) -> Iterator[Tuple[str, CameraMapperMapping]]:
        """Iterate over all `CameraMapper` `Mapping` objects that should be
        considered for conversion by this repository.

        This should include any datasets that may appear in the repository,
        including those that are special (see `isDatasetTypeSpecial`) and
        those that are being ignored (see
        `ConvertRepoTask.isDatasetTypeIncluded`); this allows the converter
        to identify and hence skip these datasets quietly instead of warning
        about them as unrecognized.

        Yields
        ------
        datasetTypeName : `str`
            Name of the dataset type.
        mapping : `lsst.obs.base.mapping.Mapping`
            Mapping object used by the Gen2 `CameraMapper` to describe the
            dataset type.
        """
        raise NotImplementedError()

    @abstractmethod
    def makeRepoWalkerTarget(self, datasetTypeName: str, template: str, keys: Dict[str, type],
                             storageClass: StorageClass) -> RepoWalker.Target:
        """Make a struct that identifies a dataset type to be extracted by
        walking the repo directory structure.

        Parameters
        ----------
        datasetTypeName : `str`
            Name of the dataset type (the same in both Gen2 and Gen3).
        template : `str`
            The full Gen2 filename template.
        keys : `dict` [`str`, `type`]
            A dictionary mapping Gen2 data ID key to the type of its value.
        storageClass : `lsst.daf.butler.StorageClass`
            Gen3 storage class for this dataset type.

        Returns
        -------
        target : `RepoWalker.Target`
            A struct containing information about the target dataset (much of
            it simply forwarded from the arguments).
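
        Examples
        --------
        For illustration only, an implementation might be called with a Gen2
        filename template and key mapping along these lines (the dataset type
        and template here are hypothetical)::

            target = self.makeRepoWalkerTarget(
                datasetTypeName="calexp",
                template="calexp/v%(visit)d/calexp-%(ccd)02d.fits",
                keys={"visit": int, "ccd": int},
                storageClass=self.task.butler3.storageClasses.getStorageClass("ExposureF"),
            )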

        """
        raise NotImplementedError()

    def getSpecialDirectories(self) -> List[str]:
        """Return a list of directory paths that should not be searched for
        files.

        These may be directories that simply do not contain datasets (or
        contain datasets in another repository), or directories whose datasets
        are handled specially by a subclass.

        Returns
        -------
        directories : `list` [`str`]
            The full paths of directories to skip, relative to the repository
            root.
        """
        return []

    def prep(self):
        """Perform preparatory work associated with the dataset types to be
        converted from this repository (but not the datasets themselves).

        Notes
        -----
        This should be a relatively fast operation that should not depend on
        the size of the repository.

        Subclasses may override this method, but must delegate to the base
        class implementation at some point in their own logic.
        More often, subclasses will specialize the behavior of `prep` by
        overriding other methods to which the base class implementation
        delegates.  These include:

        - `iterMappings`
        - `isDatasetTypeSpecial`
        - `getSpecialDirectories`
        - `makeRepoWalkerTarget`

        This should not perform any write operations to the Gen3 repository.
        It is guaranteed to be called before `insertDimensionData`.
        """
        self.task.log.info(f"Preparing other dataset types from root {self.root}.")
        walkerInputs: List[Union[RepoWalker.Target, RepoWalker.Skip]] = []
        for datasetTypeName, mapping in self.iterMappings():
            try:
                template = mapping.template
            except RuntimeError:
                # No template for this dataset in this mapper, so there's no
                # way there should be instances of this dataset in this repo.
                continue
            skip = False
            message = None
            storageClass = None
            if (not self.task.isDatasetTypeIncluded(datasetTypeName)
                    or self.isDatasetTypeSpecial(datasetTypeName)):
                # User indicated not to include this data, but we still want
                # to recognize files of that type to avoid warning about them.
                skip = True
            else:
                storageClass = self._guessStorageClass(datasetTypeName, mapping)
                if storageClass is None:
                    # This may be a problem, but only if we actually encounter any
                    # files corresponding to this dataset.  Of course, we need
                    # to be able to parse those files in order to recognize that
                    # situation.
                    message = f"no storage class found for {datasetTypeName}"
                    skip = True
            if skip:
                walkerInput = RepoWalker.Skip(
                    template=template,
                    keys=mapping.keys(),
                    message=message,
                )
            else:
                assert message is None
                walkerInput = self.makeRepoWalkerTarget(
                    datasetTypeName=datasetTypeName,
                    template=template,
                    keys=mapping.keys(),
                    storageClass=storageClass,
                )
            walkerInputs.append(walkerInput)
        for dirPath in self.getSpecialDirectories():
            walkerInputs.append(
                RepoWalker.Skip(
                    template=dirPath,  # not really a template, but that's fine; it's relative to root.
                    keys={},
                    message=None,
                    isForFiles=True,
                )
            )
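        # The block below translates each configured ignore pattern (a
        # shell-style glob such as "*.log" -- an illustrative value, not a
        # default) into a regular expression with fnmatch.translate and joins
        # them into a single alternation, so a file is skipped if it matches
        # any of the patterns.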

        fileIgnoreRegExTerms = []
        for pattern in self.task.config.fileIgnorePatterns:
            fileIgnoreRegExTerms.append(fnmatch.translate(pattern))
        if fileIgnoreRegExTerms:
            fileIgnoreRegEx = re.compile("|".join(fileIgnoreRegExTerms))
        else:
            fileIgnoreRegEx = None
        self._repoWalker = RepoWalker(walkerInputs, fileIgnoreRegEx=fileIgnoreRegEx)

    def iterDatasets(self) -> Iterator[FileDataset]:
        """Iterate over datasets in the repository that should be ingested into
        the Gen3 repository.

        The base class implementation yields nothing; the datasets handled by
        the `RepoConverter` base class itself are read directly in
        `findDatasets`.

        Subclasses should override this method if they support additional
        datasets that are handled some other way.

        Yields
        ------
        dataset : `FileDataset`
            Structures representing datasets to be ingested.  Paths should be
            absolute.
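
        Examples
        --------
        A subclass that ingests a single hand-curated file might override
        this along these lines (illustrative only; ``makeCuratedRef`` is a
        hypothetical helper that builds a fully populated
        `~lsst.daf.butler.DatasetRef` for the file)::

            def iterDatasets(self):
                yield FileDataset(path="/absolute/path/to/curated.fits",
                                  refs=[makeCuratedRef()])
                yield from super().iterDatasets()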

        """
        yield from ()

    def findDatasets(self):
        """Search the repository for datasets to be ingested, populating the
        internal mapping from dataset type to `FileDataset` that is later used
        by `expandDataIds` and `ingest`.

        This method must be called after `prep`.
        """
        assert self._repoWalker, "prep() must be called before findDatasets."
        self.task.log.info("Adding special datasets in repo %s.", self.root)
        for dataset in self.iterDatasets():
            assert len(dataset.refs) == 1
            self._fileDatasets[dataset.refs[0].datasetType].append(dataset)
        self.task.log.info("Finding datasets from files in repo %s.", self.root)
        self._fileDatasets.update(
            self._repoWalker.walk(
                self.root,
                log=self.task.log,
                predicate=(self.subset.isRelated if self.subset is not None else None)
            )
        )

    def insertDimensionData(self):
        """Insert any dimension records uniquely derived from this repository
        into the registry.

        Subclasses may override this method, but may not need to; the default
        implementation does nothing.

        SkyMap and SkyPix dimensions should instead be handled by calling
        `ConvertRepoTask.useSkyMap` or `ConvertRepoTask.useSkyPix`, because
        these dimensions are in general shared by multiple Gen2 repositories.

        This method is guaranteed to be called between `prep` and
        `expandDataIds`.
        """
        pass

    def handleDataIdExpansionFailure(self, dataset: FileDataset, err: LookupError) -> bool:
        """Handle a dataset whose data ID could not be expanded.

        The base class implementation logs a warning and returns `False`,
        which drops the dataset from the conversion.  Subclasses may override
        this method and return `True` for failures that should not prevent
        the dataset from being ingested anyway.
        """
        self.task.log.warn("Skipping ingestion for '%s': %s", dataset.path, err)
        return False

    def expandDataIds(self):
        """Expand the data IDs for all datasets to be inserted.

        Subclasses may override this method, but must delegate to the base
        class implementation if they do.  If they wish to handle expected
        failures in data ID expansion, they should override
        `handleDataIdExpansionFailure` instead.

        This involves queries to the registry, but not writes.  It is
        guaranteed to be called between `insertDimensionData` and `ingest`.
        """
        for datasetType, datasetsForType in self._fileDatasets.items():
            self.task.log.info("Expanding data IDs for %s %s datasets.", len(datasetsForType),
                               datasetType.name)
            expanded = []
            for dataset in datasetsForType:
                for i, ref in enumerate(dataset.refs):
                    try:
                        dataId = self.task.registry.expandDataId(ref.dataId)
                        dataset.refs[i] = ref.expanded(dataId)
                        expanded.append(dataset)
                    except LookupError as err:
                        if self.handleDataIdExpansionFailure(dataset, err):
                            expanded.append(dataset)
            datasetsForType[:] = expanded

    def ingest(self):
        """Insert converted datasets into the Gen3 repository.

        Subclasses may override this method, but must delegate to the base
        class implementation at some point in their own logic.

        This method is guaranteed to be called after `expandDataIds`.
        """
        for datasetType, datasetsForType in self._fileDatasets.items():
            self.task.registry.registerDatasetType(datasetType)
            self.task.log.info("Ingesting %s %s datasets.", len(datasetsForType), datasetType.name)
            try:
                collections = self.getCollections(datasetType.name)
            except LookupError as err:
                self.task.log.warn(str(err))
                continue
            try:
                self.task.registry.registerRun(collections[0])
                self.task.butler3.ingest(*datasetsForType, transfer=self.task.config.transfer,
                                         run=collections[0])
            except LookupError as err:
                raise LookupError(f"Error expanding data ID for dataset type {datasetType.name}.") from err
            for collection in collections[1:]:
                self.task.registry.associate(collection,
                                             [ref for dataset in datasetsForType for ref in dataset.refs])

    def getCollections(self, datasetTypeName: str) -> List[str]:
        """Return the set of collections a particular dataset type should be
        associated with.

        Parameters
        ----------
        datasetTypeName : `str`
            Name of the dataset type.

        Returns
        -------
        collections : `list` of `str`
            Collections the dataset should be associated with.  The first
            item in the list is the run the dataset should be added to
            initially.
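
        Examples
        --------
        With an illustrative configuration in which
        ``self.task.config.collections == {"raw": "raw/example"}`` and a
        converter constructed with ``collections=["shared/example"]``::

            self.getCollections("raw")     # ["raw/example", "shared/example"]
            self.getCollections("calexp")  # ["shared/example"]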

        """
        if datasetTypeName in self.task.config.collections:
            return [self.task.config.collections[datasetTypeName]] + self._collections
        elif self._collections:
            return self._collections
        else:
            raise LookupError(f"No collection configured for dataset type {datasetTypeName}.")

    def _guessStorageClass(self, datasetTypeName: str, mapping: CameraMapperMapping
                           ) -> Optional[StorageClass]:
        """Infer the Gen3 `StorageClass` for a dataset from a combination of
        configuration and Gen2 dataset type information.

        Parameters
        ----------
        datasetTypeName : `str`
            Name of the dataset type.
        mapping : `lsst.obs.base.mapping.Mapping`
            Mapping object used by the Gen2 `CameraMapper` to describe the
            dataset type.

        Returns
        -------
        storageClass : `lsst.daf.butler.StorageClass` or `None`
            The storage class to use, or `None` if none could be determined
            from the configuration or the Gen2 mapping.
        """
        storageClassName = self.task.config.storageClasses.get(datasetTypeName)
        if storageClassName is None and mapping.python is not None:
            storageClassName = self.task.config.storageClasses.get(mapping.python, None)
        if storageClassName is None and mapping.persistable is not None:
            storageClassName = self.task.config.storageClasses.get(mapping.persistable, None)
        if storageClassName is None and mapping.python is not None:
            unqualified = mapping.python.split(".")[-1]
            storageClassName = self.task.config.storageClasses.get(unqualified, None)
        if storageClassName is not None:
            storageClass = self.task.butler3.storageClasses.getStorageClass(storageClassName)
        else:
            try:
                storageClass = self.task.butler3.storageClasses.getStorageClass(mapping.persistable)
            except KeyError:
                storageClass = None
            if storageClass is None and mapping.python is not None:
                try:
                    storageClass = self.task.butler3.storageClasses.getStorageClass(unqualified)
                except KeyError:
                    pass
            if storageClass is None:
                self.task.log.debug("No StorageClass found for %s; skipping.", datasetTypeName)
            else:
                self.task.log.debug("Using StorageClass %s for %s.", storageClass.name, datasetTypeName)
        return storageClass

    # Class attributes that will be shadowed by public instance attributes;
    # defined here only for documentation purposes.

    task: ConvertRepoTask
    """The parent task that constructed and uses this converter
    (`ConvertRepoTask`).
    """

    root: str
    """Root path to the Gen2 repository this converter manages (`str`).

    This is a complete path, not relative to some other repository root.
    """

    subset: Optional[ConversionSubset]
    """An object that represents a filter to be applied to the datasets that
    are converted (`ConversionSubset` or `None`).
    """