# This file is part of obs_base.
#
# Developed for the LSST Data Management System.
# This product includes software developed by the LSST Project
# (https://www.lsst.org).
# See the COPYRIGHT file at the top-level directory of this distribution
# for details of code ownership.
#
# This program is free software: you can redistribute it and/or modify
# it under the terms of the GNU General Public License as published by
# the Free Software Foundation, either version 3 of the License, or
# (at your option) any later version.
#
# This program is distributed in the hope that it will be useful,
# but WITHOUT ANY WARRANTY; without even the implied warranty of
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
# GNU General Public License for more details.
#
# You should have received a copy of the GNU General Public License
# along with this program. If not, see <http://www.gnu.org/licenses/>.
from __future__ import annotations

__all__ = ["RepoConverter"]

from dataclasses import dataclass
from collections import defaultdict
from abc import ABC, abstractmethod
import fnmatch
import itertools
import re
from typing import (
    Dict,
    Iterator,
    List,
    MutableMapping,
    Optional,
    Set,
    Tuple,
    Union,
    TYPE_CHECKING,
)

from lsst.daf.butler import DataCoordinate, FileDataset, DatasetType
from lsst.sphgeom import RangeSet, Region
from .repoWalker import RepoWalker

if TYPE_CHECKING:
    from ..mapping import Mapping as CameraMapperMapping  # disambiguate from collections.abc.Mapping
    from .convertRepo import ConvertRepoTask
    from lsst.daf.butler import StorageClass, Registry, SkyPixDimension


@dataclass
class ConversionSubset:
    """A helper class for `ConvertRepoTask` and `RepoConverter` that maintains
    lists of related data ID values that should be included in the conversion.

    Parameters
    ----------
    instrument : `str`
        Instrument name used in Gen3 data IDs.
    visits : `set` of `int` or `None`
        Visit IDs that define the filter; if `None`, no filtering is
        performed and all data IDs are considered related.
    """

    def __init__(self, instrument: str, visits: Optional[Set[int]]):
        self.instrument = instrument
        self.visits = visits
        self.regions = None
        self.tracts = {}
        self.skypix = {}

    def addSkyMap(self, registry: Registry, name: str):
        """Populate the included tract IDs for the given skymap from those
        that overlap the visits the `ConversionSubset` was initialized with.

        Parameters
        ----------
        registry : `lsst.daf.butler.Registry`
            Registry that can be queried for visit/tract overlaps.
        name : `str`
            SkyMap name used in Gen3 data IDs.
        """
        tracts = set()
        self.tracts[name] = tracts
        for visit in self.visits:
            for dataId in registry.queryDimensions(["tract"], expand=False,
                                                   dataId={"skymap": name,
                                                           "instrument": self.instrument,
                                                           "visit": visit}):
                tracts.add(dataId["tract"])

    def addSkyPix(self, registry: Registry, dimension: SkyPixDimension):
        """Populate the included skypix IDs for the given dimension from those
        that overlap the visits the `ConversionSubset` was initialized with.

        Parameters
        ----------
        registry : `lsst.daf.butler.Registry`
            Registry that can be queried for visit regions.
        dimension : `lsst.daf.butler.SkyPixDimension`
            SkyPix dimension whose pixelization is used to compute the IDs
            that overlap the visit regions.
        """
        if self.regions is None:
            self.regions = []
            for visit in self.visits:
                dataId = registry.expandDataId(instrument=self.instrument, visit=visit)
                self.regions.append(dataId.region)
        ranges = RangeSet()
        for region in self.regions:
            ranges = ranges.union(dimension.pixelization.envelope(region))
        self.skypix[dimension] = ranges
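
    # Illustrative sketch (not part of the original module): once `addSkyPix`
    # has run, membership tests reduce to `lsst.sphgeom.RangeSet.intersects`,
    # which is exactly what `isRelated` does below. Here `subset`,
    # `dimension`, and the pixel index are hypothetical:
    #
    #     ranges = subset.skypix[dimension]
    #     ranges.intersects(12345)  # `True` if pixel 12345 overlaps any visit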

    def isRelated(self, dataId: DataCoordinate) -> bool:
        """Test whether the given data ID is related to this subset and hence
        should be included in a repository conversion.

        Parameters
        ----------
        dataId : `lsst.daf.butler.DataCoordinate`
            Data ID to test.

        Returns
        -------
        related : `bool`
            `True` if this data ID should be included in a repository
            conversion.

        Notes
        -----
        More formally, this tests that the given data ID is not unrelated;
        if a data ID does not involve tracts, visits, or skypix dimensions,
        we always include it.
        """
        if self.visits is None:
            # We're not filtering at all.
            return True
        if "visit" in dataId.graph and dataId["visit"] not in self.visits:
            return False
        if "tract" in dataId.graph and dataId["tract"] not in self.tracts[dataId["skymap"]]:
            return False
        for dimension, ranges in self.skypix.items():
            if dimension in dataId.graph and not ranges.intersects(dataId[dimension]):
                return False
        return True
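
    # Usage sketch (a minimal illustration; the instrument, visit, and skymap
    # names are hypothetical, and `registry` and `candidates` are assumed to
    # exist in the caller's context):
    #
    #     subset = ConversionSubset(instrument="HSC", visits={903334, 903336})
    #     subset.addSkyMap(registry, "hsc_rings_v1")
    #     related = [dataId for dataId in candidates if subset.isRelated(dataId)]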

    # Class attributes that will be shadowed by public instance attributes;
    # defined here only for documentation purposes.

    instrument: str
    """The name of the instrument, as used in Gen3 data IDs (`str`).
    """

    visits: Optional[Set[int]]
    """The set of visit IDs that should be included in the conversion (`set`
    of `int`, or `None` if no filtering is performed).
    """

    regions: Optional[List[Region]]
    """Regions for all visits (`list` of `lsst.sphgeom.Region`).

    Set to `None` before it has been initialized. Any code that attempts to
    use it when it is `None` has a logic bug.
    """

    tracts: Dict[str, Set[int]]
    """Tracts that should be included in the conversion, grouped by skymap
    name (`dict` mapping `str` to `set` of `int`).
    """

    skypix: Dict[SkyPixDimension, RangeSet]
    """SkyPix ranges that should be included in the conversion, grouped by
    dimension (`dict` mapping `SkyPixDimension` to `lsst.sphgeom.RangeSet`).
    """


class RepoConverter(ABC):
    """An abstract base class for objects that help `ConvertRepoTask` convert
    datasets from a single Gen2 repository.

    Parameters
    ----------
    task : `ConvertRepoTask`
        Task instance that is using this helper object.
    root : `str`
        Root of the Gen2 repo being converted.
    collections : `list` of `str`
        Gen3 collections with which all converted datasets should be
        associated.
    subset : `ConversionSubset`, optional
        Helper object that implements a filter that restricts the data IDs
        that are converted.

    Notes
    -----
    `RepoConverter` defines the only public API users of its subclasses should
    use (`prep`, `findDatasets`, `insertDimensionData`, `expandDataIds`, and
    `ingest`). These delegate to several abstract methods that subclasses
    must implement. In some cases, subclasses may reimplement the public
    methods as well, but are expected to delegate to ``super()`` either at the
    beginning or end of their own implementation.
    """

    def __init__(self, *, task: ConvertRepoTask, root: str, collections: List[str],
                 subset: Optional[ConversionSubset] = None):
        self.task = task
        self.root = root
        self.subset = subset
        self._collections = list(collections)
        self._repoWalker = None  # Created in prep
        self._fileDatasets: MutableMapping[DatasetType, List[FileDataset]] = defaultdict(list)
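
    # Expected call sequence, as a sketch (`MyConverter` is a hypothetical
    # concrete subclass; the arguments are placeholders):
    #
    #     converter = MyConverter(task=task, root="/path/to/gen2", collections=["raw"])
    #     converter.prep()                 # fast; no writes to the Gen3 repo
    #     converter.findDatasets()         # walks the repo for files
    #     converter.insertDimensionData()  # registry writes (dimension records)
    #     converter.expandDataIds()        # registry queries, no writes
    #     converter.ingest()               # registers and ingests the datasets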

    @abstractmethod
    def isDatasetTypeSpecial(self, datasetTypeName: str) -> bool:
        """Test whether the given dataset type is handled specially by this
        converter and hence should be ignored by generic base-class logic
        that searches for dataset types to convert.

        Parameters
        ----------
        datasetTypeName : `str`
            Name of the dataset type to test.

        Returns
        -------
        special : `bool`
            `True` if the dataset type is special.
        """
        raise NotImplementedError()

    @abstractmethod
    def iterMappings(self) -> Iterator[Tuple[str, CameraMapperMapping]]:
        """Iterate over all `CameraMapper` `Mapping` objects that should be
        considered for conversion by this converter.

        This should include any datasets that may appear in the repository,
        including those that are special (see `isDatasetTypeSpecial`) and
        those that are being ignored (see
        `ConvertRepoTask.isDatasetTypeIncluded`); this allows the converter
        to identify and hence skip these datasets quietly instead of warning
        about them as unrecognized.

        Yields
        ------
        datasetTypeName : `str`
            Name of the dataset type.
        mapping : `lsst.obs.base.mapping.Mapping`
            Mapping object used by the Gen2 `CameraMapper` to describe the
            dataset type.
        """
        raise NotImplementedError()

    @abstractmethod
    def makeRepoWalkerTarget(self, datasetTypeName: str, template: str, keys: Dict[str, type],
                             storageClass: StorageClass) -> RepoWalker.Target:
        """Make a struct that identifies a dataset type to be extracted by
        walking the repo directory structure.

        Parameters
        ----------
        datasetTypeName : `str`
            Name of the dataset type (the same in both Gen2 and Gen3).
        template : `str`
            The full Gen2 filename template.
        keys : `dict` [`str`, `type`]
            A dictionary mapping Gen2 data ID key to the type of its value.
        storageClass : `lsst.daf.butler.StorageClass`
            Gen3 storage class for this dataset type.

        Returns
        -------
        target : `RepoWalker.Target`
            A struct containing information about the target dataset (much of
            it simply forwarded from the arguments).
        """
        raise NotImplementedError()

    def getSpecialDirectories(self) -> List[str]:
        """Return a list of directory paths that should not be searched for
        files.

        These may be directories that simply do not contain datasets (or
        contain datasets in another repository), or directories whose
        datasets are handled specially by a subclass.

        Returns
        -------
        directories : `list` [`str`]
            The paths of directories to skip, relative to the repository
            root.
        """
        return []

    def prep(self):
        """Perform preparatory work associated with the dataset types to be
        converted from this repository (but not the datasets themselves).

        Notes
        -----
        This should be a relatively fast operation that should not depend on
        the size of the repository.

        Subclasses may override this method, but must delegate to the base
        class implementation at some point in their own logic.
        More often, subclasses will specialize the behavior of `prep` by
        overriding other methods to which the base class implementation
        delegates. These include:

        - `iterMappings`
        - `isDatasetTypeSpecial`
        - `getSpecialDirectories`
        - `makeRepoWalkerTarget`

        This should not perform any write operations to the Gen3 repository.
        It is guaranteed to be called before `insertDimensionData`.
        """
        self.task.log.info("Preparing other dataset types from root %s.", self.root)
        walkerInputs: List[Union[RepoWalker.Target, RepoWalker.Skip]] = []
        for datasetTypeName, mapping in self.iterMappings():
            try:
                template = mapping.template
            except RuntimeError:
                # No template for this dataset in this mapper, so there's no
                # way there should be instances of this dataset in this repo.
                continue
            extensions = [""]
            skip = False
            message = None
            storageClass = None
            if (not self.task.isDatasetTypeIncluded(datasetTypeName)
                    or self.isDatasetTypeSpecial(datasetTypeName)):
                # User indicated not to include this data, but we still want
                # to recognize files of that type to avoid warning about them.
                skip = True
            else:
                storageClass = self._guessStorageClass(datasetTypeName, mapping)
                if storageClass is None:
                    # This may be a problem, but only if we actually encounter
                    # any files corresponding to this dataset. Of course, we
                    # need to be able to parse those files in order to
                    # recognize that situation.
                    message = f"no storage class found for {datasetTypeName}"
                    skip = True
            # Handle files that are compressed on disk, even when the Gen2
            # template ends in just ".fits".
            if template.endswith(".fits"):
                extensions.extend((".gz", ".fz"))
            for extension in extensions:
                if skip:
                    walkerInput = RepoWalker.Skip(
                        template=template + extension,
                        keys=mapping.keys(),
                        message=message,
                    )
                    self.task.log.debug("Skipping template in walker: %s", template)
                else:
                    assert message is None
                    walkerInput = self.makeRepoWalkerTarget(
                        datasetTypeName=datasetTypeName,
                        template=template + extension,
                        keys=mapping.keys(),
                        storageClass=storageClass,
                    )
                    self.task.log.debug("Adding template to walker: %s", template)
                walkerInputs.append(walkerInput)

        for dirPath in self.getSpecialDirectories():
            walkerInputs.append(
                RepoWalker.Skip(
                    template=dirPath,  # not really a template, but that's fine; it's relative to root.
                    keys={},
                    message=None,
                    isForFiles=True,
                )
            )
        fileIgnoreRegExTerms = []
        for pattern in self.task.config.fileIgnorePatterns:
            fileIgnoreRegExTerms.append(fnmatch.translate(pattern))
        if fileIgnoreRegExTerms:
            fileIgnoreRegEx = re.compile("|".join(fileIgnoreRegExTerms))
        else:
            fileIgnoreRegEx = None
        self._repoWalker = RepoWalker(walkerInputs, fileIgnoreRegEx=fileIgnoreRegEx)
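
    # How the ignore patterns combine, as a standalone sketch (the pattern
    # strings here are hypothetical examples, not configuration defaults):
    #
    #     import fnmatch, re
    #     terms = [fnmatch.translate(p) for p in ["*.log", "registry.sqlite3"]]
    #     ignoreRegEx = re.compile("|".join(terms))
    #     bool(ignoreRegEx.match("ingest.log"))    # True
    #     bool(ignoreRegEx.match("calexp.fits"))   # False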

    def iterDatasets(self) -> Iterator[FileDataset]:
        """Iterate over datasets in the repository that should be ingested
        into the Gen3 repository.

        The base class implementation yields nothing; the datasets handled by
        the `RepoConverter` base class itself are read directly in
        `findDatasets`.

        Subclasses should override this method if they support additional
        datasets that are handled some other way.

        Yields
        ------
        dataset : `FileDataset`
            Structures representing datasets to be ingested. Paths should be
            absolute.
        """
        yield from ()

    def findDatasets(self):
        """Walk the repository to find datasets that should be ingested,
        recording them internally, grouped by dataset type.

        `prep` must be called before this method.
        """
        assert self._repoWalker, "prep() must be called before findDatasets."
        self.task.log.info("Adding special datasets in repo %s.", self.root)
        for dataset in self.iterDatasets():
            assert len(dataset.refs) == 1
            self._fileDatasets[dataset.refs[0].datasetType].append(dataset)
        self.task.log.info("Finding datasets from files in repo %s.", self.root)
        self._fileDatasets.update(
            self._repoWalker.walk(
                self.root,
                log=self.task.log,
                predicate=(self.subset.isRelated if self.subset is not None else None)
            )
        )

    def insertDimensionData(self):
        """Insert any dimension records uniquely derived from this repository
        into the registry.

        Subclasses may override this method, but may not need to; the default
        implementation does nothing.

        SkyMap and SkyPix dimensions should instead be handled by calling
        `ConvertRepoTask.useSkyMap` or `ConvertRepoTask.useSkyPix`, because
        these dimensions are in general shared by multiple Gen2 repositories.

        This method is guaranteed to be called between `prep` and
        `expandDataIds`.
        """
        pass

    def expandDataIds(self):
        """Expand the data IDs for all datasets to be inserted.

        Subclasses may override this method, but must delegate to the base
        class implementation if they do.

        This involves queries to the registry, but not writes. It is
        guaranteed to be called between `insertDimensionData` and `ingest`.
        """
        for datasetType, datasetsForType in self._fileDatasets.items():
            self.task.log.info("Expanding data IDs for %s %s datasets.", len(datasetsForType),
                               datasetType.name)
            expanded = []
            for dataset in datasetsForType:
                for i, ref in enumerate(dataset.refs):
                    try:
                        dataId = self.task.registry.expandDataId(ref.dataId)
                        dataset.refs[i] = ref.expanded(dataId)
                    except LookupError as err:
                        self.task.log.warn("Skipping ingestion for '%s': %s", dataset.path, err)
                        # Remove skipped datasets from multi-extension
                        # FileDatasets; we will strip off the `None`s below.
                        dataset.refs[i] = None
                dataset.refs[:] = itertools.filterfalse(lambda x: x is None, dataset.refs)
                if dataset.refs:
                    expanded.append(dataset)

            datasetsForType[:] = expanded

    def ingest(self):
        """Insert converted datasets into the Gen3 repository.

        Subclasses may override this method, but must delegate to the base
        class implementation at some point in their own logic.

        This method is guaranteed to be called after `expandDataIds`.
        """
        for datasetType, datasetsForType in self._fileDatasets.items():
            self.task.registry.registerDatasetType(datasetType)
            self.task.log.info("Ingesting %s %s datasets.", len(datasetsForType), datasetType.name)
            try:
                collections = self.getCollections(datasetType.name)
            except LookupError as err:
                self.task.log.warn(str(err))
                continue
            try:
                self.task.registry.registerRun(collections[0])
                self.task.butler3.ingest(*datasetsForType, transfer=self.task.config.transfer,
                                         run=collections[0])
            except LookupError as err:
                raise LookupError(f"Error expanding data ID for dataset type {datasetType.name}.") from err
            for collection in collections[1:]:
                self.task.registry.associate(collection,
                                             [ref for dataset in datasetsForType for ref in dataset.refs])

    def getCollections(self, datasetTypeName: str) -> List[str]:
        """Return the collections a particular dataset type should be
        associated with.

        Parameters
        ----------
        datasetTypeName : `str`
            Name of the dataset type.

        Returns
        -------
        collections : `list` of `str`
            Collections the dataset should be associated with. The first
            item in the list is the run the dataset should be added to
            initially.
        """
        if datasetTypeName in self.task.config.collections:
            return [self.task.config.collections[datasetTypeName]] + self._collections
        elif self._collections:
            return self._collections
        else:
            raise LookupError(f"No collection configured for dataset type {datasetTypeName}.")
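
    # Resolution order, as a sketch (the config values and collection names
    # below are hypothetical):
    #
    #     # config.collections == {"raw": "HSC/raw"}; collections arg == ["HSC/defaults"]
    #     converter.getCollections("raw")     # -> ["HSC/raw", "HSC/defaults"]
    #     converter.getCollections("calexp")  # -> ["HSC/defaults"]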

    def _guessStorageClass(self, datasetTypeName: str, mapping: CameraMapperMapping
                           ) -> Optional[StorageClass]:
        """Infer the Gen3 `StorageClass` for a dataset from a combination of
        configuration and Gen2 dataset type information.

        Parameters
        ----------
        datasetTypeName : `str`
            Name of the dataset type.
        mapping : `lsst.obs.base.mapping.Mapping`
            Mapping object used by the Gen2 `CameraMapper` to describe the
            dataset type.

        Returns
        -------
        storageClass : `lsst.daf.butler.StorageClass` or `None`
            The inferred storage class, or `None` if none could be found.
        """
        storageClassName = self.task.config.storageClasses.get(datasetTypeName)
        if storageClassName is None and mapping.python is not None:
            storageClassName = self.task.config.storageClasses.get(mapping.python, None)
        if storageClassName is None and mapping.persistable is not None:
            storageClassName = self.task.config.storageClasses.get(mapping.persistable, None)
        if storageClassName is None and mapping.python is not None:
            unqualified = mapping.python.split(".")[-1]
            storageClassName = self.task.config.storageClasses.get(unqualified, None)
        if storageClassName is not None:
            storageClass = self.task.butler3.storageClasses.getStorageClass(storageClassName)
        else:
            try:
                storageClass = self.task.butler3.storageClasses.getStorageClass(mapping.persistable)
            except KeyError:
                storageClass = None
            if storageClass is None and mapping.python is not None:
                try:
                    storageClass = self.task.butler3.storageClasses.getStorageClass(unqualified)
                except KeyError:
                    pass
            if storageClass is None:
                self.task.log.debug("No StorageClass found for %s; skipping.", datasetTypeName)
            else:
                self.task.log.debug("Using StorageClass %s for %s.", storageClass.name, datasetTypeName)
        return storageClass
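
    # Lookup order used above, summarized as a descriptive note:
    #
    #   1. config.storageClasses[datasetTypeName]
    #   2. config.storageClasses[mapping.python]        (fully-qualified name)
    #   3. config.storageClasses[mapping.persistable]
    #   4. config.storageClasses[unqualified python class name]
    #   5. butler3.storageClasses.getStorageClass(mapping.persistable)
    #   6. butler3.storageClasses.getStorageClass(unqualified python class name)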

    # Class attributes that will be shadowed by public instance attributes;
    # defined here only for documentation purposes.

    task: ConvertRepoTask
    """The parent task that constructed and uses this converter
    (`ConvertRepoTask`).
    """

    root: str
    """Root path to the Gen2 repository this converter manages (`str`).

    This is a complete path, not relative to some other repository root.
    """

    subset: Optional[ConversionSubset]
    """An object that represents a filter to be applied to the datasets that
    are converted (`ConversionSubset` or `None`).
    """