# This file is part of obs_base.
#
# Developed for the LSST Data Management System.
# This product includes software developed by the LSST Project
# (https://www.lsst.org).
# See the COPYRIGHT file at the top-level directory of this distribution
# for details of code ownership.
#
# This program is free software: you can redistribute it and/or modify
# it under the terms of the GNU General Public License as published by
# the Free Software Foundation, either version 3 of the License, or
# (at your option) any later version.
#
# This program is distributed in the hope that it will be useful,
# but WITHOUT ANY WARRANTY; without even the implied warranty of
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
# GNU General Public License for more details.
#
# You should have received a copy of the GNU General Public License
# along with this program. If not, see <http://www.gnu.org/licenses/>.
from __future__ import annotations

__all__ = ["RepoConverter"]

from dataclasses import dataclass
from collections import defaultdict
from abc import ABC, abstractmethod
import fnmatch
import re
from typing import (
    Dict,
    Iterator,
    List,
    MutableMapping,
    Optional,
    Set,
    Tuple,
    Union,
    TYPE_CHECKING,
)

from lsst.utils import doImport
from lsst.daf.butler import DataCoordinate, FileDataset, DatasetType
from lsst.sphgeom import RangeSet, Region
from .repoWalker import RepoWalker

if TYPE_CHECKING:
    from ..mapping import Mapping as CameraMapperMapping  # disambiguate from collections.abc.Mapping
    from .convertRepo import ConvertRepoTask
    from .scanner import PathElementHandler
    from lsst.daf.butler import StorageClass, Registry, SkyPixDimension, FormatterParameter


@dataclass
class ConversionSubset:
    """A helper class for `ConvertRepoTask` and `RepoConverter` that maintains
    lists of related data ID values that should be included in the conversion.

    Parameters
    ----------
    instrument : `str`
        Instrument name used in Gen3 data IDs.
    visits : `set` of `int`
        Visit IDs that define the filter.
    """

    def __init__(self, instrument: str, visits: Set[int]):
        self.instrument = instrument
        self.visits = visits
        self.regions = None
        self.tracts = {}
        self.skypix = {}

    def addSkyMap(self, registry: Registry, name: str):
        """Populate the included tract IDs for the given skymap from those
        that overlap the visits the `ConversionSubset` was initialized with.

        Parameters
        ----------
        registry : `lsst.daf.butler.Registry`
            Registry that can be queried for visit/tract overlaps.
        name : `str`
            SkyMap name used in Gen3 data IDs.
        """
        tracts = set()
        self.tracts[name] = tracts
        for visit in self.visits:
            for dataId in registry.queryDimensions(["tract"], expand=False,
                                                   dataId={"skymap": name,
                                                           "instrument": self.instrument,
                                                           "visit": visit}):
                tracts.add(dataId["tract"])

    def addSkyPix(self, registry: Registry, dimension: SkyPixDimension):
        """Populate the included skypix IDs for the given dimension from those
        that overlap the visits the `ConversionSubset` was initialized with.

        Parameters
        ----------
        registry : `lsst.daf.butler.Registry`
            Registry that can be queried for visit regions.
        dimension : `lsst.daf.butler.SkyPixDimension`
            SkyPix dimension whose pixelization is used to compute the index
            ranges that overlap the visit regions.
        """
        if self.regions is None:
            self.regions = []
            for visit in self.visits:
                dataId = registry.expandDataId(instrument=self.instrument, visit=visit)
                self.regions.append(dataId.region)
        ranges = RangeSet()
        for region in self.regions:
            ranges = ranges.union(dimension.pixelization.envelope(region))
        self.skypix[dimension] = ranges

    def isRelated(self, dataId: DataCoordinate) -> bool:
        """Test whether the given data ID is related to this subset and hence
        should be included in a repository conversion.

        Parameters
        ----------
        dataId : `lsst.daf.butler.DataCoordinate`
            Data ID to test.

        Returns
        -------
        related : `bool`
            `True` if this data ID should be included in a repository
            conversion.

        Notes
        -----
        More formally, this tests that the given data ID is not unrelated;
        if a data ID does not involve tracts, visits, or skypix dimensions,
        we always include it.
        """
        if self.visits is None:
            # We're not filtering at all.
            return True
        if "visit" in dataId.graph and dataId["visit"] not in self.visits:
            return False
        if "tract" in dataId.graph and dataId["tract"] not in self.tracts[dataId["skymap"]]:
            return False
        for dimension, ranges in self.skypix.items():
            if dimension in dataId.graph and not ranges.intersects(dataId[dimension]):
                return False
        return True
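
    # Illustrative usage sketch (not part of the original module): a subset is
    # typically built from an instrument name and a set of visit IDs, expanded
    # with the tracts and skypix ranges that overlap those visits, and then
    # used as a predicate over candidate data IDs.  The registry object,
    # skymap name, and visit IDs below are hypothetical.
    #
    #     subset = ConversionSubset(instrument="HSC", visits={903334, 903336})
    #     subset.addSkyMap(registry, "hsc_rings_v1")  # fills subset.tracts["hsc_rings_v1"]
    #     keep = [dataId for dataId in candidateDataIds if subset.isRelated(dataId)]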

    # Class attributes that will be shadowed by public instance attributes;
    # defined here only for documentation purposes.

    instrument: str
    """The name of the instrument, as used in Gen3 data IDs (`str`).
    """

    visits: Set[int]
    """The set of visit IDs that should be included in the conversion (`set`
    of `int`).
    """

    regions: Optional[List[Region]]
    """Regions for all visits (`list` of `lsst.sphgeom.Region`).

    Set to `None` before it has been initialized.  Any code that attempts to
    use it when it is `None` has a logic bug.
    """

    tracts: Dict[str, Set[int]]
    """Tracts that should be included in the conversion, grouped by skymap
    name (`dict` mapping `str` to `set` of `int`).
    """

    skypix: Dict[SkyPixDimension, RangeSet]
    """SkyPix ranges that should be included in the conversion, grouped by
    dimension (`dict` mapping `SkyPixDimension` to `lsst.sphgeom.RangeSet`).
    """


class RepoConverter(ABC):
    """An abstract base class for objects that help `ConvertRepoTask` convert
    datasets from a single Gen2 repository.

    Parameters
    ----------
    task : `ConvertRepoTask`
        Task instance that is using this helper object.
    root : `str`
        Root of the Gen2 repo being converted.
    run : `str`, optional
        Name of the Gen3 `~lsst.daf.butler.CollectionType.RUN` collection
        that converted datasets will be ingested into by default (see
        `getRun`).
    subset : `ConversionSubset`, optional
        Helper object that implements a filter that restricts the data IDs
        that are converted.

    Notes
    -----
    `RepoConverter` defines the only public API users of its subclasses should
    use (`prep`, `insertDimensionData`, and `ingest`).  These delegate to
    several abstract methods that subclasses must implement.  In some cases,
    subclasses may reimplement the public methods as well, but are expected to
    delegate to ``super()`` either at the beginning or end of their own
    implementation.
    """

    def __init__(self, *, task: ConvertRepoTask, root: str, run: Optional[str],
                 subset: Optional[ConversionSubset] = None):
        self.task = task
        self.root = root
        self.subset = subset
        self._run = run
        self._repoWalker = None  # Created in prep
        self._fileDatasets: MutableMapping[DatasetType, List[FileDataset]] = defaultdict(list)

    @abstractmethod
    def isDatasetTypeSpecial(self, datasetTypeName: str) -> bool:
        """Test whether the given dataset is handled specially by this
        converter and hence should be ignored by generic base-class logic that
        searches for dataset types to convert.

        Parameters
        ----------
        datasetTypeName : `str`
            Name of the dataset type to test.

        Returns
        -------
        special : `bool`
            `True` if the dataset type is special.
        """
        raise NotImplementedError()

    @abstractmethod
    def iterMappings(self) -> Iterator[Tuple[str, CameraMapperMapping]]:
        """Iterate over all `CameraMapper` `Mapping` objects that should be
        considered for conversion by this repository.

        This should include any datasets that may appear in the repository,
        including those that are special (see `isDatasetTypeSpecial`) and
        those that are being ignored (see
        `ConvertRepoTask.isDatasetTypeIncluded`); this allows the converter
        to identify and hence skip these datasets quietly instead of warning
        about them as unrecognized.

        Yields
        ------
        datasetTypeName : `str`
            Name of the dataset type.
        mapping : `lsst.obs.base.mapping.Mapping`
            Mapping object used by the Gen2 `CameraMapper` to describe the
            dataset type.
        """
        raise NotImplementedError()

    @abstractmethod
    def makeRepoWalkerTarget(self, datasetTypeName: str, template: str, keys: Dict[str, type],
                             storageClass: StorageClass,
                             formatter: FormatterParameter = None,
                             targetHandler: Optional[PathElementHandler] = None,
                             ) -> RepoWalker.Target:
        """Make a struct that identifies a dataset type to be extracted by
        walking the repo directory structure.

        Parameters
        ----------
        datasetTypeName : `str`
            Name of the dataset type (the same in both Gen2 and Gen3).
        template : `str`
            The full Gen2 filename template.
        keys : `dict` [`str`, `type`]
            A dictionary mapping Gen2 data ID key to the type of its value.
        storageClass : `lsst.daf.butler.StorageClass`
            Gen3 storage class for this dataset type.
        formatter : `lsst.daf.butler.Formatter` or `str`, optional
            A Gen 3 formatter class or fully-qualified name.
        targetHandler : `PathElementHandler`, optional
            Specialist target handler to use for this dataset type.

        Returns
        -------
        target : `RepoWalker.Target`
            A struct containing information about the target dataset (much of
            it simply forwarded from the arguments).
        """
        raise NotImplementedError()

    def getSpecialDirectories(self) -> List[str]:
        """Return a list of directory paths that should not be searched for
        files.

        These may be directories that simply do not contain datasets (or
        contain datasets in another repository), or directories whose datasets
        are handled specially by a subclass.

        Returns
        -------
        directories : `list` [`str`]
            The full paths of directories to skip, relative to the repository
            root.
        """
        return []
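
    # Illustrative sketch (not part of this module): a hypothetical concrete
    # converter customizes the generic template search by overriding the hooks
    # above rather than `prep` itself.  The class name, dataset type name, and
    # directory below are invented for illustration.
    #
    #     class MyRepoConverter(RepoConverter):
    #
    #         def isDatasetTypeSpecial(self, datasetTypeName: str) -> bool:
    #             # Handled elsewhere by this converter, so the generic
    #             # template search should not warn about it.
    #             return datasetTypeName == "raw"
    #
    #         def getSpecialDirectories(self) -> List[str]:
    #             # Skip nested repositories referenced via _parent links.
    #             return ["_parent"]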

    def prep(self):
        """Perform preparatory work associated with the dataset types to be
        converted from this repository (but not the datasets themselves).

        Notes
        -----
        This should be a relatively fast operation that should not depend on
        the size of the repository.

        Subclasses may override this method, but must delegate to the base
        class implementation at some point in their own logic.
        More often, subclasses will specialize the behavior of `prep` by
        overriding other methods to which the base class implementation
        delegates.  These include:

         - `iterMappings`
         - `isDatasetTypeSpecial`
         - `getSpecialDirectories`
         - `makeRepoWalkerTarget`

        This should not perform any write operations to the Gen3 repository.
        It is guaranteed to be called before `insertDimensionData`.
        """
        self.task.log.info(f"Preparing other dataset types from root {self.root}.")
        walkerInputs: List[Union[RepoWalker.Target, RepoWalker.Skip]] = []
        for datasetTypeName, mapping in self.iterMappings():
            try:
                template = mapping.template
            except RuntimeError:
                # No template for this dataset in this mapper, so there's no
                # way there should be instances of this dataset in this repo.
                continue
            extensions = [""]
            skip = False
            message = None
            storageClass = None
            if (not self.task.isDatasetTypeIncluded(datasetTypeName)
                    or self.isDatasetTypeSpecial(datasetTypeName)):
                # User indicated not to include this data, but we still want
                # to recognize files of that type to avoid warning about them.
                skip = True
            else:
                storageClass = self._guessStorageClass(datasetTypeName, mapping)
                if storageClass is None:
                    # This may be a problem, but only if we actually encounter any
                    # files corresponding to this dataset.  Of course, we need
                    # to be able to parse those files in order to recognize that
                    # situation.
                    message = f"no storage class found for {datasetTypeName}"
                    skip = True
            # Handle files that are compressed on disk, but the gen2 template
            # is just `.fits`.
            if template.endswith(".fits"):
                extensions.extend((".gz", ".fz"))
            for extension in extensions:
                if skip:
                    walkerInput = RepoWalker.Skip(
                        template=template + extension,
                        keys=mapping.keys(),
                        message=message,
                    )
                    self.task.log.debug("Skipping template in walker: %s", template)
                else:
                    assert message is None
                    targetHandler = self.task.config.targetHandlerClasses.get(datasetTypeName)
                    if targetHandler is not None:
                        targetHandler = doImport(targetHandler)
                    walkerInput = self.makeRepoWalkerTarget(
                        datasetTypeName=datasetTypeName,
                        template=template + extension,
                        keys=mapping.keys(),
                        storageClass=storageClass,
                        formatter=self.task.config.formatterClasses.get(datasetTypeName),
                        targetHandler=targetHandler,
                    )
                    self.task.log.debug("Adding template to walker: %s + %s, for %s", template, extension,
                                        walkerInput.datasetType)
                walkerInputs.append(walkerInput)

        for dirPath in self.getSpecialDirectories():
            walkerInputs.append(
                RepoWalker.Skip(
                    template=dirPath,  # not really a template, but that's fine; it's relative to root.
                    keys={},
                    message=None,
                    isForFiles=True,
                )
            )
        fileIgnoreRegExTerms = []
        for pattern in self.task.config.fileIgnorePatterns:
            fileIgnoreRegExTerms.append(fnmatch.translate(pattern))
        if fileIgnoreRegExTerms:
            fileIgnoreRegEx = re.compile("|".join(fileIgnoreRegExTerms))
        else:
            fileIgnoreRegEx = None
        self._repoWalker = RepoWalker(walkerInputs, fileIgnoreRegEx=fileIgnoreRegEx,
                                      log=self.task.log.getChild("repoWalker"))
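
    # Illustrative sketch (not part of this module) of how the ignore patterns
    # above are combined: each fnmatch-style pattern from the config becomes a
    # regular expression, and the alternation of all of them is compiled once
    # and applied to file names during the walk.  The patterns shown here are
    # hypothetical.
    #
    #     import fnmatch
    #     import re
    #     patterns = ["*.log", "README*"]
    #     regex = re.compile("|".join(fnmatch.translate(p) for p in patterns))
    #     regex.match("ingest.log")  # matches -> file is ignored
    #     regex.match("raw.fits")    # no match -> file is considered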

    def iterDatasets(self) -> Iterator[FileDataset]:
        """Iterate over datasets in the repository that should be ingested
        into the Gen3 repository.

        The base class implementation yields nothing; the datasets handled by
        the `RepoConverter` base class itself are read directly in
        `findDatasets`.

        Subclasses should override this method if they support additional
        datasets that are handled some other way.

        Yields
        ------
        dataset : `FileDataset`
            Structures representing datasets to be ingested.  Paths should be
            absolute.
        """
        yield from ()

    def findDatasets(self):
        """Walk the Gen2 repository to find the datasets to convert, grouping
        them by dataset type until `ingest` is called.

        Datasets handled specially by subclasses (via `iterDatasets`) are
        added first; everything else is found by the `RepoWalker` constructed
        in `prep`, optionally filtered by ``self.subset``.
        """
        assert self._repoWalker, "prep() must be called before findDatasets."
        self.task.log.info("Adding special datasets in repo %s.", self.root)
        for dataset in self.iterDatasets():
            assert len(dataset.refs) == 1
            self._fileDatasets[dataset.refs[0].datasetType].append(dataset)
        self.task.log.info("Finding datasets from files in repo %s.", self.root)
        self._fileDatasets.update(
            self._repoWalker.walk(
                self.root,
                predicate=(self.subset.isRelated if self.subset is not None else None)
            )
        )

    def insertDimensionData(self):
        """Insert any dimension records uniquely derived from this repository
        into the registry.

        Subclasses may override this method, but may not need to; the default
        implementation does nothing.

        SkyMap and SkyPix dimensions should instead be handled by calling
        `ConvertRepoTask.useSkyMap` or `ConvertRepoTask.useSkyPix`, because
        these dimensions are in general shared by multiple Gen2 repositories.

        This method is guaranteed to be called between `prep` and
        `expandDataIds`.
        """
        pass

    def expandDataIds(self):
        """Expand the data IDs for all datasets to be inserted.

        Subclasses may override this method, but must delegate to the base
        class implementation if they do.

        This involves queries to the registry, but not writes.  It is
        guaranteed to be called between `insertDimensionData` and `ingest`.
        """
        import itertools
        for datasetType, datasetsForType in self._fileDatasets.items():
            self.task.log.info("Expanding data IDs for %s %s datasets.", len(datasetsForType),
                               datasetType.name)
            expanded = []
            for dataset in datasetsForType:
                for i, ref in enumerate(dataset.refs):
                    try:
                        dataId = self.task.registry.expandDataId(ref.dataId)
                        dataset.refs[i] = ref.expanded(dataId)
                    except LookupError as err:
                        self.task.log.warn("Skipping ingestion for '%s': %s", dataset.path, err)
                        # Remove skipped datasets from multi-extension FileDatasets.
                        dataset.refs[i] = None  # We will strip off the `None`s after the loop.
                dataset.refs[:] = itertools.filterfalse(lambda x: x is None, dataset.refs)
                if dataset.refs:
                    expanded.append(dataset)

            datasetsForType[:] = expanded

    def ingest(self):
        """Insert converted datasets into the Gen3 repository.

        Subclasses may override this method, but must delegate to the base
        class implementation at some point in their own logic.

        This method is guaranteed to be called after `expandDataIds`.
        """
        for datasetType, datasetsForType in self._fileDatasets.items():
            self.task.registry.registerDatasetType(datasetType)
            try:
                run = self.getRun(datasetType.name)
            except LookupError:
                self.task.log.warn(f"No run configured for dataset type {datasetType.name}.")
                continue
            self.task.log.info("Ingesting %s %s datasets into run %s.", len(datasetsForType),
                               datasetType.name, run)
            try:
                self.task.registry.registerRun(run)
                self.task.butler3.ingest(*datasetsForType, transfer=self.task.config.transfer, run=run)
            except LookupError as err:
                raise LookupError(f"Error expanding data ID for dataset type {datasetType.name}.") from err
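
    # Illustrative sketch (not part of this module) of the call order implied
    # by the docstrings above for a single converter instance; in practice the
    # parent `ConvertRepoTask` drives these steps, not user code.
    #
    #     converter.prep()                 # fast; builds the RepoWalker
    #     converter.findDatasets()         # walks the Gen2 repo on disk
    #     converter.insertDimensionData()  # registry writes for dimensions
    #     converter.expandDataIds()        # registry queries only
    #     converter.ingest()               # registers runs and ingests files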

    def getRun(self, datasetTypeName: str) -> str:
        """Return the name of the run into which instances of the given
        dataset type should be inserted by this converter.

        Parameters
        ----------
        datasetTypeName : `str`
            Name of the dataset type.

        Returns
        -------
        run : `str`
            Name of the `~lsst.daf.butler.CollectionType.RUN` collection.
        """
        assert self._run is not None, "Method must be overridden if self._run is allowed to be None"
        return self._run

    def _guessStorageClass(self, datasetTypeName: str, mapping: CameraMapperMapping
                           ) -> Optional[StorageClass]:
        """Infer the Gen3 `StorageClass` for a dataset from a combination of
        configuration and Gen2 dataset type information.

        Parameters
        ----------
        datasetTypeName : `str`
            Name of the dataset type.
        mapping : `lsst.obs.base.mapping.Mapping`
            Mapping object used by the Gen2 `CameraMapper` to describe the
            dataset type.

        Returns
        -------
        storageClass : `lsst.daf.butler.StorageClass` or `None`
            The storage class to use, or `None` if one could not be found.
        """
        storageClassName = self.task.config.storageClasses.get(datasetTypeName)
        if storageClassName is None and mapping.python is not None:
            storageClassName = self.task.config.storageClasses.get(mapping.python, None)
        if storageClassName is None and mapping.persistable is not None:
            storageClassName = self.task.config.storageClasses.get(mapping.persistable, None)
        if storageClassName is None and mapping.python is not None:
            unqualified = mapping.python.split(".")[-1]
            storageClassName = self.task.config.storageClasses.get(unqualified, None)
        if storageClassName is not None:
            storageClass = self.task.butler3.storageClasses.getStorageClass(storageClassName)
        else:
            try:
                storageClass = self.task.butler3.storageClasses.getStorageClass(mapping.persistable)
            except KeyError:
                storageClass = None
        if storageClass is None and mapping.python is not None:
            try:
                storageClass = self.task.butler3.storageClasses.getStorageClass(unqualified)
            except KeyError:
                pass
        if storageClass is None:
            self.task.log.debug("No StorageClass found for %s; skipping.", datasetTypeName)
        else:
            self.task.log.debug("Using StorageClass %s for %s.", storageClass.name, datasetTypeName)
        return storageClass
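
    # Illustrative sketch (not part of this module): the lookup above is
    # config-driven, so an obs package can pin a storage class explicitly in
    # its conversion config.  The dataset type and storage class names below
    # are hypothetical.
    #
    #     # In a ConvertRepoTask config override file:
    #     config.storageClasses["myCatalog"] = "SourceCatalog"
    #
    # Failing that, the mapping's ``python`` type, ``persistable`` name, and
    # unqualified python type name are tried against the config, and finally
    # the ``persistable`` and unqualified names are looked up directly in the
    # Gen3 butler's storage class registry.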

    # Class attributes that will be shadowed by public instance attributes;
    # defined here only for documentation purposes.

    task: ConvertRepoTask
    """The parent task that constructed and uses this converter
    (`ConvertRepoTask`).
    """

    root: str
    """Root path to the Gen2 repository this converter manages (`str`).

    This is a complete path, not relative to some other repository root.
    """

    subset: Optional[ConversionSubset]
    """An object that represents a filter to be applied to the datasets that
    are converted (`ConversionSubset` or `None`).
    """