# This file is part of obs_base.
#
# Developed for the LSST Data Management System.
# This product includes software developed by the LSST Project
# (https://www.lsst.org).
# See the COPYRIGHT file at the top-level directory of this distribution
# for details of code ownership.
#
# This program is free software: you can redistribute it and/or modify
# it under the terms of the GNU General Public License as published by
# the Free Software Foundation, either version 3 of the License, or
# (at your option) any later version.
#
# This program is distributed in the hope that it will be useful,
# but WITHOUT ANY WARRANTY; without even the implied warranty of
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
# GNU General Public License for more details.
#
# You should have received a copy of the GNU General Public License
# along with this program. If not, see <http://www.gnu.org/licenses/>.
from __future__ import annotations

__all__ = ["RepoConverter"]

from dataclasses import dataclass
from collections import defaultdict
from abc import ABC, abstractmethod
import fnmatch
import os.path
import re
from typing import (
    Dict,
    Iterator,
    List,
    MutableMapping,
    Optional,
    Set,
    Tuple,
    Union,
    TYPE_CHECKING,
)

from lsst.utils import doImport
from lsst.daf.butler import DataCoordinate, FileDataset, DatasetType
from lsst.sphgeom import RangeSet, Region
from .repoWalker import RepoWalker

if TYPE_CHECKING:
    from ..mapping import Mapping as CameraMapperMapping  # disambiguate from collections.abc.Mapping
    from .convertRepo import ConvertRepoTask
    from .scanner import PathElementHandler
    from lsst.daf.butler import StorageClass, Registry, SkyPixDimension, FormatterParameter
    from .._instrument import Instrument


@dataclass
class ConversionSubset:
    """A helper class for `ConvertRepoTask` and `RepoConverter` that maintains
    lists of related data ID values that should be included in the conversion.

    Parameters
    ----------
    instrument : `str`
        Instrument name used in Gen3 data IDs.
    visits : `set` of `int`
        Visit IDs that define the filter.
    """

    def __init__(self, instrument: str, visits: Set[int]):
        self.instrument = instrument
        self.visits = visits
        self.regions = None
        self.tracts = {}
        self.skypix = {}

    def addSkyMap(self, registry: Registry, name: str):
        """Populate the included tract IDs for the given skymap from those that
        overlap the visits the `ConversionSubset` was initialized with.

        Parameters
        ----------
        registry : `lsst.daf.butler.Registry`
            Registry that can be queried for visit/tract overlaps.
        name : `str`
            SkyMap name used in Gen3 data IDs.
        """
        tracts = set()
        self.tracts[name] = tracts
        for visit in self.visits:
            for dataId in registry.queryDataIds(["tract"],
                                                dataId={"skymap": name,
                                                        "instrument": self.instrument,
                                                        "visit": visit}):
                tracts.add(dataId["tract"])

    def addSkyPix(self, registry: Registry, dimension: SkyPixDimension):
        """Populate the included skypix IDs for the given dimension from those
        that overlap the visits the `ConversionSubset` was initialized with.

        Parameters
        ----------
        registry : `lsst.daf.butler.Registry`
            Registry that can be queried for visit regions.
        dimension : `lsst.daf.butler.SkyPixDimension`
            SkyPix dimension whose pixelization is used to compute the ranges
            of included pixel IDs.
        """
        if self.regions is None:
            self.regions = []
            for visit in self.visits:
                dataId = registry.expandDataId(instrument=self.instrument, visit=visit)
                self.regions.append(dataId.region)
        ranges = RangeSet()
        for region in self.regions:
            ranges = ranges.union(dimension.pixelization.envelope(region))
        self.skypix[dimension] = ranges

    def isRelated(self, dataId: DataCoordinate) -> bool:
        """Test whether the given data ID is related to this subset and hence
        should be included in a repository conversion.

        Parameters
        ----------
        dataId : `lsst.daf.butler.DataCoordinate`
            Data ID to test.

        Returns
        -------
        related : `bool`
            `True` if this data ID should be included in a repository
            conversion.

        Notes
        -----
        More formally, this tests that the given data ID is not unrelated;
        if a data ID does not involve tracts, visits, or skypix dimensions,
        we always include it.
        """
        if self.visits is None:
            # We're not filtering at all.
            return True
        if "visit" in dataId.graph and dataId["visit"] not in self.visits:
            return False
        if "tract" in dataId.graph and dataId["tract"] not in self.tracts[dataId["skymap"]]:
            return False
        for dimension, ranges in self.skypix.items():
            if dimension in dataId.graph and not ranges.intersects(dataId[dimension]):
                return False
        return True

    # Class attributes that will be shadowed by public instance attributes;
    # defined here only for documentation purposes.

    instrument: str
    """The name of the instrument, as used in Gen3 data IDs (`str`).
    """

    visits: Set[int]
    """The set of visit IDs that should be included in the conversion (`set`
    of `int`).
    """

    regions: Optional[List[Region]]
    """Regions for all visits (`list` of `lsst.sphgeom.Region`).

    Set to `None` before it has been initialized. Any code that attempts to
    use it when it is `None` has a logic bug.
    """

    tracts: Dict[str, Set[int]]
    """Tracts that should be included in the conversion, grouped by skymap
    name (`dict` mapping `str` to `set` of `int`).
    """

    skypix: Dict[SkyPixDimension, RangeSet]
    """SkyPix ranges that should be included in the conversion, grouped by
    dimension (`dict` mapping `SkyPixDimension` to `lsst.sphgeom.RangeSet`).
    """

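# Illustrative usage sketch (not part of the original module): a driver that
# already has a Gen3 ``registry`` might build and apply a `ConversionSubset`
# roughly as follows; the skymap name and visit IDs here are purely
# hypothetical.
#
#     subset = ConversionSubset(instrument="HSC", visits={12345, 12346})
#     subset.addSkyMap(registry, "someSkyMap")
#     keep = [dataId for dataId in candidateDataIds if subset.isRelated(dataId)]
#
# `RepoConverter.findDatasets` (below) passes `ConversionSubset.isRelated` to
# the `RepoWalker` as its predicate in exactly this spirit.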

class RepoConverter(ABC):
    """An abstract base class for objects that help `ConvertRepoTask` convert
    datasets from a single Gen2 repository.

    Parameters
    ----------
    task : `ConvertRepoTask`
        Task instance that is using this helper object.
    root : `str`
        Root of the Gen2 repo being converted. Will be converted to an
        absolute path, resolving symbolic links and ``~``, if necessary.
    instrument : `Instrument`
        Gen3 instrument class to use for this conversion.
    run : `str`, optional
        Name of the `~lsst.daf.butler.CollectionType.RUN` collection that
        converted datasets should be inserted into by default. May be `None`
        if `getRun` is overridden to provide a run for each dataset type.
    subset : `ConversionSubset`, optional
        Helper object that implements a filter that restricts the data IDs
        that are converted.

    Notes
    -----
    `RepoConverter` defines the only public API users of its subclasses should
    use (`prep`, `insertDimensionData`, and `ingest`). These delegate to
    several abstract methods that subclasses must implement. In some cases,
    subclasses may reimplement the public methods as well, but are expected to
    delegate to ``super()`` either at the beginning or end of their own
    implementation.
    """

    def __init__(self, *, task: ConvertRepoTask, root: str, instrument: Instrument, run: Optional[str],
                 subset: Optional[ConversionSubset] = None):
        self.task = task
        self.root = os.path.realpath(os.path.expanduser(root))
        self.instrument = instrument
        self.subset = subset
        self._run = run
        self._repoWalker = None  # Created in prep
        self._fileDatasets: MutableMapping[DatasetType, List[FileDataset]] = defaultdict(list)

    @abstractmethod
    def isDatasetTypeSpecial(self, datasetTypeName: str) -> bool:
        """Test whether the given dataset is handled specially by this
        converter and hence should be ignored by generic base-class logic that
        searches for dataset types to convert.

        Parameters
        ----------
        datasetTypeName : `str`
            Name of the dataset type to test.

        Returns
        -------
        special : `bool`
            `True` if the dataset type is special.
        """
        raise NotImplementedError()

    @abstractmethod
    def iterMappings(self) -> Iterator[Tuple[str, CameraMapperMapping]]:
        """Iterate over all `CameraMapper` `Mapping` objects that should be
        considered for conversion from this repository.

        This should include any datasets that may appear in the repository,
        including those that are special (see `isDatasetTypeSpecial`) and
        those that are being ignored (see
        `ConvertRepoTask.isDatasetTypeIncluded`); this allows the converter
        to identify and hence skip these datasets quietly instead of warning
        about them as unrecognized.

        Yields
        ------
        datasetTypeName : `str`
            Name of the dataset type.
        mapping : `lsst.obs.base.mapping.Mapping`
            Mapping object used by the Gen2 `CameraMapper` to describe the
            dataset type.
        """
        raise NotImplementedError()

    @abstractmethod
    def makeRepoWalkerTarget(self, datasetTypeName: str, template: str, keys: Dict[str, type],
                             storageClass: StorageClass,
                             formatter: FormatterParameter = None,
                             targetHandler: Optional[PathElementHandler] = None,
                             ) -> RepoWalker.Target:
        """Make a struct that identifies a dataset type to be extracted by
        walking the repo directory structure.

        Parameters
        ----------
        datasetTypeName : `str`
            Name of the dataset type (the same in both Gen2 and Gen3).
        template : `str`
            The full Gen2 filename template.
        keys : `dict` [`str`, `type`]
            A dictionary mapping Gen2 data ID key to the type of its value.
        storageClass : `lsst.daf.butler.StorageClass`
            Gen3 storage class for this dataset type.
        formatter : `lsst.daf.butler.Formatter` or `str`, optional
            A Gen 3 formatter class or fully-qualified name.
        targetHandler : `PathElementHandler`, optional
            Specialist target handler to use for this dataset type.

        Returns
        -------
        target : `RepoWalker.Target`
            A struct containing information about the target dataset (much of
            it simply forwarded from the arguments).
        """
        raise NotImplementedError()

    def getSpecialDirectories(self) -> List[str]:
        """Return a list of directory paths that should not be searched for
        files.

        These may be directories that simply do not contain datasets (or
        contain datasets in another repository), or directories whose datasets
        are handled specially by a subclass.

        Returns
        -------
        directories : `list` [`str`]
            The full paths of directories to skip, relative to the repository
            root.
        """
        return []

    def prep(self):
        """Perform preparatory work associated with the dataset types to be
        converted from this repository (but not the datasets themselves).

        Notes
        -----
        This should be a relatively fast operation that should not depend on
        the size of the repository.

        Subclasses may override this method, but must delegate to the base
        class implementation at some point in their own logic.
        More often, subclasses will specialize the behavior of `prep` by
        overriding other methods to which the base class implementation
        delegates. These include:

        - `iterMappings`
        - `isDatasetTypeSpecial`
        - `getSpecialDirectories`
        - `makeRepoWalkerTarget`

        This should not perform any write operations to the Gen3 repository.
        It is guaranteed to be called before `insertDimensionData`.
        """
        self.task.log.info(f"Preparing other dataset types from root {self.root}.")
        walkerInputs: List[Union[RepoWalker.Target, RepoWalker.Skip]] = []
        for datasetTypeName, mapping in self.iterMappings():
            try:
                template = mapping.template
            except RuntimeError:
                # No template for this dataset in this mapper, so there's no
                # way there should be instances of this dataset in this repo.
                continue
            extensions = [""]
            skip = False
            message = None
            storageClass = None
            if (not self.task.isDatasetTypeIncluded(datasetTypeName)
                    or self.isDatasetTypeSpecial(datasetTypeName)):
                # User indicated not to include this data, but we still want
                # to recognize files of that type to avoid warning about them.
                skip = True
            else:
                storageClass = self._guessStorageClass(datasetTypeName, mapping)
                if storageClass is None:
                    # This may be a problem, but only if we actually encounter
                    # any files corresponding to this dataset. Of course, we
                    # need to be able to parse those files in order to
                    # recognize that situation.
                    message = f"no storage class found for {datasetTypeName}"
                    skip = True
            # Handle files that are compressed on disk, but the gen2 template
            # is just `.fits`.
            if template.endswith(".fits"):
                extensions.extend((".gz", ".fz"))
            for extension in extensions:
                if skip:
                    walkerInput = RepoWalker.Skip(
                        template=template + extension,
                        keys=mapping.keys(),
                        message=message,
                    )
                    self.task.log.debug("Skipping template in walker: %s", template)
                else:
                    assert message is None
                    targetHandler = self.task.config.targetHandlerClasses.get(datasetTypeName)
                    if targetHandler is not None:
                        targetHandler = doImport(targetHandler)
                    walkerInput = self.makeRepoWalkerTarget(
                        datasetTypeName=datasetTypeName,
                        template=template + extension,
                        keys=mapping.keys(),
                        storageClass=storageClass,
                        formatter=self.task.config.formatterClasses.get(datasetTypeName),
                        targetHandler=targetHandler,
                    )
                    self.task.log.debug("Adding template to walker: %s + %s, for %s", template, extension,
                                        walkerInput.datasetType)
                walkerInputs.append(walkerInput)

        for dirPath in self.getSpecialDirectories():
            walkerInputs.append(
                RepoWalker.Skip(
                    template=dirPath,  # not really a template, but that's fine; it's relative to root.
                    keys={},
                    message=None,
                    isForFiles=True,
                )
            )
        fileIgnoreRegExTerms = []
        for pattern in self.task.config.fileIgnorePatterns:
            fileIgnoreRegExTerms.append(fnmatch.translate(pattern))
        if fileIgnoreRegExTerms:
            fileIgnoreRegEx = re.compile("|".join(fileIgnoreRegExTerms))
        else:
            fileIgnoreRegEx = None
        self._repoWalker = RepoWalker(walkerInputs, fileIgnoreRegEx=fileIgnoreRegEx,
                                      log=self.task.log.getChild("repoWalker"))

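    # Illustrative note (not part of the original module): the entries in
    # ``fileIgnorePatterns`` are shell-style globs, combined by `prep` above
    # into a single regular expression via `fnmatch.translate`. For example, a
    # hypothetical configuration of ["*.log", "README*"] would yield roughly
    #
    #     re.compile("|".join(fnmatch.translate(p) for p in ["*.log", "README*"]))
    #
    # which matches file names ending in ".log" or starting with "README".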

    def iterDatasets(self) -> Iterator[FileDataset]:
        """Iterate over datasets in the repository that should be ingested
        into the Gen3 repository.

        The base class implementation yields nothing; the datasets handled by
        the `RepoConverter` base class itself are read directly in
        `findDatasets`.

        Subclasses should override this method if they support additional
        datasets that are handled some other way.

        Yields
        ------
        dataset : `FileDataset`
            Structures representing datasets to be ingested. Paths should be
            absolute.
        """
        yield from ()

    def findDatasets(self):
        """Scan the repository for datasets to convert, populating the
        internal per-dataset-type mapping from both `iterDatasets` and the
        `RepoWalker` constructed in `prep`.
        """
        assert self._repoWalker, "prep() must be called before findDatasets."
        self.task.log.info("Adding special datasets in repo %s.", self.root)
        for dataset in self.iterDatasets():
            assert len(dataset.refs) == 1
            self._fileDatasets[dataset.refs[0].datasetType].append(dataset)
        self.task.log.info("Finding datasets from files in repo %s.", self.root)
        self._fileDatasets.update(
            self._repoWalker.walk(
                self.root,
                predicate=(self.subset.isRelated if self.subset is not None else None)
            )
        )

    def insertDimensionData(self):
        """Insert any dimension records uniquely derived from this repository
        into the registry.

        Subclasses may override this method, but may not need to; the default
        implementation does nothing.

        SkyMap and SkyPix dimensions should instead be handled by calling
        `ConvertRepoTask.useSkyMap` or `ConvertRepoTask.useSkyPix`, because
        these dimensions are in general shared by multiple Gen2 repositories.

        This method is guaranteed to be called between `prep` and
        `expandDataIds`.
        """
        pass

    def expandDataIds(self):
        """Expand the data IDs for all datasets to be inserted.

        Subclasses may override this method, but must delegate to the base
        class implementation if they do.

        This involves queries to the registry, but not writes. It is
        guaranteed to be called between `insertDimensionData` and `ingest`.
        """
        import itertools
        for datasetType, datasetsForType in self._fileDatasets.items():
            self.task.log.info("Expanding data IDs for %s %s datasets.", len(datasetsForType),
                               datasetType.name)
            expanded = []
            for dataset in datasetsForType:
                for i, ref in enumerate(dataset.refs):
                    try:
                        dataId = self.task.registry.expandDataId(ref.dataId)
                        dataset.refs[i] = ref.expanded(dataId)
                    except LookupError as err:
                        self.task.log.warn("Skipping ingestion for '%s': %s", dataset.path, err)
                        # Remove skipped datasets from multi-extension
                        # FileDatasets.
                        dataset.refs[i] = None  # We will strip off the `None`s after the loop.
                dataset.refs[:] = itertools.filterfalse(lambda x: x is None, dataset.refs)
                if dataset.refs:
                    expanded.append(dataset)

            datasetsForType[:] = expanded

    def ingest(self):
        """Insert converted datasets into the Gen3 repository.

        Subclasses may override this method, but must delegate to the base
        class implementation at some point in their own logic.

        This method is guaranteed to be called after `expandDataIds`.
        """
        for datasetType, datasetsForType in self._fileDatasets.items():
            self.task.registry.registerDatasetType(datasetType)
            try:
                run = self.getRun(datasetType.name)
            except LookupError:
                self.task.log.warn(f"No run configured for dataset type {datasetType.name}.")
                continue
            self.task.log.info("Ingesting %s %s datasets into run %s.", len(datasetsForType),
                               datasetType.name, run)
            try:
                self.task.registry.registerRun(run)
                self.task.butler3.ingest(*datasetsForType, transfer=self.task.config.transfer, run=run)
            except LookupError as err:
                raise LookupError(f"Error expanding data ID for dataset type {datasetType.name}.") from err

    def getRun(self, datasetTypeName: str) -> str:
        """Return the name of the run into which instances of the given
        dataset type should be inserted.

        Parameters
        ----------
        datasetTypeName : `str`
            Name of the dataset type.

        Returns
        -------
        run : `str`
            Name of the `~lsst.daf.butler.CollectionType.RUN` collection.
        """
        assert self._run is not None, "Method must be overridden if self._run is allowed to be None"
        return self._run

    def _guessStorageClass(self, datasetTypeName: str, mapping: CameraMapperMapping
                           ) -> Optional[StorageClass]:
        """Infer the Gen3 `StorageClass` for a dataset from a combination of
        configuration and Gen2 dataset type information.

        Parameters
        ----------
        datasetTypeName : `str`
            Name of the dataset type.
        mapping : `lsst.obs.base.mapping.Mapping`
            Mapping object used by the Gen2 `CameraMapper` to describe the
            dataset type.

        Returns
        -------
        storageClass : `lsst.daf.butler.StorageClass` or `None`
            The inferred storage class, or `None` if none could be found.
        """
        storageClassName = self.task.config.storageClasses.get(datasetTypeName)
        if storageClassName is None and mapping.python is not None:
            storageClassName = self.task.config.storageClasses.get(mapping.python, None)
        if storageClassName is None and mapping.persistable is not None:
            storageClassName = self.task.config.storageClasses.get(mapping.persistable, None)
        if storageClassName is None and mapping.python is not None:
            unqualified = mapping.python.split(".")[-1]
            storageClassName = self.task.config.storageClasses.get(unqualified, None)
        if storageClassName is not None:
            storageClass = self.task.butler3.storageClasses.getStorageClass(storageClassName)
        else:
            try:
                storageClass = self.task.butler3.storageClasses.getStorageClass(mapping.persistable)
            except KeyError:
                storageClass = None
            if storageClass is None and mapping.python is not None:
                try:
                    storageClass = self.task.butler3.storageClasses.getStorageClass(unqualified)
                except KeyError:
                    pass
        if storageClass is None:
            self.task.log.debug("No StorageClass found for %s; skipping.", datasetTypeName)
        else:
            self.task.log.debug("Using StorageClass %s for %s.", storageClass.name, datasetTypeName)
        return storageClass

    # Class attributes that will be shadowed by public instance attributes;
    # defined here only for documentation purposes.

    task: ConvertRepoTask
    """The parent task that constructed and uses this converter
    (`ConvertRepoTask`).
    """

    root: str
    """Root path to the Gen2 repository this converter manages (`str`).

    This is a complete path, not relative to some other repository root.
    """

    subset: Optional[ConversionSubset]
    """An object that represents a filter to be applied to the datasets that
    are converted (`ConversionSubset` or `None`).
    """