Coverage for python/lsst/obs/base/gen2to3/repoConverter.py: 18%

213 statements  

# This file is part of obs_base.
#
# Developed for the LSST Data Management System.
# This product includes software developed by the LSST Project
# (https://www.lsst.org).
# See the COPYRIGHT file at the top-level directory of this distribution
# for details of code ownership.
#
# This program is free software: you can redistribute it and/or modify
# it under the terms of the GNU General Public License as published by
# the Free Software Foundation, either version 3 of the License, or
# (at your option) any later version.
#
# This program is distributed in the hope that it will be useful,
# but WITHOUT ANY WARRANTY; without even the implied warranty of
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
# GNU General Public License for more details.
#
# You should have received a copy of the GNU General Public License
# along with this program. If not, see <http://www.gnu.org/licenses/>.
from __future__ import annotations

__all__ = ["RepoConverter"]

from dataclasses import dataclass
from collections import defaultdict
from abc import ABC, abstractmethod
import fnmatch
import os.path
import re
from typing import (
    Dict,
    Iterator,
    List,
    Mapping,
    Optional,
    Set,
    Tuple,
    Union,
    TYPE_CHECKING,
)

from lsst.utils import doImport
from lsst.daf.butler import DataCoordinate, FileDataset, DatasetType, Progress
from lsst.sphgeom import RangeSet, Region
from .repoWalker import RepoWalker
from ..ingest import _log_msg_counter


if TYPE_CHECKING:
    from ..mapping import Mapping as CameraMapperMapping  # disambiguate from collections.abc.Mapping
    from .convertRepo import ConvertRepoTask
    from .scanner import PathElementHandler
    from lsst.daf.butler import StorageClass, Registry, SkyPixDimension, FormatterParameter
    from .._instrument import Instrument


@dataclass
class ConversionSubset:
    """A helper class for `ConvertRepoTask` and `RepoConverter` that maintains
    lists of related data ID values that should be included in the conversion.

    Parameters
    ----------
    instrument : `str`
        Instrument name used in Gen3 data IDs.
    visits : `set` of `int`
        Visit IDs that define the filter.
    """

    def __init__(self, instrument: str, visits: Set[int]):
        self.instrument = instrument
        self.visits = visits
        self.regions = None
        self.tracts = {}
        self.skypix = {}

    def addSkyMap(self, registry: Registry, name: str):
        """Populate the included tract IDs for the given skymap from those
        that overlap the visits the `ConversionSubset` was initialized with.

        Parameters
        ----------
        registry : `lsst.daf.butler.Registry`
            Registry that can be queried for visit/tract overlaps.
        name : `str`
            SkyMap name used in Gen3 data IDs.
        """
        tracts = set()
        self.tracts[name] = tracts
        for visit in self.visits:
            for dataId in registry.queryDataIds(["tract"],
                                                dataId={"skymap": name,
                                                        "instrument": self.instrument,
                                                        "visit": visit}):
                tracts.add(dataId["tract"])

    def addSkyPix(self, registry: Registry, dimension: SkyPixDimension):
        """Populate the included skypix IDs for the given dimension from those
        that overlap the visits the `ConversionSubset` was initialized with.

        Parameters
        ----------
        registry : `lsst.daf.butler.Registry`
            Registry that can be queried for visit regions.
        dimension : `lsst.daf.butler.SkyPixDimension`
            SkyPix dimension whose pixel ID ranges should be computed.
        """
        if self.regions is None:
            self.regions = []
            for visit in self.visits:
                dataId = registry.expandDataId(instrument=self.instrument, visit=visit)
                self.regions.append(dataId.region)
        ranges = RangeSet()
        for region in self.regions:
            ranges = ranges.union(dimension.pixelization.envelope(region))
        self.skypix[dimension] = ranges

    def isRelated(self, dataId: DataCoordinate) -> bool:
        """Test whether the given data ID is related to this subset and hence
        should be included in a repository conversion.

        Parameters
        ----------
        dataId : `lsst.daf.butler.DataCoordinate`
            Data ID to test.

        Returns
        -------
        related : `bool`
            `True` if this data ID should be included in a repository
            conversion.

        Notes
        -----
        More formally, this tests that the given data ID is not unrelated;
        if a data ID does not involve tracts, visits, or skypix dimensions,
        we always include it.
        """
        if self.visits is None:
            # We're not filtering at all.
            return True
        if "visit" in dataId.graph and dataId["visit"] not in self.visits:
            return False
        if "tract" in dataId.graph and dataId["tract"] not in self.tracts[dataId["skymap"]]:
            return False
        for dimension, ranges in self.skypix.items():
            if dimension in dataId.graph and not ranges.intersects(dataId[dimension]):
                return False
        return True

    # Class attributes that will be shadowed by public instance attributes;
    # defined here only for documentation purposes.

    instrument: str
    """The name of the instrument, as used in Gen3 data IDs (`str`).
    """

    visits: Set[int]
    """The set of visit IDs that should be included in the conversion (`set`
    of `int`).
    """

    regions: Optional[List[Region]]
    """Regions for all visits (`list` of `lsst.sphgeom.Region`).

    Set to `None` before it has been initialized. Any code that attempts to
    use it when it is `None` has a logic bug.
    """

    tracts: Dict[str, Set[int]]
    """Tracts that should be included in the conversion, grouped by skymap
    name (`dict` mapping `str` to `set` of `int`).
    """

    skypix: Dict[SkyPixDimension, RangeSet]
    """SkyPix ranges that should be included in the conversion, grouped by
    dimension (`dict` mapping `SkyPixDimension` to `lsst.sphgeom.RangeSet`).
    """


class RepoConverter(ABC):
    """An abstract base class for objects that help `ConvertRepoTask` convert
    datasets from a single Gen2 repository.

    Parameters
    ----------
    task : `ConvertRepoTask`
        Task instance that is using this helper object.
    root : `str`
        Root of the Gen2 repo being converted. Will be converted to an
        absolute path, resolving symbolic links and ``~``, if necessary.
    instrument : `Instrument`
        Gen3 instrument class to use for this conversion.
    run : `str` or `None`
        Name of the Gen3 `~lsst.daf.butler.CollectionType.RUN` collection
        that converted datasets are ingested into by default. May only be
        `None` if `getRun` is overridden by the subclass.
    subset : `ConversionSubset`, optional
        Helper object that implements a filter that restricts the data IDs
        that are converted.

    Notes
    -----
    `RepoConverter` defines the only public API users of its subclasses should
    use (`prep`, `ingest`, and `finish`). These delegate to several abstract
    methods that subclasses must implement. In some cases, subclasses may
    reimplement the public methods as well, but are expected to delegate to
    ``super()`` either at the beginning or end of their own implementation.
    """

    def __init__(self, *, task: ConvertRepoTask, root: str, instrument: Instrument, run: Optional[str],
                 subset: Optional[ConversionSubset] = None):
        self.task = task
        self.root = os.path.realpath(os.path.expanduser(root))
        self.instrument = instrument
        self.subset = subset
        self.progress = Progress("obs.base.gen2to3")
        self._run = run
        self._repoWalker = None  # Created in prep
        self._fileDatasets: Mapping[DatasetType, Mapping[Optional[str], List[FileDataset]]] \
            = defaultdict(lambda: defaultdict(list))
        self._fileDatasetCount = 0

    @abstractmethod
    def isDatasetTypeSpecial(self, datasetTypeName: str) -> bool:
        """Test whether the given dataset is handled specially by this
        converter and hence should be ignored by generic base-class logic that
        searches for dataset types to convert.

        Parameters
        ----------
        datasetTypeName : `str`
            Name of the dataset type to test.

        Returns
        -------
        special : `bool`
            `True` if the dataset type is special.
        """
        raise NotImplementedError()

    @abstractmethod
    def iterMappings(self) -> Iterator[Tuple[str, CameraMapperMapping]]:
        """Iterate over all `CameraMapper` `Mapping` objects that should be
        considered for conversion by this repository.

        This should include any datasets that may appear in the repository,
        including those that are special (see `isDatasetTypeSpecial`) and
        those that are being ignored (see
        `ConvertRepoTask.isDatasetTypeIncluded`); this allows the converter
        to identify and hence skip these datasets quietly instead of warning
        about them as unrecognized.

        Yields
        ------
        datasetTypeName : `str`
            Name of the dataset type.
        mapping : `lsst.obs.base.mapping.Mapping`
            Mapping object used by the Gen2 `CameraMapper` to describe the
            dataset type.
        """
        raise NotImplementedError()

    @abstractmethod
    def makeRepoWalkerTarget(self, datasetTypeName: str, template: str, keys: Dict[str, type],
                             storageClass: StorageClass,
                             formatter: FormatterParameter = None,
                             targetHandler: Optional[PathElementHandler] = None,
                             ) -> RepoWalker.Target:
        """Make a struct that identifies a dataset type to be extracted by
        walking the repo directory structure.

        Parameters
        ----------
        datasetTypeName : `str`
            Name of the dataset type (the same in both Gen2 and Gen3).
        template : `str`
            The full Gen2 filename template.
        keys : `dict` [`str`, `type`]
            A dictionary mapping Gen2 data ID key to the type of its value.
        storageClass : `lsst.daf.butler.StorageClass`
            Gen3 storage class for this dataset type.
        formatter : `lsst.daf.butler.Formatter` or `str`, optional
            A Gen 3 formatter class or fully-qualified name.
        targetHandler : `PathElementHandler`, optional
            Specialist target handler to use for this dataset type.

        Returns
        -------
        target : `RepoWalker.Target`
            A struct containing information about the target dataset (much of
            it simply forwarded from the arguments).
        """
        raise NotImplementedError()

    def getSpecialDirectories(self) -> List[str]:
        """Return a list of directory paths that should not be searched for
        files.

        These may be directories that simply do not contain datasets (or
        contain datasets in another repository), or directories whose datasets
        are handled specially by a subclass.

        Returns
        -------
        directories : `list` [`str`]
            The full paths of directories to skip, relative to the repository
            root.
        """
        return []

    def prep(self):
        """Perform preparatory work associated with the dataset types to be
        converted from this repository (but not the datasets themselves).

        Notes
        -----
        This should be a relatively fast operation that should not depend on
        the size of the repository.

        Subclasses may override this method, but must delegate to the base
        class implementation at some point in their own logic.
        More often, subclasses will specialize the behavior of `prep` by
        overriding other methods to which the base class implementation
        delegates. These include:

        - `iterMappings`
        - `isDatasetTypeSpecial`
        - `getSpecialDirectories`
        - `makeRepoWalkerTarget`

        This should not perform any write operations to the Gen3 repository.
        It is guaranteed to be called before `ingest`.
        """
        self.task.log.info("Preparing other dataset types from root %s.", self.root)
        walkerInputs: List[Union[RepoWalker.Target, RepoWalker.Skip]] = []
        for datasetTypeName, mapping in self.iterMappings():
            try:
                template = self.task.config.datasetTemplateOverrides.get(datasetTypeName, mapping.template)
            except RuntimeError:
                # No template for this dataset in this mapper, so there's no
                # way there should be instances of this dataset in this repo.
                continue
            extensions = [""]
            skip = False
            message = None
            storageClass = None
            if (not self.task.isDatasetTypeIncluded(datasetTypeName)
                    or self.isDatasetTypeSpecial(datasetTypeName)):
                # User indicated not to include this data, but we still want
                # to recognize files of that type to avoid warning about them.
                skip = True
            else:
                storageClass = self._guessStorageClass(datasetTypeName, mapping)
                if storageClass is None:
                    # This may be a problem, but only if we actually encounter
                    # any files corresponding to this dataset. Of course, we
                    # need to be able to parse those files in order to
                    # recognize that situation.
                    message = f"no storage class found for {datasetTypeName}"
                    skip = True
            # Handle files that are compressed on disk, but the gen2 template
            # is just `.fits`
            if template.endswith(".fits"):
                extensions.extend((".gz", ".fz"))
            for extension in extensions:
                if skip:
                    walkerInput = RepoWalker.Skip(
                        template=template+extension,
                        keys=mapping.keys(),
                        message=message,
                    )
                    self.task.log.debug("Skipping template in walker: %s", template)
                else:
                    assert message is None
                    targetHandler = self.task.config.targetHandlerClasses.get(datasetTypeName)
                    if targetHandler is not None:
                        targetHandler = doImport(targetHandler)
                    walkerInput = self.makeRepoWalkerTarget(
                        datasetTypeName=datasetTypeName,
                        template=template+extension,
                        keys=mapping.keys(),
                        storageClass=storageClass,
                        formatter=self.task.config.formatterClasses.get(datasetTypeName),
                        targetHandler=targetHandler,
                    )
                    self.task.log.debug("Adding template to walker: %s + %s, for %s", template, extension,
                                        walkerInput.datasetType)
                walkerInputs.append(walkerInput)

        for dirPath in self.getSpecialDirectories():
            walkerInputs.append(
                RepoWalker.Skip(
                    template=dirPath,  # not really a template, but that's fine; it's relative to root.
                    keys={},
                    message=None,
                    isForFiles=True,
                )
            )
        fileIgnoreRegExTerms = []
        for pattern in self.task.config.fileIgnorePatterns:
            fileIgnoreRegExTerms.append(fnmatch.translate(pattern))
        if fileIgnoreRegExTerms:
            fileIgnoreRegEx = re.compile("|".join(fileIgnoreRegExTerms))
        else:
            fileIgnoreRegEx = None
        self._repoWalker = RepoWalker(walkerInputs, fileIgnoreRegEx=fileIgnoreRegEx,
                                      log=self.task.log.getChild("repoWalker"),
                                      progress=self.progress)

    def iterDatasets(self) -> Iterator[FileDataset]:
        """Iterate over datasets in the repository that should be ingested
        into the Gen3 repository.

        The base class implementation yields nothing; the datasets handled by
        the `RepoConverter` base class itself are read directly in
        `findDatasets`.

        Subclasses should override this method if they support additional
        datasets that are handled some other way.

        Yields
        ------
        dataset : `FileDataset`
            Structures representing datasets to be ingested. Paths should be
            absolute.
        """
        yield from ()

    def findDatasets(self):
        """Walk the Gen2 repository to find datasets to convert, recording
        them internally for later data ID expansion and ingestion.
        """
        assert self._repoWalker, "prep() must be called before findDatasets."
        self.task.log.info("Adding special datasets in repo %s.", self.root)
        for dataset in self.iterDatasets():
            assert len(dataset.refs) == 1
            # None index below is for calibDate, which is only relevant for
            # CalibRepoConverter.
            self._fileDatasets[dataset.refs[0].datasetType][None].append(dataset)
        self.task.log.info("Finding datasets from files in repo %s.", self.root)
        datasetsByTypeAndCalibDate = self._repoWalker.walk(
            self.root,
            predicate=(self.subset.isRelated if self.subset is not None else None)
        )
        for datasetType, datasetsByCalibDate in datasetsByTypeAndCalibDate.items():
            for calibDate, datasets in datasetsByCalibDate.items():
                self._fileDatasets[datasetType][calibDate].extend(datasets)
                self._fileDatasetCount += len(datasets)

    def expandDataIds(self):
        """Expand the data IDs for all datasets to be inserted.

        Subclasses may override this method, but must delegate to the base
        class implementation if they do.

        This involves queries to the registry, but not writes. It is
        guaranteed to be called between `findDatasets` and `ingest`.
        """
        import itertools
        with self.progress.bar(desc="Expanding data IDs", total=self._fileDatasetCount) as progressBar:
            for datasetType, datasetsByCalibDate in self._fileDatasets.items():
                for calibDate, datasetsForCalibDate in datasetsByCalibDate.items():
                    if calibDate is not None:
                        self.task.log.info("Expanding data IDs for %d dataset%s of type %s at calibDate %s.",
                                           *_log_msg_counter(datasetsForCalibDate),
                                           datasetType.name,
                                           calibDate)
                    else:
                        self.task.log.info("Expanding data IDs for %d non-calibration dataset%s of type %s.",
                                           *_log_msg_counter(datasetsForCalibDate),
                                           datasetType.name)
                    expanded = []
                    for dataset in datasetsForCalibDate:
                        for i, ref in enumerate(dataset.refs):
                            self.task.log.debug("Expanding data ID %s.", ref.dataId)
                            try:
                                dataId = self.task.registry.expandDataId(ref.dataId)
                                dataset.refs[i] = ref.expanded(dataId)
                            except LookupError as err:
                                self.task.log.warning("Skipping ingestion for '%s': %s", dataset.path, err)
                                # Remove skipped datasets from multi-extension
                                # FileDatasets
                                dataset.refs[i] = None  # We will strip off the `None`s after the loop.
                        dataset.refs[:] = itertools.filterfalse(lambda x: x is None, dataset.refs)
                        if dataset.refs:
                            expanded.append(dataset)
                        progressBar.update()
                    datasetsForCalibDate[:] = expanded

    def ingest(self):
        """Insert converted datasets into the Gen3 repository.

        Subclasses may override this method, but must delegate to the base
        class implementation at some point in their own logic.

        This method is guaranteed to be called after `expandDataIds`.
        """
        with self.progress.bar(desc="Ingesting converted datasets",
                               total=self._fileDatasetCount) as progressBar:
            for datasetType, datasetsByCalibDate in self._fileDatasets.items():
                self.task.registry.registerDatasetType(datasetType)
                for calibDate, datasetsForCalibDate in datasetsByCalibDate.items():
                    try:
                        run = self.getRun(datasetType.name, calibDate)
                    except LookupError:
                        self.task.log.warning(f"No run configured for dataset type {datasetType.name}.")
                        continue
                    self.task.log.info("Ingesting %d dataset%s into run %s of type %s.",
                                       *_log_msg_counter(datasetsForCalibDate), run, datasetType.name)
                    try:
                        self.task.registry.registerRun(run)
                        self.task.butler3.ingest(*datasetsForCalibDate, transfer=self.task.config.transfer,
                                                 run=run)
                        progressBar.update(len(datasetsForCalibDate))
                    except LookupError as err:
                        raise LookupError(
                            f"Error expanding data ID for dataset type {datasetType.name}."
                        ) from err

    def finish(self) -> None:
        """Finish conversion of a repository.

        This is run after ``ingest``, and delegates to `_finish`, which should
        be overridden by derived classes instead of this method.
        """
        self._finish(self._fileDatasets, self._fileDatasetCount)

    def _finish(self, datasets: Mapping[DatasetType, Mapping[Optional[str], List[FileDataset]]],
                count: int) -> None:
        """Subclass implementation hook for `finish`.

        The default implementation does nothing. This is generally the best
        place to define and populate non-``RUN`` collections that may contain
        some of the datasets that have just been ingested.

        Parameters
        ----------
        datasets : `Mapping`
            Nested mapping containing all converted datasets. The outer
            mapping keys are `DatasetType` instances. Values are mappings
            from ``calibDate`` or `None` to a `list` of `FileDataset`
            instances.
        count : `int`
            Total number of `FileDataset` instances in ``datasets``.
        """
        pass

    def getRun(self, datasetTypeName: str, calibDate: Optional[str] = None) -> str:
        """Return the name of the run into which instances of the given
        dataset type should be inserted.

        Parameters
        ----------
        datasetTypeName : `str`
            Name of the dataset type.
        calibDate : `str`, optional
            If not `None`, the "CALIBDATE" associated with this (calibration)
            dataset in the Gen2 data repository.

        Returns
        -------
        run : `str`
            Name of the `~lsst.daf.butler.CollectionType.RUN` collection.
        """
        assert self._run is not None, "Method must be overridden if self._run is allowed to be None"
        assert calibDate is None, "Method must be overridden if calibDate is allowed to be not None"
        return self._run
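
    # Illustrative sketch (not necessarily how the concrete converters in this
    # package are implemented): a subclass that ingests calibrations could
    # route datasets to per-calibDate RUN collections by overriding `getRun`,
    # for example:
    #
    #     def getRun(self, datasetTypeName, calibDate=None):
    #         if calibDate is None:
    #             return super().getRun(datasetTypeName)
    #         return f"{self._run}/{calibDate}"  # placeholder naming scheme
    #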

    def _guessStorageClass(self, datasetTypeName: str, mapping: CameraMapperMapping
                           ) -> Optional[StorageClass]:
        """Infer the Gen3 `StorageClass` for a dataset type from a combination
        of configuration and Gen2 dataset type information.

        Parameters
        ----------
        datasetTypeName : `str`
            Name of the dataset type.
        mapping : `lsst.obs.base.mapping.Mapping`
            Mapping object used by the Gen2 `CameraMapper` to describe the
            dataset type.

        Returns
        -------
        storageClass : `lsst.daf.butler.StorageClass` or `None`
            The storage class to use, or `None` if none could be found.
        """
        storageClassName = self.task.config.storageClasses.get(datasetTypeName)
        if storageClassName is None and mapping.python is not None:
            storageClassName = self.task.config.storageClasses.get(mapping.python, None)
        if storageClassName is None and mapping.persistable is not None:
            storageClassName = self.task.config.storageClasses.get(mapping.persistable, None)
        if storageClassName is None and mapping.python is not None:
            unqualified = mapping.python.split(".")[-1]
            storageClassName = self.task.config.storageClasses.get(unqualified, None)
        if storageClassName is not None:
            storageClass = self.task.butler3.storageClasses.getStorageClass(storageClassName)
        else:
            try:
                storageClass = self.task.butler3.storageClasses.getStorageClass(mapping.persistable)
            except KeyError:
                storageClass = None
            if storageClass is None and mapping.python is not None:
                try:
                    storageClass = self.task.butler3.storageClasses.getStorageClass(unqualified)
                except KeyError:
                    pass
            if storageClass is None:
                self.task.log.debug("No StorageClass found for %s; skipping.", datasetTypeName)
            else:
                self.task.log.debug("Using StorageClass %s for %s.", storageClass.name, datasetTypeName)
        return storageClass

    # Class attributes that will be shadowed by public instance attributes;
    # defined here only for documentation purposes.

    task: ConvertRepoTask
    """The parent task that constructed and uses this converter
    (`ConvertRepoTask`).
    """

    root: str
    """Root path to the Gen2 repository this converter manages (`str`).

    This is a complete path, not relative to some other repository root.
    """

    subset: Optional[ConversionSubset]
    """An object that represents a filter to be applied to the datasets that
    are converted (`ConversionSubset` or `None`).
    """
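

# Illustrative sketch: the order in which a concrete converter is driven,
# following the guarantees documented on the methods above (`prep` before
# `ingest`, `expandDataIds` between `findDatasets` and `ingest`, `finish`
# last). ``converter`` stands for an instance of a concrete `RepoConverter`
# subclass; this helper is example-only and is not called in this module.
def _exampleConversionLifecycle(converter: RepoConverter) -> None:
    converter.prep()           # fast per-dataset-type setup; no Gen3 writes
    converter.findDatasets()   # walk the Gen2 tree and collect FileDatasets
    converter.expandDataIds()  # registry queries only, still no writes
    converter.ingest()         # register dataset types and runs, ingest files
    converter.finish()         # subclass hook, e.g. for non-RUN collections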