# This file is part of obs_base.
#
# Developed for the LSST Data Management System.
# This product includes software developed by the LSST Project
# (https://www.lsst.org).
# See the COPYRIGHT file at the top-level directory of this distribution
# for details of code ownership.
#
# This program is free software: you can redistribute it and/or modify
# it under the terms of the GNU General Public License as published by
# the Free Software Foundation, either version 3 of the License, or
# (at your option) any later version.
#
# This program is distributed in the hope that it will be useful,
# but WITHOUT ANY WARRANTY; without even the implied warranty of
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
# GNU General Public License for more details.
#
# You should have received a copy of the GNU General Public License
# along with this program.  If not, see <http://www.gnu.org/licenses/>.
from __future__ import annotations

__all__ = ["RepoConverter"]

from dataclasses import dataclass
from collections import defaultdict
from abc import ABC, abstractmethod
import fnmatch
import os.path
import re
from typing import (
    Dict,
    Iterator,
    List,
    Mapping,
    Optional,
    Set,
    Tuple,
    Union,
    TYPE_CHECKING,
)

from lsst.utils import doImport
from lsst.daf.butler import DataCoordinate, FileDataset, DatasetType
from lsst.sphgeom import RangeSet, Region
from .repoWalker import RepoWalker

if TYPE_CHECKING:
    from ..mapping import Mapping as CameraMapperMapping  # disambiguate from collections.abc.Mapping
    from .convertRepo import ConvertRepoTask
    from .scanner import PathElementHandler
    from lsst.daf.butler import StorageClass, Registry, SkyPixDimension, FormatterParameter
    from .._instrument import Instrument


@dataclass
class ConversionSubset:
    """A helper class for `ConvertRepoTask` and `RepoConverter` that maintains
    lists of related data ID values that should be included in the conversion.

    Parameters
    ----------
    instrument : `str`
        Instrument name used in Gen3 data IDs.
    visits : `set` of `int`
        Visit IDs that define the filter.
    """

    def __init__(self, instrument: str, visits: Set[int]):
        self.instrument = instrument
        self.visits = visits
        self.regions = None
        self.tracts = {}
        self.skypix = {}

    def addSkyMap(self, registry: Registry, name: str):
        """Populate the included tract IDs for the given skymap from those that
        overlap the visits the `ConversionSubset` was initialized with.

        Parameters
        ----------
        registry : `lsst.daf.butler.Registry`
            Registry that can be queried for visit/tract overlaps.
        name : `str`
            SkyMap name used in Gen3 data IDs.
        """
        tracts = set()
        self.tracts[name] = tracts
        for visit in self.visits:
            for dataId in registry.queryDataIds(["tract"],
                                                dataId={"skymap": name,
                                                        "instrument": self.instrument,
                                                        "visit": visit}):
                tracts.add(dataId["tract"])

    def addSkyPix(self, registry: Registry, dimension: SkyPixDimension):
        """Populate the included skypix IDs for the given dimension from those
        that overlap the visits the `ConversionSubset` was initialized with.

        Parameters
        ----------
        registry : `lsst.daf.butler.Registry`
            Registry that can be queried for visit regions.
        dimension : `lsst.daf.butler.SkyPixDimension`
            SkyPix dimension whose pixelization is used to compute the
            included index ranges.
        """
        if self.regions is None:
            self.regions = []
            for visit in self.visits:
                dataId = registry.expandDataId(instrument=self.instrument, visit=visit)
                self.regions.append(dataId.region)
        ranges = RangeSet()
        for region in self.regions:
            ranges = ranges.union(dimension.pixelization.envelope(region))
        self.skypix[dimension] = ranges
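
    # A minimal usage sketch for the two ``add*`` methods above (the registry,
    # instrument, visit IDs, skymap name, and dimension name are hypothetical,
    # not defaults of this module):
    #
    #     subset = ConversionSubset(instrument="HSC", visits={903334, 903336})
    #     subset.addSkyMap(registry, "hsc_rings_v1")
    #     subset.addSkyPix(registry, registry.dimensions["htm7"])
    #
    # Afterwards ``subset.tracts["hsc_rings_v1"]`` holds the tract IDs that
    # overlap the visits, and ``subset.skypix`` maps the dimension to a
    # `lsst.sphgeom.RangeSet` of pixel index ranges.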

    def isRelated(self, dataId: DataCoordinate) -> bool:
        """Test whether the given data ID is related to this subset and hence
        should be included in a repository conversion.

        Parameters
        ----------
        dataId : `lsst.daf.butler.DataCoordinate`
            Data ID to test.

        Returns
        -------
        related : `bool`
            `True` if this data ID should be included in a repository
            conversion.

        Notes
        -----
        More formally, this tests that the given data ID is not unrelated;
        if a data ID does not involve tracts, visits, or skypix dimensions,
        we always include it.
        """
        if self.visits is None:
            # We're not filtering at all.
            return True
        if "visit" in dataId.graph and dataId["visit"] not in self.visits:
            return False
        if "tract" in dataId.graph and dataId["tract"] not in self.tracts[dataId["skymap"]]:
            return False
        for dimension, ranges in self.skypix.items():
            if dimension in dataId.graph and not ranges.intersects(dataId[dimension]):
                return False
        return True
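
    # Illustrative outcomes of `isRelated` (the data ID values are
    # hypothetical, and real calls receive `lsst.daf.butler.DataCoordinate`
    # objects rather than plain dicts):
    #
    #     subset = ConversionSubset(instrument="HSC", visits={903334})
    #     subset.isRelated({"instrument": "HSC", "visit": 903334})   # True
    #     subset.isRelated({"instrument": "HSC", "visit": 903336})   # False
    #     subset.isRelated({"instrument": "HSC", "detector": 50})    # True:
    #     #     no visit, tract, or skypix dimensions, so never filtered out.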

    # Class attributes that will be shadowed by public instance attributes;
    # defined here only for documentation purposes.

    instrument: str
    """The name of the instrument, as used in Gen3 data IDs (`str`).
    """

    visits: Set[int]
    """The set of visit IDs that should be included in the conversion (`set`
    of `int`).
    """

    regions: Optional[List[Region]]
    """Regions for all visits (`list` of `lsst.sphgeom.Region`).

    Set to `None` before it has been initialized.  Any code that attempts to
    use it when it is `None` has a logic bug.
    """

    tracts: Dict[str, Set[int]]
    """Tracts that should be included in the conversion, grouped by skymap
    name (`dict` mapping `str` to `set` of `int`).
    """

    skypix: Dict[SkyPixDimension, RangeSet]
    """SkyPix ranges that should be included in the conversion, grouped by
    dimension (`dict` mapping `SkyPixDimension` to `lsst.sphgeom.RangeSet`).
    """


class RepoConverter(ABC):
    """An abstract base class for objects that help `ConvertRepoTask` convert
    datasets from a single Gen2 repository.

    Parameters
    ----------
    task : `ConvertRepoTask`
        Task instance that is using this helper object.
    root : `str`
        Root of the Gen2 repo being converted.  Will be converted to an
        absolute path, resolving symbolic links and ``~``, if necessary.
    instrument : `Instrument`
        Gen3 instrument class to use for this conversion.
    run : `str`, optional
        Name of the default Gen3 `~lsst.daf.butler.CollectionType.RUN`
        collection that converted datasets will be ingested into (see
        `getRun`); may be `None` if subclasses always determine the run
        another way.
    subset : `ConversionSubset`, optional
        Helper object that implements a filter that restricts the data IDs
        that are converted.

    Notes
    -----
    `RepoConverter` defines the only public API users of its subclasses should
    use (`prep`, `ingest`, and `finish`).  These delegate to several abstract
    methods that subclasses must implement.  In some cases, subclasses may
    reimplement the public methods as well, but are expected to delegate to
    ``super()`` either at the beginning or end of their own implementation.
    """

    def __init__(self, *, task: ConvertRepoTask, root: str, instrument: Instrument, run: Optional[str],
                 subset: Optional[ConversionSubset] = None):
        self.task = task
        self.root = os.path.realpath(os.path.expanduser(root))
        self.instrument = instrument
        self.subset = subset
        self._run = run
        self._repoWalker = None  # Created in prep
        self._fileDatasets: Mapping[DatasetType, Mapping[Optional[str], List[FileDataset]]] \
            = defaultdict(lambda: defaultdict(list))

    @abstractmethod
    def isDatasetTypeSpecial(self, datasetTypeName: str) -> bool:
        """Test whether the given dataset is handled specially by this
        converter and hence should be ignored by generic base-class logic that
        searches for dataset types to convert.

        Parameters
        ----------
        datasetTypeName : `str`
            Name of the dataset type to test.

        Returns
        -------
        special : `bool`
            `True` if the dataset type is special.
        """
        raise NotImplementedError()
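
    # A hypothetical subclass override, shown only to illustrate the expected
    # shape (the dataset type names are made up for this sketch):
    #
    #     def isDatasetTypeSpecial(self, datasetTypeName: str) -> bool:
    #         # These are ingested by this converter's own specialized logic,
    #         # not by the generic template walker.
    #         return datasetTypeName in ("raw", "defects")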

    @abstractmethod
    def iterMappings(self) -> Iterator[Tuple[str, CameraMapperMapping]]:
        """Iterate over all `CameraMapper` `Mapping` objects that should be
        considered for conversion by this repository.

        This should include any datasets that may appear in the repository,
        including those that are special (see `isDatasetTypeSpecial`) and
        those that are being ignored (see
        `ConvertRepoTask.isDatasetTypeIncluded`); this allows the converter
        to identify and hence skip these datasets quietly instead of warning
        about them as unrecognized.

        Yields
        ------
        datasetTypeName : `str`
            Name of the dataset type.
        mapping : `lsst.obs.base.mapping.Mapping`
            Mapping object used by the Gen2 `CameraMapper` to describe the
            dataset type.
        """
        raise NotImplementedError()

    @abstractmethod
    def makeRepoWalkerTarget(self, datasetTypeName: str, template: str, keys: Dict[str, type],
                             storageClass: StorageClass,
                             formatter: FormatterParameter = None,
                             targetHandler: Optional[PathElementHandler] = None,
                             ) -> RepoWalker.Target:
        """Make a struct that identifies a dataset type to be extracted by
        walking the repo directory structure.

        Parameters
        ----------
        datasetTypeName : `str`
            Name of the dataset type (the same in both Gen2 and Gen3).
        template : `str`
            The full Gen2 filename template.
        keys : `dict` [`str`, `type`]
            A dictionary mapping Gen2 data ID key to the type of its value.
        storageClass : `lsst.daf.butler.StorageClass`
            Gen3 storage class for this dataset type.
        formatter : `lsst.daf.butler.Formatter` or `str`, optional
            A Gen 3 formatter class or fully-qualified name.
        targetHandler : `PathElementHandler`, optional
            Specialist target handler to use for this dataset type.

        Returns
        -------
        target : `RepoWalker.Target`
            A struct containing information about the target dataset (much of
            it simply forwarded from the arguments).
        """
        raise NotImplementedError()

    def getSpecialDirectories(self) -> List[str]:
        """Return a list of directory paths that should not be searched for
        files.

        These may be directories that simply do not contain datasets (or
        contain datasets in another repository), or directories whose datasets
        are handled specially by a subclass.

        Returns
        -------
        directories : `list` [`str`]
            The full paths of directories to skip, relative to the repository
            root.
        """
        return []

    def prep(self):
        """Perform preparatory work associated with the dataset types to be
        converted from this repository (but not the datasets themselves).

        Notes
        -----
        This should be a relatively fast operation that should not depend on
        the size of the repository.

        Subclasses may override this method, but must delegate to the base
        class implementation at some point in their own logic.
        More often, subclasses will specialize the behavior of `prep` by
        overriding other methods to which the base class implementation
        delegates.  These include:

        - `iterMappings`
        - `isDatasetTypeSpecial`
        - `getSpecialDirectories`
        - `makeRepoWalkerTarget`

        This should not perform any write operations to the Gen3 repository.
        It is guaranteed to be called before `ingest`.
        """
        self.task.log.info(f"Preparing other dataset types from root {self.root}.")
        walkerInputs: List[Union[RepoWalker.Target, RepoWalker.Skip]] = []
        for datasetTypeName, mapping in self.iterMappings():
            try:
                template = mapping.template
            except RuntimeError:
                # No template for this dataset in this mapper, so there's no
                # way there should be instances of this dataset in this repo.
                continue
            extensions = [""]
            skip = False
            message = None
            storageClass = None
            if (not self.task.isDatasetTypeIncluded(datasetTypeName)
                    or self.isDatasetTypeSpecial(datasetTypeName)):
                # User indicated not to include this data, but we still want
                # to recognize files of that type to avoid warning about them.
                skip = True
            else:
                storageClass = self._guessStorageClass(datasetTypeName, mapping)
                if storageClass is None:
                    # This may be a problem, but only if we actually encounter
                    # any files corresponding to this dataset.  Of course, we
                    # need to be able to parse those files in order to
                    # recognize that situation.
                    message = f"no storage class found for {datasetTypeName}"
                    skip = True
            # Handle files that are compressed on disk, but the gen2 template
            # is just `.fits`.
            if template.endswith(".fits"):
                extensions.extend((".gz", ".fz"))
            for extension in extensions:
                if skip:
                    walkerInput = RepoWalker.Skip(
                        template=template+extension,
                        keys=mapping.keys(),
                        message=message,
                    )
                    self.task.log.debug("Skipping template in walker: %s", template)
                else:
                    assert message is None
                    targetHandler = self.task.config.targetHandlerClasses.get(datasetTypeName)
                    if targetHandler is not None:
                        targetHandler = doImport(targetHandler)
                    walkerInput = self.makeRepoWalkerTarget(
                        datasetTypeName=datasetTypeName,
                        template=template+extension,
                        keys=mapping.keys(),
                        storageClass=storageClass,
                        formatter=self.task.config.formatterClasses.get(datasetTypeName),
                        targetHandler=targetHandler,
                    )
                    self.task.log.debug("Adding template to walker: %s + %s, for %s", template, extension,
                                        walkerInput.datasetType)
                walkerInputs.append(walkerInput)

        for dirPath in self.getSpecialDirectories():
            walkerInputs.append(
                RepoWalker.Skip(
                    template=dirPath,  # not really a template, but that's fine; it's relative to root.
                    keys={},
                    message=None,
                    isForFiles=True,
                )
            )
        fileIgnoreRegExTerms = []
        for pattern in self.task.config.fileIgnorePatterns:
            fileIgnoreRegExTerms.append(fnmatch.translate(pattern))
        if fileIgnoreRegExTerms:
            fileIgnoreRegEx = re.compile("|".join(fileIgnoreRegExTerms))
        else:
            fileIgnoreRegEx = None
        self._repoWalker = RepoWalker(walkerInputs, fileIgnoreRegEx=fileIgnoreRegEx,
                                      log=self.task.log.getChild("repoWalker"))
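
    # A small, self-contained illustration of how the ignore patterns above
    # are combined into a single regular expression (the pattern strings here
    # are hypothetical, not defaults of this task):
    #
    #     import fnmatch
    #     import re
    #     patterns = ["*.log", "README*"]
    #     ignoreRegEx = re.compile("|".join(fnmatch.translate(p) for p in patterns))
    #     bool(ignoreRegEx.fullmatch("ingest.log"))   # True: ignored
    #     bool(ignoreRegEx.fullmatch("image.fits"))   # False: considered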

    def iterDatasets(self) -> Iterator[FileDataset]:
        """Iterate over datasets in the repository that should be ingested into
        the Gen3 repository.

        The base class implementation yields nothing; the datasets handled by
        the `RepoConverter` base class itself are read directly in
        `findDatasets`.

        Subclasses should override this method if they support additional
        datasets that are handled some other way.

        Yields
        ------
        dataset : `FileDataset`
            Structures representing datasets to be ingested.  Paths should be
            absolute.
        """
        yield from ()
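
    # A hypothetical override, to show the shape of what a subclass would
    # yield (the path and ref below are placeholders for this sketch, not a
    # real call sequence from this module):
    #
    #     def iterDatasets(self) -> Iterator[FileDataset]:
    #         yield FileDataset(path="/absolute/path/to/something.fits",
    #                           refs=[someDatasetRef])
    #         yield from super().iterDatasets()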

    def findDatasets(self):
        """Walk the Gen2 repository to find datasets to convert, populating
        the internal per-dataset-type collection of `FileDataset` instances.

        This must be called after `prep` and before `expandDataIds`.
        """
        assert self._repoWalker, "prep() must be called before findDatasets."
        self.task.log.info("Adding special datasets in repo %s.", self.root)
        for dataset in self.iterDatasets():
            assert len(dataset.refs) == 1
            # None index below is for calibDate, which is only relevant for
            # CalibRepoConverter.
            self._fileDatasets[dataset.refs[0].datasetType][None].append(dataset)
        self.task.log.info("Finding datasets from files in repo %s.", self.root)
        datasetsByTypeAndCalibDate = self._repoWalker.walk(
            self.root,
            predicate=(self.subset.isRelated if self.subset is not None else None)
        )
        for datasetType, datasetsByCalibDate in datasetsByTypeAndCalibDate.items():
            for calibDate, datasets in datasetsByCalibDate.items():
                self._fileDatasets[datasetType][calibDate].extend(datasets)

    def expandDataIds(self):
        """Expand the data IDs for all datasets to be inserted.

        Subclasses may override this method, but must delegate to the base
        class implementation if they do.

        This involves queries to the registry, but not writes.  It is
        guaranteed to be called between `findDatasets` and `ingest`.
        """
        import itertools
        for datasetType, datasetsByCalibDate in self._fileDatasets.items():
            for calibDate, datasetsForCalibDate in datasetsByCalibDate.items():
                nDatasets = len(datasetsForCalibDate)
                suffix = "" if nDatasets == 1 else "s"
                if calibDate is not None:
                    self.task.log.info("Expanding data IDs for %s %s dataset%s at calibDate %s.",
                                       nDatasets,
                                       datasetType.name,
                                       suffix,
                                       calibDate)
                else:
                    self.task.log.info("Expanding data IDs for %s %s non-calibration dataset%s.",
                                       nDatasets,
                                       datasetType.name,
                                       suffix)
                expanded = []
                for dataset in datasetsForCalibDate:
                    for i, ref in enumerate(dataset.refs):
                        self.task.log.debug("Expanding data ID %s.", ref.dataId)
                        try:
                            dataId = self.task.registry.expandDataId(ref.dataId)
                            dataset.refs[i] = ref.expanded(dataId)
                        except LookupError as err:
                            self.task.log.warn("Skipping ingestion for '%s': %s", dataset.path, err)
                            # Remove skipped datasets from multi-extension
                            # FileDatasets.
                            dataset.refs[i] = None  # We will strip off the `None`s after the loop.
                    dataset.refs[:] = itertools.filterfalse(lambda x: x is None, dataset.refs)
                    if dataset.refs:
                        expanded.append(dataset)
                datasetsForCalibDate[:] = expanded
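
    # The ``filterfalse`` idiom above, in isolation: refs that failed to
    # expand are first set to `None` in place, then stripped in one pass.
    #
    #     import itertools
    #     refs = ["ref0", None, "ref2"]   # stand-ins for DatasetRef objects
    #     refs[:] = itertools.filterfalse(lambda x: x is None, refs)
    #     assert refs == ["ref0", "ref2"]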

    def ingest(self):
        """Insert converted datasets into the Gen3 repository.

        Subclasses may override this method, but must delegate to the base
        class implementation at some point in their own logic.

        This method is guaranteed to be called after `expandDataIds`.
        """
        for datasetType, datasetsByCalibDate in self._fileDatasets.items():
            self.task.registry.registerDatasetType(datasetType)
            for calibDate, datasetsForCalibDate in datasetsByCalibDate.items():
                try:
                    run = self.getRun(datasetType.name, calibDate)
                except LookupError:
                    self.task.log.warn(f"No run configured for dataset type {datasetType.name}.")
                    continue
                nDatasets = len(datasetsForCalibDate)
                self.task.log.info("Ingesting %s %s dataset%s into run %s.", nDatasets,
                                   datasetType.name, "" if nDatasets == 1 else "s", run)
                try:
                    self.task.registry.registerRun(run)
                    self.task.butler3.ingest(*datasetsForCalibDate, transfer=self.task.config.transfer,
                                             run=run)
                except LookupError as err:
                    raise LookupError(
                        f"Error expanding data ID for dataset type {datasetType.name}."
                    ) from err

    def finish(self) -> None:
        """Finish conversion of a repository.

        This is run after ``ingest``, and delegates to `_finish`, which should
        be overridden by derived classes instead of this method.
        """
        self._finish(self._fileDatasets)

    def _finish(self, datasets: Mapping[DatasetType, Mapping[Optional[str], List[FileDataset]]]) -> None:
        """Subclass implementation hook for `finish`.

        The default implementation does nothing.  This is generally the best
        place to define and populate non-``RUN`` collections that may contain
        some of the datasets that have just been ingested.

        Parameters
        ----------
        datasets : `Mapping`
            Nested mapping containing all converted datasets.  The outer
            mapping keys are `DatasetType` instances.  Values are mappings
            from ``calibDate`` or `None` to a `list` of `FileDataset`
            instances.
        """
        pass

    def getRun(self, datasetTypeName: str, calibDate: Optional[str] = None) -> str:
        """Return the name of the run into which instances of the given
        dataset type should be inserted.

        Parameters
        ----------
        datasetTypeName : `str`
            Name of the dataset type.
        calibDate : `str`, optional
            If not `None`, the "CALIBDATE" associated with this (calibration)
            dataset in the Gen2 data repository.

        Returns
        -------
        run : `str`
            Name of the `~lsst.daf.butler.CollectionType.RUN` collection.
        """
        assert self._run is not None, "Method must be overridden if self._run is allowed to be None"
        assert calibDate is None, "Method must be overridden if calibDate is allowed to be not None"
        return self._run
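
    # A hypothetical subclass override that routes calibration datasets into
    # per-date runs (the naming scheme is illustrative only):
    #
    #     def getRun(self, datasetTypeName: str,
    #                calibDate: Optional[str] = None) -> str:
    #         if calibDate is None:
    #             return super().getRun(datasetTypeName)
    #         return f"{self._run}/calib/{calibDate}"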

    def _guessStorageClass(self, datasetTypeName: str, mapping: CameraMapperMapping
                           ) -> Optional[StorageClass]:
        """Infer the Gen3 `StorageClass` for a dataset from a combination of
        configuration and Gen2 dataset type information.

        Parameters
        ----------
        datasetTypeName : `str`
            Name of the dataset type.
        mapping : `lsst.obs.base.mapping.Mapping`
            Mapping object used by the Gen2 `CameraMapper` to describe the
            dataset type.

        Returns
        -------
        storageClass : `lsst.daf.butler.StorageClass` or `None`
            Inferred storage class, or `None` if none could be found.
        """
        storageClassName = self.task.config.storageClasses.get(datasetTypeName)
        if storageClassName is None and mapping.python is not None:
            storageClassName = self.task.config.storageClasses.get(mapping.python, None)
        if storageClassName is None and mapping.persistable is not None:
            storageClassName = self.task.config.storageClasses.get(mapping.persistable, None)
        if storageClassName is None and mapping.python is not None:
            unqualified = mapping.python.split(".")[-1]
            storageClassName = self.task.config.storageClasses.get(unqualified, None)
        if storageClassName is not None:
            storageClass = self.task.butler3.storageClasses.getStorageClass(storageClassName)
        else:
            try:
                storageClass = self.task.butler3.storageClasses.getStorageClass(mapping.persistable)
            except KeyError:
                storageClass = None
            if storageClass is None and mapping.python is not None:
                try:
                    storageClass = self.task.butler3.storageClasses.getStorageClass(unqualified)
                except KeyError:
                    pass
        if storageClass is None:
            self.task.log.debug("No StorageClass found for %s; skipping.", datasetTypeName)
        else:
            self.task.log.debug("Using StorageClass %s for %s.", storageClass.name, datasetTypeName)
        return storageClass
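
    # The lookup order implemented above, with hypothetical configuration
    # values: ``config.storageClasses`` may be keyed by the Gen2 dataset type
    # name, the fully-qualified python type, the "persistable" name, or the
    # unqualified python type name, e.g.:
    #
    #     config.storageClasses = {
    #         "calexp": "ExposureF",                    # dataset type name
    #         "lsst.afw.image.ExposureF": "ExposureF",  # python type
    #         "ExposureF": "ExposureF",                 # unqualified name
    #     }
    #
    # If none of those match, the Gen2 ``persistable`` and unqualified python
    # names are tried directly against the butler's StorageClass registry.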

    # Class attributes that will be shadowed by public instance attributes;
    # defined here only for documentation purposes.

    task: ConvertRepoTask
    """The parent task that constructed and uses this converter
    (`ConvertRepoTask`).
    """

    root: str
    """Root path to the Gen2 repository this converter manages (`str`).

    This is a complete path, not relative to some other repository root.
    """

    subset: Optional[ConversionSubset]
    """An object that represents a filter to be applied to the datasets that
    are converted (`ConversionSubset` or `None`).
    """