# This file is part of obs_base.
#
# Developed for the LSST Data Management System.
# This product includes software developed by the LSST Project
# (https://www.lsst.org).
# See the COPYRIGHT file at the top-level directory of this distribution
# for details of code ownership.
#
# This program is free software: you can redistribute it and/or modify
# it under the terms of the GNU General Public License as published by
# the Free Software Foundation, either version 3 of the License, or
# (at your option) any later version.
#
# This program is distributed in the hope that it will be useful,
# but WITHOUT ANY WARRANTY; without even the implied warranty of
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
# GNU General Public License for more details.
#
# You should have received a copy of the GNU General Public License
# along with this program. If not, see <http://www.gnu.org/licenses/>.
from __future__ import annotations

__all__ = ["RepoConverter"]

from dataclasses import dataclass
from collections import defaultdict
from abc import ABC, abstractmethod
import fnmatch
import os.path
import re
from typing import (
    Dict,
    Iterator,
    List,
    MutableMapping,
    Optional,
    Set,
    Tuple,
    Union,
    TYPE_CHECKING,
)

from lsst.utils import doImport
from lsst.daf.butler import DataCoordinate, FileDataset, DatasetType
from lsst.sphgeom import RangeSet, Region
from .repoWalker import RepoWalker

if TYPE_CHECKING:
    from ..mapping import Mapping as CameraMapperMapping  # disambiguate from collections.abc.Mapping
    from .convertRepo import ConvertRepoTask
    from .scanner import PathElementHandler
    from lsst.daf.butler import StorageClass, Registry, SkyPixDimension, FormatterParameter


@dataclass
class ConversionSubset:
    """A helper class for `ConvertRepoTask` and `RepoConverter` that maintains
    lists of related data ID values that should be included in the conversion.

    Parameters
    ----------
    instrument : `str`
        Instrument name used in Gen3 data IDs.
    visits : `set` of `int`
        Visit IDs that define the filter.
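
    Examples
    --------
    A minimal usage sketch (the instrument name, visit IDs, and skymap name
    below are illustrative only; ``registry`` is assumed to be a populated
    Gen3 `~lsst.daf.butler.Registry`)::

        subset = ConversionSubset(instrument="HSC", visits={903334, 903336})
        subset.addSkyMap(registry, "hsc_rings_v1")
        include = subset.isRelated(dataId)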

    """

    def __init__(self, instrument: str, visits: Set[int]):
        self.instrument = instrument
        self.visits = visits
        self.regions = None
        self.tracts = {}
        self.skypix = {}

    def addSkyMap(self, registry: Registry, name: str):
        """Populate the included tract IDs for the given skymap from those
        that overlap the visits the `ConversionSubset` was initialized with.

        Parameters
        ----------
        registry : `lsst.daf.butler.Registry`
            Registry that can be queried for visit/tract overlaps.
        name : `str`
            SkyMap name used in Gen3 data IDs.
        """
        tracts = set()
        self.tracts[name] = tracts
        for visit in self.visits:
            for dataId in registry.queryDimensions(["tract"], expand=False,
                                                   dataId={"skymap": name,
                                                           "instrument": self.instrument,
                                                           "visit": visit}):
                tracts.add(dataId["tract"])

    def addSkyPix(self, registry: Registry, dimension: SkyPixDimension):
        """Populate the included skypix IDs for the given dimension from those
        that overlap the visits the `ConversionSubset` was initialized with.

        Parameters
        ----------
        registry : `lsst.daf.butler.Registry`
            Registry that can be queried for visit regions.
        dimension : `lsst.daf.butler.SkyPixDimension`
            SkyPix dimension whose pixelization is used to compute the
            ranges of included IDs.
        """
        if self.regions is None:
            self.regions = []
            for visit in self.visits:
                dataId = registry.expandDataId(instrument=self.instrument, visit=visit)
                self.regions.append(dataId.region)
        ranges = RangeSet()
        for region in self.regions:
            ranges = ranges.union(dimension.pixelization.envelope(region))
        self.skypix[dimension] = ranges

    def isRelated(self, dataId: DataCoordinate) -> bool:
        """Test whether the given data ID is related to this subset and hence
        should be included in a repository conversion.

        Parameters
        ----------
        dataId : `lsst.daf.butler.DataCoordinate`
            Data ID to test.

        Returns
        -------
        related : `bool`
            `True` if this data ID should be included in a repository
            conversion.

        Notes
        -----
        More formally, this tests that the given data ID is not unrelated;
        if a data ID does not involve tracts, visits, or skypix dimensions,
        we always include it.
        """
        if self.visits is None:
            # We're not filtering at all.
            return True
        if "visit" in dataId.graph and dataId["visit"] not in self.visits:
            return False
        if "tract" in dataId.graph and dataId["tract"] not in self.tracts[dataId["skymap"]]:
            return False
        for dimension, ranges in self.skypix.items():
            if dimension in dataId.graph and not ranges.intersects(dataId[dimension]):
                return False
        return True

    # Class attributes that will be shadowed by public instance attributes;
    # defined here only for documentation purposes.

    instrument: str
    """The name of the instrument, as used in Gen3 data IDs (`str`).
    """

    visits: Set[int]
    """The set of visit IDs that should be included in the conversion (`set`
    of `int`).
    """

    regions: Optional[List[Region]]
    """Regions for all visits (`list` of `lsst.sphgeom.Region`).

    Set to `None` before it has been initialized. Any code that attempts to
    use it when it is `None` has a logic bug.
    """

    tracts: Dict[str, Set[int]]
    """Tracts that should be included in the conversion, grouped by skymap
    name (`dict` mapping `str` to `set` of `int`).
    """

    skypix: Dict[SkyPixDimension, RangeSet]
    """SkyPix ranges that should be included in the conversion, grouped by
    dimension (`dict` mapping `SkyPixDimension` to `lsst.sphgeom.RangeSet`).
    """


class RepoConverter(ABC):
    """An abstract base class for objects that help `ConvertRepoTask` convert
    datasets from a single Gen2 repository.

    Parameters
    ----------
    task : `ConvertRepoTask`
        Task instance that is using this helper object.
    root : `str`
        Root of the Gen2 repo being converted. Will be converted to an
        absolute path, resolving symbolic links and ``~``, if necessary.
    run : `str`, optional
        Name of the Gen3 `~lsst.daf.butler.CollectionType.RUN` collection
        that converted datasets are ingested into by default. May be `None`
        only if `getRun` is overridden to provide a run for each dataset
        type.
    subset : `ConversionSubset`, optional
        Helper object that implements a filter that restricts the data IDs
        that are converted.

    Notes
    -----
    `RepoConverter` defines the only public API users of its subclasses should
    use (`prep`, `findDatasets`, `insertDimensionData`, `expandDataIds`, and
    `ingest`). These delegate to several abstract methods that subclasses
    must implement. In some cases, subclasses may reimplement the public
    methods as well, but are expected to delegate to ``super()`` either at
    the beginning or end of their own implementation.
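
    Examples
    --------
    A sketch of the call order guaranteed by the documentation of the public
    methods, for a hypothetical concrete instance ``converter``::

        converter.prep()
        converter.findDatasets()
        converter.insertDimensionData()
        converter.expandDataIds()
        converter.ingest()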

    """

    def __init__(self, *, task: ConvertRepoTask, root: str, run: Optional[str],
                 subset: Optional[ConversionSubset] = None):
        self.task = task
        self.root = os.path.realpath(os.path.expanduser(root))
        self.subset = subset
        self._run = run
        self._repoWalker = None  # Created in prep
        self._fileDatasets: MutableMapping[DatasetType, List[FileDataset]] = defaultdict(list)

    @abstractmethod
    def isDatasetTypeSpecial(self, datasetTypeName: str) -> bool:
        """Test whether the given dataset is handled specially by this
        converter and hence should be ignored by generic base-class logic that
        searches for dataset types to convert.

        Parameters
        ----------
        datasetTypeName : `str`
            Name of the dataset type to test.

        Returns
        -------
        special : `bool`
            `True` if the dataset type is special.
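
        Examples
        --------
        A sketch of an override for a hypothetical subclass that handles some
        dataset types itself (the names here are illustrative only)::

            def isDatasetTypeSpecial(self, datasetTypeName: str) -> bool:
                return datasetTypeName in ("raw", "defects")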

        """
        raise NotImplementedError()

    @abstractmethod
    def iterMappings(self) -> Iterator[Tuple[str, CameraMapperMapping]]:
        """Iterate over all `CameraMapper` `Mapping` objects that should be
        considered for conversion by this repository.

        This should include any datasets that may appear in the repository,
        including those that are special (see `isDatasetTypeSpecial`) and
        those that are being ignored (see
        `ConvertRepoTask.isDatasetTypeIncluded`); this allows the converter
        to identify and hence skip these datasets quietly instead of warning
        about them as unrecognized.

        Yields
        ------
        datasetTypeName : `str`
            Name of the dataset type.
        mapping : `lsst.obs.base.mapping.Mapping`
            Mapping object used by the Gen2 `CameraMapper` to describe the
            dataset type.
        """
        raise NotImplementedError()

    @abstractmethod
    def makeRepoWalkerTarget(self, datasetTypeName: str, template: str, keys: Dict[str, type],
                             storageClass: StorageClass,
                             formatter: FormatterParameter = None,
                             targetHandler: Optional[PathElementHandler] = None,
                             ) -> RepoWalker.Target:
        """Make a struct that identifies a dataset type to be extracted by
        walking the repo directory structure.

        Parameters
        ----------
        datasetTypeName : `str`
            Name of the dataset type (the same in both Gen2 and Gen3).
        template : `str`
            The full Gen2 filename template.
        keys : `dict` [`str`, `type`]
            A dictionary mapping Gen2 data ID key to the type of its value.
        storageClass : `lsst.daf.butler.StorageClass`
            Gen3 storage class for this dataset type.
        formatter : `lsst.daf.butler.Formatter` or `str`, optional
            A Gen 3 formatter class or fully-qualified name.
        targetHandler : `PathElementHandler`, optional
            Specialist target handler to use for this dataset type.

        Returns
        -------
        target : `RepoWalker.Target`
            A struct containing information about the target dataset (much of
            it simply forwarded from the arguments).
        """
        raise NotImplementedError()

    def getSpecialDirectories(self) -> List[str]:
        """Return a list of directory paths that should not be searched for
        files.

        These may be directories that simply do not contain datasets (or
        contain datasets in another repository), or directories whose datasets
        are handled specially by a subclass.

        Returns
        -------
        directories : `list` [`str`]
            The full paths of directories to skip, relative to the repository
            root.
        """
        return []

    def prep(self):
        """Perform preparatory work associated with the dataset types to be
        converted from this repository (but not the datasets themselves).

        Notes
        -----
        This should be a relatively fast operation that should not depend on
        the size of the repository.

        Subclasses may override this method, but must delegate to the base
        class implementation at some point in their own logic.
        More often, subclasses will specialize the behavior of `prep` by
        overriding other methods to which the base class implementation
        delegates. These include:

        - `iterMappings`
        - `isDatasetTypeSpecial`
        - `getSpecialDirectories`
        - `makeRepoWalkerTarget`

        This should not perform any write operations to the Gen3 repository.
        It is guaranteed to be called before `insertDimensionData`.
        """
        self.task.log.info(f"Preparing other dataset types from root {self.root}.")
        walkerInputs: List[Union[RepoWalker.Target, RepoWalker.Skip]] = []
        for datasetTypeName, mapping in self.iterMappings():
            try:
                template = mapping.template
            except RuntimeError:
                # No template for this dataset in this mapper, so there's no
                # way there should be instances of this dataset in this repo.
                continue
            extensions = [""]
            skip = False
            message = None
            storageClass = None
            if (not self.task.isDatasetTypeIncluded(datasetTypeName)
                    or self.isDatasetTypeSpecial(datasetTypeName)):
                # User indicated not to include this data, but we still want
                # to recognize files of that type to avoid warning about them.
                skip = True
            else:
                storageClass = self._guessStorageClass(datasetTypeName, mapping)
                if storageClass is None:
                    # This may be a problem, but only if we actually encounter
                    # any files corresponding to this dataset. Of course, we
                    # need to be able to parse those files in order to
                    # recognize that situation.
                    message = f"no storage class found for {datasetTypeName}"
                    skip = True
            # Handle files that are compressed on disk, but the gen2 template
            # is just `.fits`.
            if template.endswith(".fits"):
                extensions.extend((".gz", ".fz"))
            for extension in extensions:
                if skip:
                    walkerInput = RepoWalker.Skip(
                        template=template + extension,
                        keys=mapping.keys(),
                        message=message,
                    )
                    self.task.log.debug("Skipping template in walker: %s", template)
                else:
                    assert message is None
                    targetHandler = self.task.config.targetHandlerClasses.get(datasetTypeName)
                    if targetHandler is not None:
                        targetHandler = doImport(targetHandler)
                    walkerInput = self.makeRepoWalkerTarget(
                        datasetTypeName=datasetTypeName,
                        template=template + extension,
                        keys=mapping.keys(),
                        storageClass=storageClass,
                        formatter=self.task.config.formatterClasses.get(datasetTypeName),
                        targetHandler=targetHandler,
                    )
                    self.task.log.debug("Adding template to walker: %s + %s, for %s", template, extension,
                                        walkerInput.datasetType)
                walkerInputs.append(walkerInput)

        for dirPath in self.getSpecialDirectories():
            walkerInputs.append(
                RepoWalker.Skip(
                    template=dirPath,  # not really a template, but that's fine; it's relative to root.
                    keys={},
                    message=None,
                    isForFiles=True,
                )
            )
        fileIgnoreRegExTerms = []
        for pattern in self.task.config.fileIgnorePatterns:
            fileIgnoreRegExTerms.append(fnmatch.translate(pattern))
        if fileIgnoreRegExTerms:
            fileIgnoreRegEx = re.compile("|".join(fileIgnoreRegExTerms))
        else:
            fileIgnoreRegEx = None
        self._repoWalker = RepoWalker(walkerInputs, fileIgnoreRegEx=fileIgnoreRegEx,
                                      log=self.task.log.getChild("repoWalker"))

    def iterDatasets(self) -> Iterator[FileDataset]:
        """Iterate over datasets in the repository that should be ingested
        into the Gen3 repository.

        The base class implementation yields nothing; the datasets handled by
        the `RepoConverter` base class itself are read directly in
        `findDatasets`.

        Subclasses should override this method if they support additional
        datasets that are handled some other way.

        Yields
        ------
        dataset : `FileDataset`
            Structures representing datasets to be ingested. Paths should be
            absolute.
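
        Examples
        --------
        A sketch of a subclass override (``_findExtraDatasets`` is a
        hypothetical helper that returns absolute paths and their
        `~lsst.daf.butler.DatasetRef` objects)::

            def iterDatasets(self):
                yield from super().iterDatasets()
                for path, ref in self._findExtraDatasets():
                    yield FileDataset(path=path, refs=[ref])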

        """
        yield from ()

    def findDatasets(self):
        """Walk the repository to find datasets to ingest, populating the
        internal mapping from dataset type to `FileDataset` that is later
        used by `expandDataIds` and `ingest`.

        Datasets yielded by `iterDatasets` are recorded first; the
        `RepoWalker` built in `prep` is then run over the repository root,
        filtered by ``self.subset`` if one was provided.
        """
        assert self._repoWalker, "prep() must be called before findDatasets."
        self.task.log.info("Adding special datasets in repo %s.", self.root)
        for dataset in self.iterDatasets():
            assert len(dataset.refs) == 1
            self._fileDatasets[dataset.refs[0].datasetType].append(dataset)
        self.task.log.info("Finding datasets from files in repo %s.", self.root)
        self._fileDatasets.update(
            self._repoWalker.walk(
                self.root,
                predicate=(self.subset.isRelated if self.subset is not None else None)
            )
        )

    def insertDimensionData(self):
        """Insert any dimension records uniquely derived from this repository
        into the registry.

        Subclasses may override this method, but may not need to; the default
        implementation does nothing.

        SkyMap and SkyPix dimensions should instead be handled by calling
        `ConvertRepoTask.useSkyMap` or `ConvertRepoTask.useSkyPix`, because
        these dimensions are in general shared by multiple Gen2 repositories.

        This method is guaranteed to be called between `prep` and
        `expandDataIds`.
        """
        pass

    def expandDataIds(self):
        """Expand the data IDs for all datasets to be inserted.

        Subclasses may override this method, but must delegate to the base
        class implementation if they do.

        This involves queries to the registry, but not writes. It is
        guaranteed to be called between `insertDimensionData` and `ingest`.
        """
        import itertools
        for datasetType, datasetsForType in self._fileDatasets.items():
            self.task.log.info("Expanding data IDs for %s %s datasets.", len(datasetsForType),
                               datasetType.name)
            expanded = []
            for dataset in datasetsForType:
                for i, ref in enumerate(dataset.refs):
                    try:
                        dataId = self.task.registry.expandDataId(ref.dataId)
                        dataset.refs[i] = ref.expanded(dataId)
                    except LookupError as err:
                        self.task.log.warn("Skipping ingestion for '%s': %s", dataset.path, err)
                        # Remove skipped datasets from multi-extension FileDatasets.
                        dataset.refs[i] = None  # We will strip off the `None`s after the loop.
                dataset.refs[:] = itertools.filterfalse(lambda x: x is None, dataset.refs)
                if dataset.refs:
                    expanded.append(dataset)

            datasetsForType[:] = expanded

    def ingest(self):
        """Insert converted datasets into the Gen3 repository.

        Subclasses may override this method, but must delegate to the base
        class implementation at some point in their own logic.

        This method is guaranteed to be called after `expandDataIds`.
        """
        for datasetType, datasetsForType in self._fileDatasets.items():
            self.task.registry.registerDatasetType(datasetType)
            try:
                run = self.getRun(datasetType.name)
            except LookupError:
                self.task.log.warn(f"No run configured for dataset type {datasetType.name}.")
                continue
            self.task.log.info("Ingesting %s %s datasets into run %s.", len(datasetsForType),
                               datasetType.name, run)
            try:
                self.task.registry.registerRun(run)
                self.task.butler3.ingest(*datasetsForType, transfer=self.task.config.transfer, run=run)
            except LookupError as err:
                raise LookupError(f"Error expanding data ID for dataset type {datasetType.name}.") from err

    def getRun(self, datasetTypeName: str) -> str:
        """Return the name of the run that instances of the given dataset
        type should be inserted into.

        Parameters
        ----------
        datasetTypeName : `str`
            Name of the dataset type.

        Returns
        -------
        run : `str`
            Name of the `~lsst.daf.butler.CollectionType.RUN` collection.
        """
        assert self._run is not None, "Method must be overridden if self._run is allowed to be None"
        return self._run

    def _guessStorageClass(self, datasetTypeName: str, mapping: CameraMapperMapping
                           ) -> Optional[StorageClass]:
        """Infer the Gen3 `StorageClass` for a dataset type from a combination
        of configuration and Gen2 dataset type information.

        Parameters
        ----------
        datasetTypeName : `str`
            Name of the dataset type.
        mapping : `lsst.obs.base.mapping.Mapping`
            Mapping object used by the Gen2 `CameraMapper` to describe the
            dataset type.
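
        Returns
        -------
        storageClass : `lsst.daf.butler.StorageClass` or `None`
            The storage class to use, or `None` if no storage class could be
            found in the task configuration or among the butler's known
            storage classes.

        Notes
        -----
        The lookup tries, in order: the task's ``storageClasses`` config keyed
        by dataset type name, by the Gen2 mapping's fully-qualified python
        type, by its ``persistable`` name, and by the unqualified python type
        name; failing those, the butler's own storage classes are searched by
        ``persistable`` and then by unqualified python type name.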

        """
        storageClassName = self.task.config.storageClasses.get(datasetTypeName)
        if storageClassName is None and mapping.python is not None:
            storageClassName = self.task.config.storageClasses.get(mapping.python, None)
        if storageClassName is None and mapping.persistable is not None:
            storageClassName = self.task.config.storageClasses.get(mapping.persistable, None)
        if storageClassName is None and mapping.python is not None:
            unqualified = mapping.python.split(".")[-1]
            storageClassName = self.task.config.storageClasses.get(unqualified, None)
        if storageClassName is not None:
            storageClass = self.task.butler3.storageClasses.getStorageClass(storageClassName)
        else:
            try:
                storageClass = self.task.butler3.storageClasses.getStorageClass(mapping.persistable)
            except KeyError:
                storageClass = None
            if storageClass is None and mapping.python is not None:
                try:
                    storageClass = self.task.butler3.storageClasses.getStorageClass(unqualified)
                except KeyError:
                    pass
        if storageClass is None:
            self.task.log.debug("No StorageClass found for %s; skipping.", datasetTypeName)
        else:
            self.task.log.debug("Using StorageClass %s for %s.", storageClass.name, datasetTypeName)
        return storageClass

    # Class attributes that will be shadowed by public instance attributes;
    # defined here only for documentation purposes.

    task: ConvertRepoTask
    """The parent task that constructed and uses this converter
    (`ConvertRepoTask`).
    """

    root: str
    """Root path to the Gen2 repository this converter manages (`str`).

    This is a complete path, not relative to some other repository root.
    """

    subset: Optional[ConversionSubset]
    """An object that represents a filter to be applied to the datasets that
    are converted (`ConversionSubset` or `None`).
    """