Coverage for python/lsst/obs/base/gen2to3/repoWalker/builders.py: 36%

137 statements  

« prev     ^ index     » next       coverage.py v6.4.1, created at 2022-06-14 02:56 -0700

1# This file is part of obs_base. 

2# 

3# Developed for the LSST Data Management System. 

4# This product includes software developed by the LSST Project 

5# (http://www.lsst.org). 

6# See the COPYRIGHT file at the top-level directory of this distribution 

7# for details of code ownership. 

8# 

9# This program is free software: you can redistribute it and/or modify 

10# it under the terms of the GNU General Public License as published by 

11# the Free Software Foundation, either version 3 of the License, or 

12# (at your option) any later version. 

13# 

14# This program is distributed in the hope that it will be useful, 

15# but WITHOUT ANY WARRANTY; without even the implied warranty of 

16# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the 

17# GNU General Public License for more details. 

18# 

19# You should have received a copy of the GNU General Public License 

20# along with this program. If not, see <http://www.gnu.org/licenses/>. 

21"""Classes used in `RepoWalker` construction. 

22 

23The objects here form a temporary tree that is pruned and then transformed 

24into a similar tree of `PathElementHandler` instances. See `BuilderNode` 

25method documentation for more information. 

26""" 

27from __future__ import annotations 

28 

29__all__ = ["BuilderSkipInput", "BuilderTargetInput", "BuilderTree"] 

30 

31import os 

32import re 

33from abc import ABC, abstractmethod 

34from typing import Any, Dict, List, Optional, Tuple 

35 

36from lsst.daf.butler import DatasetType, DimensionUniverse, FormatterParameter, Progress, StorageClass 

37 

38from ..translators import TranslatorFactory 

39from .handlers import IgnoreHandler, SkipHandler, SubdirectoryHandler, TargetFileHandler 

40from .parser import PathElementParser 

41from .scanner import DirectoryScanner, PathElementHandler 

42 

43 

class BuilderNode(ABC):
    """Abstract base class for the nodes of the temporary tree from which a
    `RepoWalker` is constructed.

    Every concrete node must implement both `prune` and `build`.
    """

    @abstractmethod
    def prune(self) -> Tuple[BuilderNode, List[str], bool]:
        """Try to prune this node, along with its children, from the tree.

        Returns
        -------
        replacement : `BuilderNode`
            The node that results from recursively pruning children; in many
            cases this is simply ``self``.
        messages : `list` [`str`]
            Warning messages a parent node should log when it encounters a
            matching path element, in the event that this node is pruned.
        prune : `bool`
            `True` if this node may be pruned from the tree.  That does not
            guarantee that it will be: it may correspond to a path element
            that should be skipped while its siblings should not.
        """
        raise NotImplementedError()

    @abstractmethod
    def build(
        self,
        parser: PathElementParser,
        allKeys: Dict[str, type],
        cumulativeKeys: Dict[str, type],
        *,
        fileIgnoreRegEx: Optional[re.Pattern],
        dirIgnoreRegEx: Optional[re.Pattern],
    ) -> PathElementHandler:
        """Convert this build-tree node into the corresponding
        `PathElementHandler`, recursing into any children.

        This must only be called after `prune`.

        Parameters
        ----------
        parser : `PathElementParser`
            Object that matches the path element the new handler is
            responsible for and extracts a (partial) Gen2 data ID from it.
        allKeys : `dict` [`str`, `type`]
            Mapping from Gen2 data ID key to the type of its value.  It
            contains every key the given parser may extract, and possibly
            others as well.
        cumulativeKeys : `dict` [`str`, `type`]
            Key strings and types for the Gen2 data ID keys extracted from
            earlier path elements of this template, including those
            extracted by ``parser``.
        fileIgnoreRegEx : `re.Pattern` or `None`
            Pattern identifying non-dataset files that can be ignored.
        dirIgnoreRegEx : `re.Pattern` or `None`
            Pattern identifying non-dataset subdirectories that can be
            ignored.

        Returns
        -------
        handler : `PathElementHandler`
            The newly constructed handler.
        """
        raise NotImplementedError()

102 

103 

class BuilderInput(BuilderNode):
    """Intermediate base class for the `BuilderNode` types that are handed
    directly to a `RepoWalker` as inputs; each generally corresponds to
    exactly one Gen2 dataset type.

    Parameters
    ----------
    template : `str`
        The complete Gen2 template to be matched (not merely the template
        for a single path element).
    keys : `dict` [`str`, `type`]
        Mapping from Gen2 data ID key to the type of its value.
    """

    template: str
    """The complete Gen2 template to be matched (`str`).
    """

    keys: Dict[str, type]
    """A mapping from Gen2 data ID key to the type of its value
    (`dict` [`str`, `type`]).
    """

    elements: List[str]
    """The path elements (file or directory levels) of `template`
    (`list` of `str`).
    """

    def __init__(self, template: str, keys: Dict[str, type]):
        self.template, self.keys = template, keys
        # One entry per path level, obtained by splitting on the platform's
        # path separator.
        self.elements = template.split(os.path.sep)

136 

137 

class BuilderSkipInput(BuilderInput):
    """A `RepoWalker` input stating that files matching its template should
    be skipped, optionally with a warning message.

    A BuilderSkipInput can be pruned.  When it is not pruned, it builds a
    `SkipHandler` instance.

    Parameters
    ----------
    template : `str`
        The complete Gen2 template to be matched (not merely the template
        for a single path element).
    keys : `dict` [`str`, `type`]
        Mapping from Gen2 data ID key to the type of its value.
    message : `str`, optional
        If not `None`, a warning message to print either when a matching
        file is encountered or when a directory that may contain such files
        is skipped.
    isForFiles : `bool`, optional
        If `True` (default), the handler should be run on files; otherwise
        it should be run on directories.
    """

    def __init__(
        self, template: str, keys: Dict[str, type], message: Optional[str] = None, *, isForFiles: bool = True
    ):
        super().__init__(template=template, keys=keys)
        self._isForFiles = isForFiles
        self._message = message

    def prune(self) -> Tuple[BuilderNode, List[str], bool]:
        # Docstring inherited from BuilderNode.
        # A skip is always prunable; it contributes its message, if it has
        # one, to whatever node replaces it.
        warnings = [] if self._message is None else [self._message]
        return self, warnings, True

    def build(
        self,
        parser: PathElementParser,
        allKeys: Dict[str, type],
        cumulativeKeys: Dict[str, type],
        *,
        fileIgnoreRegEx: Optional[re.Pattern],
        dirIgnoreRegEx: Optional[re.Pattern],
    ) -> PathElementHandler:
        # Docstring inherited from BuilderNode.
        handler = SkipHandler(parser=parser, isForFiles=self._isForFiles, message=self._message)
        return handler

183 

184 

class BuilderTargetInput(BuilderInput):
    """A `RepoWalker` input that matches the files of a dataset type we
    want to extract.

    A BuilderTargetInput is never pruned.  By default it builds
    `TargetFileHandler` instances.

    Parameters
    ----------
    datasetTypeName : `str`
        Name of the dataset type.
    template : `str`
        Full Gen2 filename template.
    keys : `dict` [`str`, `type`]
        Dictionary mapping Gen2 data ID key to the type of its value.
    storageClass : `StorageClass`
        `StorageClass` for the Gen3 dataset type.
    universe : `DimensionUniverse`
        All candidate dimensions for the Gen3 dataset type.
    formatter : `lsst.daf.butler.Formatter` or `str`, optional
        A Gen 3 formatter class or fully-qualified name.
    translatorFactory : `TranslatorFactory`
        Object that can be used to construct data ID translators.
    targetHandler : `PathElementHandler`, optional
        Override target handler for this dataset type; it is called with
        ``parser``, ``translator``, ``datasetType`` and ``formatter``
        keyword arguments when the handler is built.
    **kwargs:
        Additional keyword arguments are passed to
        `TranslatorFactory.makeMatching`, along with ``datasetTypeName``
        and ``keys``.
    """

    datasetType: DatasetType
    """The Gen3 dataset type extracted by the handler this object builds
    (`lsst.daf.butler.DatasetType`).
    """

    def __init__(
        self,
        *,
        datasetTypeName: str,
        template: str,
        keys: Dict[str, type],
        storageClass: StorageClass,
        universe: DimensionUniverse,
        formatter: FormatterParameter,
        translatorFactory: TranslatorFactory,
        targetHandler: Optional[PathElementHandler] = None,
        **kwargs: Any,
    ):
        # Keep only the part of the template that precedes any [%HDU]
        # identifier, as seen in e.g. DECam Community Pipeline products.
        baseTemplate, _, _ = template.partition("[%(")
        super().__init__(template=baseTemplate, keys=keys)
        translator = translatorFactory.makeMatching(datasetTypeName, keys, **kwargs)
        self._translator = translator
        self.datasetType = DatasetType(
            datasetTypeName,
            dimensions=translator.dimensionNames,
            storageClass=storageClass,
            universe=universe,
            # The dataset type is a calibration if it has a "calibDate" key.
            isCalibration=("calibDate" in keys),
        )
        self._formatter = formatter
        self._handler = TargetFileHandler if targetHandler is None else targetHandler

    def prune(self) -> Tuple[BuilderNode, List[str], bool]:
        # Docstring inherited from BuilderNode.
        # Targets are never prunable and carry no warning messages.
        return self, [], False

    def build(
        self,
        parser: PathElementParser,
        allKeys: Dict[str, type],
        cumulativeKeys: Dict[str, type],
        *,
        fileIgnoreRegEx: Optional[re.Pattern],
        dirIgnoreRegEx: Optional[re.Pattern],
    ) -> PathElementHandler:
        # Docstring inherited from BuilderNode.
        makeHandler = self._handler
        return makeHandler(
            parser=parser,
            translator=self._translator,
            datasetType=self.datasetType,
            formatter=self._formatter,
        )

270 

271 

class BuilderPrunedTree(BuilderNode):
    """A `BuilderNode` standing in for a subdirectory that should be
    skipped; it is created by pruning a `BuilderTree` that contained only
    `BuilderSkipInput` instances.

    A BuilderPrunedTree can itself be pruned.  When it is not pruned, it
    builds a `SkipHandler` instance.

    Parameters
    ----------
    messages : `list` [`str`]
        Warning messages to print when the handler produced by this builder
        matches a subdirectory.
    """

    def __init__(self, messages: List[str]):
        self._messages = messages

    def prune(self) -> Tuple[BuilderNode, List[str], bool]:
        # Docstring inherited from BuilderNode.
        return self, self._messages, True

    def build(
        self,
        parser: PathElementParser,
        allKeys: Dict[str, type],
        cumulativeKeys: Dict[str, type],
        *,
        fileIgnoreRegEx: Optional[re.Pattern],
        dirIgnoreRegEx: Optional[re.Pattern],
    ) -> PathElementHandler:
        # Docstring inherited from BuilderNode.
        # Fold all warnings into a single string; pass None when there are
        # no warnings at all.
        combined = None
        if self._messages:
            combined = "; ".join(self._messages)
        return SkipHandler(parser=parser, isForFiles=False, message=combined)

306 

307 

class BuilderDuplicateInputs(BuilderNode):
    """A `BuilderNode` that represents a collection of `BuilderInput` instances
    that all have the same template.

    Parameters
    ----------
    old : `BuilderNode`
        The node previously registered for the template.  If it is already a
        `BuilderDuplicateInputs`, its children are adopted directly so the
        collection stays flat.
    new : `BuilderInput`
        The additional input that uses the same template.
    """

    def __init__(self, old: BuilderNode, new: BuilderInput):
        self._children: List[BuilderNode] = []
        if isinstance(old, BuilderDuplicateInputs):
            # Third (or later) input with this template: flatten rather
            # than nest.
            self._children.extend(old._children)
        else:
            self._children.append(old)
        self._children.append(new)
        self._messages: List[str] = []  # populated in prune()

    def build(
        self,
        parser: PathElementParser,
        allKeys: Dict[str, type],
        cumulativeKeys: Dict[str, type],
        *,
        fileIgnoreRegEx: Optional[re.Pattern],
        dirIgnoreRegEx: Optional[re.Pattern],
    ) -> PathElementHandler:
        # Docstring inherited from BuilderNode.
        message = "; ".join(self._messages) if self._messages else None
        # NOTE(review): isForFiles=False mirrors BuilderPrunedTree.build;
        # confirm that is intended for duplicated templates.
        return SkipHandler(parser=parser, isForFiles=False, message=message)

    def prune(self) -> Tuple[BuilderNode, List[str], bool]:
        # Docstring inherited from BuilderNode.
        # Messages are collected in a local list so that calling prune()
        # more than once does not accumulate duplicates.
        messages: List[str] = []
        unprunable: List[BuilderNode] = []
        newChildren: List[BuilderNode] = []
        for child in self._children:
            newChild, childMessages, toPruneChild = child.prune()
            if toPruneChild:
                messages.extend(childMessages)
            else:
                unprunable.append(newChild)
            # Keep the pruned replacement of every child.  (This previously
            # appended the ``newChildren`` list to itself.)
            newChildren.append(newChild)
        self._children = newChildren
        self._messages = messages
        if not unprunable:
            # All children are just skips, so we can prune this node if we
            # remember their messages.
            return self, self._messages, True
        elif len(unprunable) == 1 and not self._messages:
            # Exactly one child is a target, and the others were ignored with
            # no warning messages.  Tell parent node to just use that child,
            # so if we see any matching files, we just assume they're for that
            # target.
            return unprunable[0], [], False
        else:
            # Multiple targets or skips with messages, which means we won't
            # know how to handle any matching files.  Replace any messages we
            # have with a single message that combines them all as well as
            # any target dataset types that they are ambiguous with.
            nested = [f"{c.datasetType.name} (target)" for c in unprunable]
            nested.extend(self._messages)
            self._messages = [f"ambiguous match: [{', '.join(nested)}]"]
            return self, self._messages, True

366 

367 

class BuilderTree(BuilderNode):
    """A `BuilderNode` that represents a directory.

    Of all the `BuilderNode` classes, this is the only one that is not a
    leaf.  When every child can be pruned, it is replaced by a
    `BuilderPrunedTree` (which may then be pruned in turn).  When not
    pruned, it builds `SubdirectoryHandler` instances.

    Parameters
    ----------
    progress : `Progress`, optional
        Object to use to report incremental progress.
    """

    MAX_PROGRESS_LEVEL: int = 2
    """Maximum directory level at which progress bars are created.
    """

    def __init__(self, progress: Optional[Progress] = None):
        self.progress = progress
        # Keyed by the template of a single path element; values are the
        # BuilderNode for that element.
        self._children = {}

    def insert(self, level: int, leaf: BuilderInput):
        """Add an input leaf node to the tree, recursively creating the
        intermediate parents needed to place it at the right level.

        Parameters
        ----------
        level : `int`
            The level ``self`` is at in the larger tree, with zero being the
            repository root.  The right level for the leaf is given by the
            length of ``leaf.elements``.
        leaf : `BuilderInput`
            The leaf node to insert.
        """
        element = leaf.elements[level]
        childLevel = level + 1
        if childLevel != len(leaf.elements):
            # Not at the leaf's own level yet: descend into (creating if
            # necessary) the subtree for this path element.
            subtree = self._children.get(element)
            if subtree is None:
                progress = self.progress if childLevel <= self.MAX_PROGRESS_LEVEL else None
                subtree = self._children[element] = BuilderTree(progress)
            subtree.insert(childLevel, leaf)
            return
        existing = self._children.get(element)
        if existing is not None:
            # Sadly, the Gen2 butler has some actual dataset types that
            # use the exact same template.
            leaf = BuilderDuplicateInputs(existing, leaf)
        self._children[element] = leaf

    def fill(
        self,
        scanner: DirectoryScanner,
        allKeys: Dict[str, type],
        previousKeys: Dict[str, type],
        *,
        fileIgnoreRegEx: Optional[re.Pattern],
        dirIgnoreRegEx: Optional[re.Pattern],
    ):
        """Populate a `DirectoryScanner` by recursively building every child
        node.

        Parameters
        ----------
        scanner : `DirectoryScanner`
            Object to populate.
        allKeys : `dict` [`str`, `type`]
            Mapping from Gen2 data ID key to its value type, covering all
            keys that could be used in any child template.
        previousKeys : `dict` [`str`, `type`]
            Key strings and types for the Gen2 data ID keys that have been
            extracted from previous path elements of the same template.
        fileIgnoreRegEx : `re.Pattern`, optional
            Pattern identifying non-dataset files that can be ignored, to be
            applied at all levels of the directory tree.
        dirIgnoreRegEx : `re.Pattern`, optional
            Pattern identifying non-dataset subdirectories that can be
            ignored, to be applied at all levels of the directory tree.
        """
        # The ignore handlers are added ahead of the per-child handlers.
        if fileIgnoreRegEx is not None:
            scanner.add(IgnoreHandler(fileIgnoreRegEx, isForFiles=True))
        if dirIgnoreRegEx is not None:
            scanner.add(IgnoreHandler(dirIgnoreRegEx, isForFiles=False))
        for template, child in self._children.items():
            parser = PathElementParser(template, allKeys, previousKeys=previousKeys)
            # Keys known once this element has been parsed: everything from
            # the earlier elements plus whatever this parser extracts.
            cumulativeKeys = previousKeys.copy()
            cumulativeKeys.update(parser.keys)
            handler = child.build(
                parser,
                allKeys,
                cumulativeKeys,
                fileIgnoreRegEx=fileIgnoreRegEx,
                dirIgnoreRegEx=dirIgnoreRegEx,
            )
            scanner.add(handler)

    def prune(self) -> Tuple[BuilderNode, List[str], bool]:
        # Docstring inherited from BuilderNode.
        prunedChildren = {}
        collected = []
        everyChildPrunable = True
        # Recursively prune children, remembering each replacement node.
        for template, child in self._children.items():
            replacement, childMessages, childPrunable = child.prune()
            prunedChildren[template] = replacement
            collected.extend(childMessages)
            everyChildPrunable = everyChildPrunable and childPrunable
        self._children = prunedChildren
        if not everyChildPrunable:
            return self, [], False
        return BuilderPrunedTree(collected), collected, True

    def build(
        self,
        parser: PathElementParser,
        allKeys: Dict[str, type],
        cumulativeKeys: Dict[str, type],
        *,
        fileIgnoreRegEx: Optional[re.Pattern],
        dirIgnoreRegEx: Optional[re.Pattern],
    ) -> PathElementHandler:
        # Docstring inherited from BuilderNode.
        handler = SubdirectoryHandler(parser, progress=self.progress)
        # The subdirectory handler's scanner is filled with handlers for
        # this directory's children.
        self.fill(
            handler.scanner,
            allKeys,
            cumulativeKeys,
            fileIgnoreRegEx=fileIgnoreRegEx,
            dirIgnoreRegEx=dirIgnoreRegEx,
        )
        return handler