Coverage for python/lsst/obs/base/gen2to3/repoWalker/builders.py: 35%

Shortcuts on this page

r m x p   toggle line displays

j k   next/prev highlighted chunk

0   (zero) top of page

1   (one) first highlighted chunk

134 statements  

1# This file is part of obs_base. 

2# 

3# Developed for the LSST Data Management System. 

4# This product includes software developed by the LSST Project 

5# (http://www.lsst.org). 

6# See the COPYRIGHT file at the top-level directory of this distribution 

7# for details of code ownership. 

8# 

9# This program is free software: you can redistribute it and/or modify 

10# it under the terms of the GNU General Public License as published by 

11# the Free Software Foundation, either version 3 of the License, or 

12# (at your option) any later version. 

13# 

14# This program is distributed in the hope that it will be useful, 

15# but WITHOUT ANY WARRANTY; without even the implied warranty of 

16# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the 

17# GNU General Public License for more details. 

18# 

19# You should have received a copy of the GNU General Public License 

20# along with this program. If not, see <http://www.gnu.org/licenses/>. 

21"""Classes used in `RepoWalker` construction. 

22 

23The objects here form a temporary tree that is pruned and then transformed 

24into a similar tree of `PathElementHandler` instances. See `BuilderNode` 

25method documentation for more information. 

26""" 

27from __future__ import annotations 

28 

29__all__ = ["BuilderSkipInput", "BuilderTargetInput", "BuilderTree"] 

30 

31from abc import ABC, abstractmethod 

32import os 

33import re 

34from typing import ( 

35 Any, 

36 Dict, 

37 List, 

38 Optional, 

39 Tuple, 

40) 

41 

42from lsst.daf.butler import DatasetType, DimensionUniverse, StorageClass, FormatterParameter, Progress 

43from ..translators import TranslatorFactory 

44from .parser import PathElementParser 

45from .scanner import PathElementHandler, DirectoryScanner 

46from .handlers import (IgnoreHandler, SubdirectoryHandler, SkipHandler, 

47 TargetFileHandler) 

48 

49 

class BuilderNode(ABC):
    """Abstract base class for the nodes of the temporary tree from which a
    `RepoWalker` is assembled.

    Concrete nodes implement two steps that are run in order: `prune`, which
    simplifies the tree, and `build`, which converts what remains into
    `PathElementHandler` instances.
    """

    @abstractmethod
    def prune(self) -> Tuple[BuilderNode, List[str], bool]:
        """Try to prune this node, and everything below it, from the tree.

        Returns
        -------
        replacement : `BuilderNode`
            The node that should take this node's place after its children
            have been recursively pruned; frequently this is ``self``.
        messages : `list` [`str`]
            Warning messages a parent node should log when it encounters a
            matching path element, in the event that this node is pruned.
        prune : `bool`
            `True` if this node is allowed to be pruned.  It is not
            guaranteed to be pruned: it may correspond to a path element
            that should be skipped while its siblings should not be.
        """
        raise NotImplementedError()

    @abstractmethod
    def build(
        self,
        parser: PathElementParser,
        allKeys: Dict[str, type],
        cumulativeKeys: Dict[str, type],
        *,
        fileIgnoreRegEx: Optional[re.Pattern],
        dirIgnoreRegEx: Optional[re.Pattern]
    ) -> PathElementHandler:
        """Convert this build-tree node into the matching
        `PathElementHandler`, recursing into any children.

        This must only be called after `prune`.

        Parameters
        ----------
        parser : `PathElementParser`
            Object that matches the path element the new handler is
            responsible for, and extracts a (partial) Gen2 data ID from it.
        allKeys : `dict` [`str`, `type`]
            Mapping from Gen2 data ID key to the type of its value.  Holds
            every key the given parser may extract, and possibly more.
        cumulativeKeys : `dict` [`str`, `type`]
            Key names and value types for the Gen2 data ID keys extracted
            from earlier path elements of this template, including those
            extracted by ``parser``.
        fileIgnoreRegEx : `re.Pattern`, optional
            Pattern identifying non-dataset files that can be ignored.
        dirIgnoreRegEx : `re.Pattern`, optional
            Pattern identifying non-dataset subdirectories that can be
            ignored.

        Returns
        -------
        handler : `PathElementHandler`
            A new handler object.
        """
        raise NotImplementedError()

102 

103 

class BuilderInput(BuilderNode):
    """Intermediate base class for the `BuilderNode` types that are handed
    directly to a `RepoWalker` as inputs; each generally corresponds to
    exactly one Gen2 dataset type.

    Parameters
    ----------
    template : `str`
        The complete Gen2 template to be matched (not merely the template
        for a single path element).
    keys : `dict` [`str`, `type`]
        Mapping from Gen2 data ID key to the type of its value.
    """

    template: str
    """The complete Gen2 template to be matched (`str`).
    """

    keys: Dict[str, type]
    """A mapping from Gen2 data ID key to the type of its value
    (`dict` [`str`, `type`]).
    """

    elements: List[str]
    """The path elements (file or directory levels) of `template`
    (`list` of `str`).
    """

    def __init__(self, template: str, keys: Dict[str, type]):
        self.template = template
        self.keys = keys
        # Split on the platform path separator: one entry per path level.
        self.elements = self.template.split(os.path.sep)

135 

136 

class BuilderSkipInput(BuilderInput):
    """A `RepoWalker` input marking files that should be skipped when
    matched, optionally emitting a warning message.

    A BuilderSkipInput is always prunable.  If it survives pruning it
    builds a `SkipHandler`.

    Parameters
    ----------
    template : `str`
        The complete Gen2 template to be matched (not merely the template
        for a single path element).
    keys : `dict` [`str`, `type`]
        Mapping from Gen2 data ID key to the type of its value.
    message : `str`, optional
        If not `None`, a warning message to print either when a matching
        file is encountered or when a directory that may contain such files
        is skipped.
    isForFiles : `bool`, optional
        If `True` (default), the handler should be run on files; otherwise
        it should be run on directories.
    """

    def __init__(
        self,
        template: str,
        keys: Dict[str, type],
        message: Optional[str] = None,
        *,
        isForFiles: bool = True
    ):
        super().__init__(template=template, keys=keys)
        self._isForFiles = isForFiles
        self._message = message

    def build(
        self,
        parser: PathElementParser,
        allKeys: Dict[str, type],
        cumulativeKeys: Dict[str, type],
        *,
        fileIgnoreRegEx: Optional[re.Pattern],
        dirIgnoreRegEx: Optional[re.Pattern]
    ) -> PathElementHandler:
        # Docstring inherited from BuilderNode.
        handler = SkipHandler(parser=parser, isForFiles=self._isForFiles, message=self._message)
        return handler

    def prune(self) -> Tuple[BuilderNode, List[str], bool]:
        # Docstring inherited from BuilderNode.
        # Hand the (optional) warning up to the parent as a list.
        messages = [] if self._message is None else [self._message]
        return self, messages, True

174 

175 

class BuilderTargetInput(BuilderInput):
    """A `RepoWalker` input matching the files of a dataset type that we
    want to extract.

    A BuilderTargetInput is never prunable.  By default it builds
    `TargetFileHandler` instances; a different handler type can be supplied
    through ``targetHandler``.

    Parameters
    ----------
    datasetTypeName : `str`
        Name of the dataset type.
    template : `str`
        Full Gen2 filename template.
    keys : `dict` [`str`, `type`]
        Dictionary mapping Gen2 data ID key to the type of its value.
    storageClass : `StorageClass`
        `StorageClass` for the Gen3 dataset type.
    universe : `DimensionUniverse`
        All candidate dimensions for the Gen3 dataset type.
    formatter : `lsst.daf.butler.Formatter` or `str`
        A Gen3 formatter class or fully-qualified name.
    translatorFactory : `TranslatorFactory`
        Object that can be used to construct data ID translators.
    targetHandler : `PathElementHandler`, optional
        Override target handler for this dataset type.
    **kwargs:
        Additional keyword arguments are forwarded to
        ``translatorFactory.makeMatching``, along with ``datasetTypeName``
        and ``keys``.
    """

    datasetType: DatasetType
    """The Gen3 dataset type extracted by the handler this object builds
    (`lsst.daf.butler.DatasetType`).
    """

    def __init__(
        self,
        *,
        datasetTypeName: str,
        template: str,
        keys: Dict[str, type],
        storageClass: StorageClass,
        universe: DimensionUniverse,
        formatter: FormatterParameter,
        translatorFactory: TranslatorFactory,
        targetHandler: Optional[PathElementHandler] = None,
        **kwargs: Any
    ):
        # Keep only the part of the template that precedes any [%HDU]
        # identifier, as found in e.g. DECAM Community Pipeline products.
        template, _, _ = template.partition('[%(')
        super().__init__(template=template, keys=keys)
        self._translator = translatorFactory.makeMatching(datasetTypeName, keys, **kwargs)
        self.datasetType = DatasetType(
            datasetTypeName,
            dimensions=self._translator.dimensionNames,
            storageClass=storageClass,
            universe=universe,
            isCalibration=("calibDate" in keys),
        )
        self._formatter = formatter
        self._handler = TargetFileHandler if targetHandler is None else targetHandler

    def build(
        self,
        parser: PathElementParser,
        allKeys: Dict[str, type],
        cumulativeKeys: Dict[str, type],
        *,
        fileIgnoreRegEx: Optional[re.Pattern],
        dirIgnoreRegEx: Optional[re.Pattern]
    ) -> PathElementHandler:
        # Docstring inherited from BuilderNode.
        makeHandler = self._handler
        return makeHandler(
            parser=parser,
            translator=self._translator,
            datasetType=self.datasetType,
            formatter=self._formatter,
        )

    def prune(self) -> Tuple[BuilderNode, List[str], bool]:
        # Docstring inherited from BuilderNode.
        # Targets are never pruned and carry no warning messages.
        return self, [], False

238 

239 

class BuilderPrunedTree(BuilderNode):
    """A `BuilderNode` standing in for a subdirectory that is to be
    skipped; it is produced by pruning a `BuilderTree` that held only
    `BuilderSkipInput` instances.

    A BuilderPrunedTree is itself prunable.  If it survives pruning it
    builds a `SkipHandler`.

    Parameters
    ----------
    messages : `list` [`str`]
        Warning messages to print when the handler produced by this builder
        matches a subdirectory.
    """

    def __init__(self, messages: List[str]):
        self._messages = messages

    def build(
        self,
        parser: PathElementParser,
        allKeys: Dict[str, type],
        cumulativeKeys: Dict[str, type],
        *,
        fileIgnoreRegEx: Optional[re.Pattern],
        dirIgnoreRegEx: Optional[re.Pattern]
    ) -> PathElementHandler:
        # Docstring inherited from BuilderNode.
        # Merge all warnings into one string; no warnings means no message.
        if self._messages:
            message = "; ".join(self._messages)
        else:
            message = None
        return SkipHandler(parser=parser, isForFiles=False, message=message)

    def prune(self) -> Tuple[BuilderNode, List[str], bool]:
        # Docstring inherited from BuilderNode.
        return self, self._messages, True

268 

269 

class BuilderDuplicateInputs(BuilderNode):
    """A `BuilderNode` that represents a collection of `BuilderInput` instances
    that all have the same template.

    Parameters
    ----------
    old : `BuilderNode`
        The node already registered for the template.  If it is itself a
        `BuilderDuplicateInputs`, its children are adopted rather than
        nesting the two collections.
    new : `BuilderInput`
        The newly inserted input that uses the same template.
    """

    def __init__(self, old: BuilderNode, new: BuilderInput):
        self._children = []
        if isinstance(old, BuilderDuplicateInputs):
            # Flatten: keep a single level of duplicates.
            self._children.extend(old._children)
        else:
            self._children.append(old)
        self._children.append(new)
        self._messages = []  # populated in prune()

    def build(self, parser: PathElementParser, allKeys: Dict[str, type], cumulativeKeys: Dict[str, type], *,
              fileIgnoreRegEx: Optional[re.Pattern], dirIgnoreRegEx: Optional[re.Pattern]
              ) -> PathElementHandler:
        # Docstring inherited from BuilderNode.
        message = "; ".join(self._messages) if self._messages else None
        # NOTE(review): the handler is always created with isForFiles=False,
        # regardless of the children's own settings - confirm this is intended.
        return SkipHandler(parser=parser, isForFiles=False, message=message)

    def prune(self) -> Tuple[BuilderNode, List[str], bool]:
        # Docstring inherited from BuilderNode.
        unprunable = []
        newChildren = []
        for child in self._children:
            newChild, childMessages, toPruneChild = child.prune()
            if toPruneChild:
                self._messages.extend(childMessages)
            else:
                unprunable.append(newChild)
            # Record the pruned replacement of every child.  (This used to
            # append ``newChildren`` to itself, which left ``_children`` as a
            # self-referential list of lists instead of a list of nodes.)
            newChildren.append(newChild)
        self._children = newChildren
        if not unprunable:
            # All children are just skips, so we can prune this node if we
            # remember their messages.
            return self, self._messages, True
        elif len(unprunable) == 1 and not self._messages:
            # Exactly one child is a target, and the others were ignored with
            # no warning messages.  Tell parent node to just use that child,
            # so if we see any matching files, we just assume they're for that
            # target.
            return unprunable[0], [], False
        else:
            # Multiple targets or skips with messages, which means we won't
            # know how to handle any matching files.  Replace any messages we
            # have with a single message that combines them all as well as
            # any target dataset types that they are ambiguous with.
            nested = [f"{c.datasetType.name} (target)" for c in unprunable]
            nested.extend(self._messages)
            self._messages = [f"ambiguous match: [{', '.join(nested)}]"]
            return self, self._messages, True

321 

322 

class BuilderTree(BuilderNode):
    """A `BuilderNode` that represents a directory.

    Of all the `BuilderNode` classes, this is the only one that is not a
    leaf.  When every one of its children can be pruned, it is replaced by
    a `BuilderPrunedTree` (which may then be pruned in turn).  When it is
    not pruned it builds `SubdirectoryHandler` instances.

    Parameters
    ----------
    progress : `Progress`, optional
        Object to use to report incremental progress.
    """

    MAX_PROGRESS_LEVEL: int = 2
    """Maximum directory level at which progress bars are created.
    """

    def __init__(self, progress: Optional[Progress] = None):
        self.progress = progress
        self._children = {}  # Maps template path element to BuilderNode

    def insert(self, level: int, leaf: BuilderInput):
        """Add an input leaf node to the tree, recursively creating the
        intermediate parent nodes needed to place it at the right level.

        Parameters
        ----------
        level : `int`
            The level ``self`` is at in the larger tree, with zero the
            repository root.  The right level for the leaf is given by the
            length of ``leaf.elements``.
        leaf : `BuilderInput`
            The leaf node to insert.
        """
        childLevel = level + 1
        element = leaf.elements[level]
        if childLevel != len(leaf.elements):
            # Not at the leaf's own level yet: descend into the directory
            # node for this path element, creating it on first use.
            subtree = self._children.get(element)
            if subtree is None:
                progress = self.progress if childLevel <= self.MAX_PROGRESS_LEVEL else None
                subtree = BuilderTree(progress)
                self._children[element] = subtree
            subtree.insert(childLevel, leaf)
            return
        existing = self._children.get(element)
        # Sadly, the Gen2 butler has some actual dataset types that use the
        # exact same template, so a clash is wrapped instead of rejected.
        self._children[element] = leaf if existing is None else BuilderDuplicateInputs(existing, leaf)

    def fill(
        self,
        scanner: DirectoryScanner,
        allKeys: Dict[str, type],
        previousKeys: Dict[str, type],
        *,
        fileIgnoreRegEx: Optional[re.Pattern],
        dirIgnoreRegEx: Optional[re.Pattern]
    ):
        """Populate a `DirectoryScanner` by recursively building every
        child node.

        Parameters
        ----------
        scanner : `DirectoryScanner`
            Object to populate.
        allKeys : `dict` [`str`, `type`]
            Mapping from Gen2 data ID key to its value type, covering all
            keys that could be used in any child template.
        previousKeys : `dict` [`str`, `type`]
            Key names and value types for the Gen2 data ID keys that have
            been extracted from previous path elements of the same
            template.
        fileIgnoreRegEx : `re.Pattern`, optional
            A regular expression pattern that identifies non-dataset files
            that can be ignored, to be applied at all levels of the
            directory tree.
        dirIgnoreRegEx : `re.Pattern`, optional
            A regular expression pattern that identifies non-dataset
            subdirectories that can be ignored, to be applied at all levels
            of the directory tree.
        """
        # Ignore handlers are registered first: files, then directories.
        for pattern, forFiles in ((fileIgnoreRegEx, True), (dirIgnoreRegEx, False)):
            if pattern is not None:
                scanner.add(IgnoreHandler(pattern, isForFiles=forFiles))
        for element, node in self._children.items():
            parser = PathElementParser(element, allKeys, previousKeys=previousKeys)
            # Each child gets its own copy, extended by this element's keys.
            cumulativeKeys = previousKeys.copy()
            cumulativeKeys.update(parser.keys)
            handler = node.build(
                parser,
                allKeys,
                cumulativeKeys,
                fileIgnoreRegEx=fileIgnoreRegEx,
                dirIgnoreRegEx=dirIgnoreRegEx,
            )
            scanner.add(handler)

    def prune(self) -> Tuple[BuilderNode, List[str], bool]:
        # Docstring inherited from BuilderNode.
        replacements = {}
        messages = []
        allPrunable = True
        # Recursively prune children; one unprunable child keeps this node.
        for element, node in self._children.items():
            replacement, nodeMessages, nodePrunable = node.prune()
            replacements[element] = replacement
            messages.extend(nodeMessages)
            allPrunable = allPrunable and nodePrunable
        self._children = replacements
        if not allPrunable:
            return self, [], False
        return BuilderPrunedTree(messages), messages, True

    def build(
        self,
        parser: PathElementParser,
        allKeys: Dict[str, type],
        cumulativeKeys: Dict[str, type],
        *,
        fileIgnoreRegEx: Optional[re.Pattern],
        dirIgnoreRegEx: Optional[re.Pattern]
    ) -> PathElementHandler:
        # Docstring inherited from BuilderNode.
        handler = SubdirectoryHandler(parser, progress=self.progress)
        self.fill(
            handler.scanner,
            allKeys,
            cumulativeKeys,
            fileIgnoreRegEx=fileIgnoreRegEx,
            dirIgnoreRegEx=dirIgnoreRegEx,
        )
        return handler