Hide keyboard shortcuts

Hot-keys on this page

r m x p   toggle line displays

j k   next/prev highlighted chunk

0   (zero) top of page

1   (one) first highlighted chunk

1# This file is part of obs_base. 

2# 

3# Developed for the LSST Data Management System. 

4# This product includes software developed by the LSST Project 

5# (http://www.lsst.org). 

6# See the COPYRIGHT file at the top-level directory of this distribution 

7# for details of code ownership. 

8# 

9# This program is free software: you can redistribute it and/or modify 

10# it under the terms of the GNU General Public License as published by 

11# the Free Software Foundation, either version 3 of the License, or 

12# (at your option) any later version. 

13# 

14# This program is distributed in the hope that it will be useful, 

15# but WITHOUT ANY WARRANTY; without even the implied warranty of 

16# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the 

17# GNU General Public License for more details. 

18# 

19# You should have received a copy of the GNU General Public License 

20# along with this program. If not, see <http://www.gnu.org/licenses/>. 

21"""Classes used in `RepoWalker` construction. 

22 

23The objects here form a temporary tree that is pruned and then transformed 

24into a similar tree of `PathElementHandler` instances. See `BuilderNode` 

25method documentation for more information. 

26""" 

27from __future__ import annotations 

28 

29__all__ = ["BuilderSkipInput", "BuilderTargetInput", "BuilderTree"] 

30 

31from abc import ABC, abstractmethod 

32import os 

33import re 

34from typing import ( 

35 Any, 

36 Dict, 

37 List, 

38 Optional, 

39 Tuple, 

40) 

41 

42from lsst.daf.butler import DatasetType, DimensionUniverse, StorageClass 

43from ..translators import Translator 

44from .parser import PathElementParser 

45from .scanner import PathElementHandler, DirectoryScanner 

46from .handlers import IgnoreHandler, SubdirectoryHandler, SkipHandler, TargetFileHandler 

47 

48 

class BuilderNode(ABC):
    """Common interface for every node of the temporary tree from which a
    `RepoWalker` is constructed.
    """

    @abstractmethod
    def prune(self) -> Tuple[BuilderNode, List[str], bool]:
        """Try to remove this node, together with its children, from the
        tree.

        Returns
        -------
        replacement : `BuilderNode`
            The node that should take this node's place after its children
            have been recursively pruned; frequently this is ``self``.
        messages : `list` [`str`]
            Warning messages a parent node should log upon encountering a
            matching path element, in the event that this node is pruned.
        prune : `bool`
            `True` if this node is eligible for pruning.  Eligibility does
            not guarantee removal: the node may correspond to a path element
            that should be skipped while some of its siblings should not be.
        """
        raise NotImplementedError()

    @abstractmethod
    def build(self, parser: PathElementParser, allKeys: Dict[str, type],
              cumulativeKeys: Dict[str, type], *,
              fileIgnoreRegEx: Optional[re.Pattern],
              dirIgnoreRegEx: Optional[re.Pattern]) -> PathElementHandler:
        """Convert this build-tree node into the matching
        `PathElementHandler`, recursing into children where there are any.

        `prune` must already have been called.

        Parameters
        ----------
        parser : `PathElementParser`
            Object that matches the path element the new handler is in
            charge of, and extracts a (partial) Gen2 data ID from it.
        allKeys : `dict` [`str`, `type`]
            Mapping from Gen2 data ID key to the type of its value.  Holds
            every key the given parser may extract, and possibly more.
        cumulativeKeys : `dict` [`str`, `type`]
            Key strings and value types for the Gen2 data ID keys extracted
            from earlier path elements of this template, including the ones
            extracted by ``parser``.
        fileIgnoreRegEx : `re.Pattern` or `None`
            Pattern handed down to nested directory levels, where (if not
            `None`) `BuilderTree.fill` registers an `IgnoreHandler` for
            files with it.
        dirIgnoreRegEx : `re.Pattern` or `None`
            Pattern handed down to nested directory levels, where (if not
            `None`) `BuilderTree.fill` registers an `IgnoreHandler` for
            directories with it.

        Returns
        -------
        handler : `PathElementHandler`
            The newly constructed handler.
        """
        raise NotImplementedError()

101 

102 

class BuilderInput(BuilderNode):
    """Intermediate base class for the `BuilderNode` types that are handed
    directly to a `RepoWalker`; each generally corresponds to exactly one
    Gen2 dataset type.

    Parameters
    ----------
    template : `str`
        The full Gen2 template to match (the whole path, not merely the
        template of a single path element).
    keys : `dict` [`str`, `type`]
        Mapping from Gen2 data ID key to the type of its value.
    """

    template: str
    """The full Gen2 template to match (`str`).
    """

    keys: Dict[str, type]
    """Mapping from Gen2 data ID key to the type of its value
    (`dict` [`str`, `type`]).
    """

    elements: List[str]
    """`template` split into its path elements, i.e. its file or directory
    levels (`list` of `str`).
    """

    def __init__(self, template: str, keys: Dict[str, type]):
        self.template = template
        self.keys = keys
        # One entry per directory/file level of the template.
        self.elements = template.split(os.sep)

134 

135 

class BuilderSkipInput(BuilderInput):
    """A `RepoWalker` input stating that matching files are to be skipped,
    optionally emitting a warning message.

    A BuilderSkipInput is always eligible for pruning.  If it survives
    pruning it builds a `SkipHandler`.

    Parameters
    ----------
    template : `str`
        The full Gen2 template to match (the whole path, not merely the
        template of a single path element).
    keys : `dict` [`str`, `type`]
        Mapping from Gen2 data ID key to the type of its value.
    message : `str`, optional
        When not `None`, a warning message to print either when a matching
        file is encountered or when a directory that may contain such files
        is skipped.
    isForFiles : `bool`, optional
        If `True` (default), the handler is meant to run on files;
        otherwise it is meant to run on directories.
    """

    def __init__(self, template: str, keys: Dict[str, type], message: Optional[str] = None, *,
                 isForFiles: bool = True):
        super().__init__(template=template, keys=keys)
        self._isForFiles = isForFiles
        self._message = message

    def prune(self) -> Tuple[BuilderNode, List[str], bool]:
        # Docstring inherited from BuilderNode.
        if self._message is None:
            return self, [], True
        return self, [self._message], True

    def build(self, parser: PathElementParser, allKeys: Dict[str, type], cumulativeKeys: Dict[str, type], *,
              fileIgnoreRegEx: Optional[re.Pattern], dirIgnoreRegEx: Optional[re.Pattern]
              ) -> PathElementHandler:
        # Docstring inherited from BuilderNode.
        return SkipHandler(
            parser=parser,
            isForFiles=self._isForFiles,
            message=self._message,
        )

173 

174 

class BuilderTargetInput(BuilderInput):
    """A `RepoWalker` input matching the files that hold datasets we want
    to extract.

    A BuilderTargetInput is never eligible for pruning and always builds a
    `TargetFileHandler`.

    Parameters
    ----------
    datasetTypeName : `str`
        Name of the dataset type.
    template : `str`
        Full Gen2 filename template.
    keys : `dict` [`str`, `type`]
        Dictionary mapping Gen2 data ID key to the type of its value.
    storageClass : `StorageClass`
        `StorageClass` for the Gen3 dataset type.
    universe : `DimensionUniverse`
        All candidate dimensions for the Gen3 dataset type.
    kwargs:
        Additional keyword arguments are passed to `Translator.makeMatching`,
        along with ``datasetTypeName`` and ``keys``.
    """

    datasetType: DatasetType
    """The Gen3 dataset type extracted by the handler this object builds
    (`lsst.daf.butler.DatasetType`).
    """

    def __init__(self, *, datasetTypeName: str, template: str, keys: Dict[str, type],
                 storageClass: StorageClass, universe: DimensionUniverse, **kwargs: Any):
        super().__init__(template=template, keys=keys)
        translator = Translator.makeMatching(datasetTypeName, keys, **kwargs)
        self._translator = translator
        # The translator determines which dimensions the Gen3 dataset type
        # is defined over.
        self.datasetType = DatasetType(
            datasetTypeName,
            dimensions=translator.dimensionNames,
            storageClass=storageClass,
            universe=universe,
        )

    def prune(self) -> Tuple[BuilderNode, List[str], bool]:
        # Docstring inherited from BuilderNode.
        return self, [], False

    def build(self, parser: PathElementParser, allKeys: Dict[str, type], cumulativeKeys: Dict[str, type], *,
              fileIgnoreRegEx: Optional[re.Pattern], dirIgnoreRegEx: Optional[re.Pattern]
              ) -> PathElementHandler:
        # Docstring inherited from BuilderNode.
        return TargetFileHandler(
            parser=parser,
            translator=self._translator,
            datasetType=self.datasetType,
        )

219 

220 

class BuilderPrunedTree(BuilderNode):
    """A `BuilderNode` standing for a subdirectory that is to be skipped.

    Instances are produced by pruning a `BuilderTree` that held nothing but
    `BuilderSkipInput` instances.

    A BuilderPrunedTree is always eligible for pruning.  If it survives
    pruning it builds a `SkipHandler`.

    Parameters
    ----------
    messages : `list` [`str`]
        Warning messages to print when the handler this builder produces
        matches a subdirectory.
    """

    def __init__(self, messages: List[str]):
        self._messages = messages

    def prune(self) -> Tuple[BuilderNode, List[str], bool]:
        # Docstring inherited from BuilderNode.
        return self, self._messages, True

    def build(self, parser: PathElementParser, allKeys: Dict[str, type], cumulativeKeys: Dict[str, type], *,
              fileIgnoreRegEx: Optional[re.Pattern], dirIgnoreRegEx: Optional[re.Pattern]
              ) -> PathElementHandler:
        # Docstring inherited from BuilderNode.
        if self._messages:
            # Collapse all warnings into the single string the handler takes.
            combined = "; ".join(self._messages)
        else:
            combined = None
        return SkipHandler(parser=parser, isForFiles=False, message=combined)

249 

250 

class BuilderDuplicateInputs(BuilderNode):
    """A `BuilderNode` that represents a collection of `BuilderInput` instances
    that all have the same template.

    Parameters
    ----------
    old : `BuilderInput` or `BuilderDuplicateInputs`
        The node already registered for the template.  If it is itself a
        `BuilderDuplicateInputs`, its children are adopted directly so the
        collection stays flat.
    new : `BuilderInput`
        The additional input that uses the same template.
    """
    def __init__(self, old: BuilderInput, new: BuilderInput):
        self._children: List[BuilderNode] = []
        if isinstance(old, BuilderDuplicateInputs):
            self._children.extend(old._children)
        else:
            self._children.append(old)
        self._children.append(new)
        self._messages: List[str] = []  # populated in prune()

    def build(self, parser: PathElementParser, allKeys: Dict[str, type], cumulativeKeys: Dict[str, type], *,
              fileIgnoreRegEx: Optional[re.Pattern], dirIgnoreRegEx: Optional[re.Pattern]
              ) -> PathElementHandler:
        # Docstring inherited from BuilderNode.
        message = "; ".join(self._messages) if self._messages else None
        # NOTE(review): isForFiles=False makes this handler apply to
        # directories, although duplicate inputs are leaf templates; confirm
        # this is intended before changing it.
        return SkipHandler(parser=parser, isForFiles=False, message=message)

    def prune(self) -> Tuple[BuilderNode, List[str], bool]:
        # Docstring inherited from BuilderNode.
        # Rebuild the message list from scratch so that calling prune() more
        # than once does not accumulate duplicate messages.
        messages: List[str] = []
        unprunable = []
        newChildren = []
        for child in self._children:
            newChild, childMessages, toPruneChild = child.prune()
            if toPruneChild:
                messages.extend(childMessages)
            else:
                unprunable.append(newChild)
            # Keep the (possibly replaced) child itself; appending the list
            # to itself here would corrupt self._children.
            newChildren.append(newChild)
        self._children = newChildren
        self._messages = messages
        if not unprunable:
            # All children are just skips, so we can prune this node if we
            # remember their messages.
            return self, self._messages, True
        if len(unprunable) == 1 and not self._messages:
            # Exactly one child is a target, and the others were ignored with
            # no warning messages.  Tell parent node to just use that child,
            # so if we see any matching files, we just assume they're for that
            # target.
            return unprunable[0], [], False
        # Multiple targets or skips with messages, which means we won't
        # know how to handle any matching files.  Replace any messages we
        # have with a single message that combines them all as well as
        # any target dataset types that they are ambiguous with.
        nested = [f"{c.datasetType.name} (target)" for c in unprunable]
        nested.extend(self._messages)
        self._messages = [f"ambiguous match: [{', '.join(nested)}]"]
        return self, self._messages, True

302 

303 

class BuilderTree(BuilderNode):
    """A `BuilderNode` standing for one directory level.

    No other `BuilderNode` class can have children.  When every child turns
    out to be prunable, the tree is swapped for a `BuilderPrunedTree` (which
    is then prunable in turn).  A tree that is not pruned builds a
    `SubdirectoryHandler`.
    """

    def __init__(self):
        # Keyed by the template of a single path element; each value is the
        # BuilderNode responsible for that element.
        self._children = {}

    def insert(self, level: int, leaf: BuilderInput):
        """Add an input leaf node to the tree, creating intermediate parent
        trees recursively so that it lands at the correct level.

        Parameters
        ----------
        level : `int`
            The level of ``self`` within the larger tree, where zero is the
            repository root.  The correct level for the leaf follows from
            the length of ``leaf.elements``.
        leaf : `BuilderInput`
            The leaf node to add.
        """
        element = leaf.elements[level]
        childLevel = level + 1
        if childLevel != len(leaf.elements):
            # Not at the leaf's own level yet: descend through (and create,
            # if necessary) the subtree for this path element.
            if element not in self._children:
                self._children[element] = BuilderTree()
            self._children[element].insert(childLevel, leaf)
            return
        existing = self._children.get(element)
        if existing is not None:
            # Sadly, the Gen2 butler has some actual dataset types that
            # use the exact same template.
            leaf = BuilderDuplicateInputs(existing, leaf)
        self._children[element] = leaf

    def fill(self, scanner: DirectoryScanner, allKeys: Dict[str, type], previousKeys: Dict[str, type], *,
             fileIgnoreRegEx: Optional[re.Pattern], dirIgnoreRegEx: Optional[re.Pattern]):
        """Populate a `DirectoryScanner` by recursively building every child
        node.

        Parameters
        ----------
        scanner : `DirectoryScanner`
            The object to populate.
        allKeys : `dict` [`str`, `type`]
            Mapping from Gen2 data ID key to its value type, covering every
            key that any child template might use.
        previousKeys : `dict` [`str`, `type`]
            Key strings and value types for the Gen2 data ID keys extracted
            from earlier path elements of the same template.
        fileIgnoreRegEx : `re.Pattern` or `None`
            If not `None`, an `IgnoreHandler` for files is added to
            ``scanner`` with this pattern; it is also passed on to every
            child's ``build``.
        dirIgnoreRegEx : `re.Pattern` or `None`
            If not `None`, an `IgnoreHandler` for directories is added to
            ``scanner`` with this pattern; it is also passed on to every
            child's ``build``.
        """
        # Ignore handlers are added ahead of the handlers built from children.
        if fileIgnoreRegEx is not None:
            scanner.add(IgnoreHandler(fileIgnoreRegEx, isForFiles=True))
        if dirIgnoreRegEx is not None:
            scanner.add(IgnoreHandler(dirIgnoreRegEx, isForFiles=False))
        for template, child in self._children.items():
            parser = PathElementParser(template, allKeys, previousKeys=previousKeys)
            # Keys known after this element: those from earlier elements
            # plus whatever this element's parser extracts.
            keysSoFar = previousKeys.copy()
            keysSoFar.update(parser.keys)
            handler = child.build(
                parser,
                allKeys,
                keysSoFar,
                fileIgnoreRegEx=fileIgnoreRegEx,
                dirIgnoreRegEx=dirIgnoreRegEx,
            )
            scanner.add(handler)

    def prune(self) -> Tuple[BuilderNode, List[str], bool]:
        # Docstring inherited from BuilderNode.
        replacements = {}
        collected = []
        mustKeep = False
        # Recursively prune children; a single unprunable child is enough
        # to keep this whole directory.
        for template, child in self._children.items():
            replacement, childMessages, childPrunable = child.prune()
            replacements[template] = replacement
            collected.extend(childMessages)
            if not childPrunable:
                mustKeep = True
        self._children = replacements
        if mustKeep:
            return self, [], False
        return BuilderPrunedTree(collected), collected, True

    def build(self, parser: PathElementParser, allKeys: Dict[str, type], cumulativeKeys: Dict[str, type], *,
              fileIgnoreRegEx: Optional[re.Pattern], dirIgnoreRegEx: Optional[re.Pattern]
              ) -> PathElementHandler:
        # Docstring inherited from BuilderNode.
        handler = SubdirectoryHandler(parser)
        self.fill(
            handler.scanner,
            allKeys,
            cumulativeKeys,
            fileIgnoreRegEx=fileIgnoreRegEx,
            dirIgnoreRegEx=dirIgnoreRegEx,
        )
        return handler