Coverage for python/lsst/obs/base/gen2to3/repoWalker/builders.py : 28%

# This file is part of obs_base.
#
# Developed for the LSST Data Management System.
# This product includes software developed by the LSST Project
# (http://www.lsst.org).
# See the COPYRIGHT file at the top-level directory of this distribution
# for details of code ownership.
#
# This program is free software: you can redistribute it and/or modify
# it under the terms of the GNU General Public License as published by
# the Free Software Foundation, either version 3 of the License, or
# (at your option) any later version.
#
# This program is distributed in the hope that it will be useful,
# but WITHOUT ANY WARRANTY; without even the implied warranty of
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
# GNU General Public License for more details.
#
# You should have received a copy of the GNU General Public License
# along with this program. If not, see <http://www.gnu.org/licenses/>.
"""Classes used in `RepoWalker` construction.

The objects here form a temporary tree that is pruned and then transformed
into a similar tree of `PathElementHandler` instances. See `BuilderNode`
method documentation for more information.
"""
from __future__ import annotations

__all__ = ["BuilderSkipInput", "BuilderTargetInput", "BuilderTree"]

from abc import ABC, abstractmethod
import os
import re
from typing import (
    Any,
    Dict,
    List,
    Optional,
    Tuple,
)

from lsst.daf.butler import DatasetType, DimensionUniverse, StorageClass, FormatterParameter
from ..translators import TranslatorFactory
from .parser import PathElementParser
from .scanner import PathElementHandler, DirectoryScanner
from .handlers import (IgnoreHandler, SubdirectoryHandler, SkipHandler,
                       TargetFileHandler)


class BuilderNode(ABC):
    """Abstract interface for nodes in the temporary tree that is used to
    construct a `RepoWalker`.
    """

    @abstractmethod
    def prune(self) -> Tuple[BuilderNode, List[str], bool]:
        """Attempt to prune this node and its children from the tree.

        Returns
        -------
        replacement : `BuilderNode`
            The result of recursively pruning child nodes; often just ``self``.
        messages : `list` [`str`]
            Warning messages that should be logged by a parent node when a
            matching path element is encountered, if this node is pruned.
        prune : `bool`
            If `True`, this node may be pruned from the tree (but will not
            necessarily be - it may correspond to a path element that should
            be skipped with siblings that should not be).
        """
        raise NotImplementedError()

    @abstractmethod
    def build(self, parser: PathElementParser, allKeys: Dict[str, type], cumulativeKeys: Dict[str, type], *,
              fileIgnoreRegEx: Optional[re.Pattern], dirIgnoreRegEx: Optional[re.Pattern]
              ) -> PathElementHandler:
        """Transform this node in the build tree into a corresponding
        `PathElementHandler`, recursing to any children.

        Must be called after `prune`.

        Parameters
        ----------
        parser : `PathElementParser`
            An object that matches the path element the new handler is
            responsible for and extracts a (partial) Gen2 data ID from it.
        allKeys : `dict` [`str`, `type`]
            A mapping from Gen2 data ID key to the type of its value. Will
            contain all keys that may be extracted by the given parser, and
            possibly others.
        cumulativeKeys : `dict` [`str`, `type`], optional
            A dictionary containing key strings and types for Gen2 data ID keys
            that have been extracted from previous path elements for this
            template, including those extracted by ``parser``.

        Returns
        -------
        handler : `PathElementHandler`
            A new handler object.
        """
        raise NotImplementedError()


class BuilderInput(BuilderNode):
    """An intermediate base for `BuilderNode` classes that are provided as
    direct inputs to a `RepoWalker`, and generally correspond to exactly one
    Gen2 dataset type.

    Parameters
    ----------
    template : `str`
        The complete Gen2 template to be matched (not just the template for
        one path element).
    keys : `dict` [`str`, `type`]
        A mapping from Gen2 data ID key to the type of its value.
    """
    def __init__(self, template: str, keys: Dict[str, type]):
        self.template = template
        self.keys = keys
        self.elements = self.template.split(os.path.sep)

    template: str
    """The complete Gen2 template to be matched (`str`).
    """

    keys: Dict[str, type]
    """A mapping from Gen2 data ID key to the type of its value
    (`dict` [`str`, `type`]).
    """

    elements: List[str]
    """The path elements (file or directory levels) of `template`
    (`list` of `str`).
    """


class BuilderSkipInput(BuilderInput):
    """An input to a `RepoWalker` that indicates that matched files should be
    skipped, possibly with a warning message.

    BuilderSkipInputs can be pruned. When they are not pruned, they build
    `SkipHandler` instances.

    Parameters
    ----------
    template : `str`
        The complete Gen2 template to be matched (not just the template for
        one path element).
    keys : `dict` [`str`, `type`]
        A mapping from Gen2 data ID key to the type of its value.
    message : `str`, optional
        If not `None`, a warning message that should be printed either when a
        matching file is encountered or a directory that may contain such files
        is skipped.
    isForFiles : `bool`, optional
        If `True` (default), this handler should be run on files. Otherwise it
        should be run on directories.
    """
    def __init__(self, template: str, keys: Dict[str, type], message: Optional[str] = None, *,
                 isForFiles: bool = True):
        super().__init__(template=template, keys=keys)
        self._message = message
        self._isForFiles = isForFiles

    def build(self, parser: PathElementParser, allKeys: Dict[str, type], cumulativeKeys: Dict[str, type], *,
              fileIgnoreRegEx: Optional[re.Pattern], dirIgnoreRegEx: Optional[re.Pattern]
              ) -> PathElementHandler:
        # Docstring inherited from BuilderNode.
        return SkipHandler(parser=parser, isForFiles=self._isForFiles, message=self._message)

    def prune(self) -> Tuple[BuilderNode, List[str], bool]:
        # Docstring inherited from BuilderNode.
        return self, [self._message] if self._message is not None else [], True


class BuilderTargetInput(BuilderInput):
    """An input to a `RepoWalker` that matches files that correspond to
    datasets that we want to extract.

    BuilderTargetInputs can never be pruned, and always build
    `TargetFileHandler` instances.

    Parameters
    ----------
    datasetTypeName : `str`
        Name of the dataset type.
    template : `str`
        Full Gen2 filename template.
    keys : `dict` [`str`, `type`]
        Dictionary that maps Gen2 data ID key to the type of its value.
    storageClass : `StorageClass`
        `StorageClass` for the Gen3 dataset type.
    universe : `DimensionUniverse`
        All candidate dimensions for the Gen3 dataset type.
    formatter : `lsst.daf.butler.Formatter` or `str`, optional
        A Gen 3 formatter class or fully-qualified name.
    translatorFactory : `TranslatorFactory`
        Object that can be used to construct data ID translators.
    targetHandler : `PathElementHandler`, optional
        Override target handler for this dataset type.
    **kwargs:
        Additional keyword arguments are passed to `Translator.makeMatching`,
        along with ``datasetTypeName`` and ``keys``.
204 """
205 def __init__(self, *, datasetTypeName: str, template: str, keys: Dict[str, type],
206 storageClass: StorageClass, universe: DimensionUniverse,
207 formatter: FormatterParameter, translatorFactory: TranslatorFactory,
208 targetHandler: Optional[PathElementHandler] = None,
209 **kwargs: Any):
210 # strip off [%HDU] identifiers from e.g. DECAM Community Pipeline products
211 template = template.split('[%(')[0]
212 super().__init__(template=template, keys=keys)
213 self._translator = translatorFactory.makeMatching(datasetTypeName, keys, **kwargs)
214 self.datasetType = DatasetType(datasetTypeName, dimensions=self._translator.dimensionNames,
215 storageClass=storageClass, universe=universe,
216 isCalibration=("calibDate" in keys))
217 self._formatter = formatter
218 if targetHandler is None:
219 targetHandler = TargetFileHandler
220 self._handler = targetHandler
222 def build(self, parser: PathElementParser, allKeys: Dict[str, type], cumulativeKeys: Dict[str, type], *,
223 fileIgnoreRegEx: Optional[re.Pattern], dirIgnoreRegEx: Optional[re.Pattern]
224 ) -> PathElementHandler:
225 # Docstring inherited from BuilderNode.
226 return self._handler(parser=parser, translator=self._translator, datasetType=self.datasetType,
227 formatter=self._formatter)
229 def prune(self) -> Tuple[BuilderNode, List[str], bool]:
230 # Docstring inherited from BuilderNode.
231 return self, [], False
233 datasetType: DatasetType
234 """The Gen3 dataset type extracted by the handler this object builds
235 (`lsst.daf.butler.DatasetType`).
236 """


class BuilderPrunedTree(BuilderNode):
    """A `BuilderNode` that represents a subdirectory to be skipped,
    created by pruning a `BuilderTree` that contained only `BuilderSkipInput`
    instances.

    BuilderPrunedTrees can be pruned. When they are not pruned, they
    build `SkipHandler` instances.

    Parameters
    ----------
    messages : `list` [`str`]
        A list of warning messages to be printed when the handler produced by
        this builder matches a subdirectory.
    """

    def __init__(self, messages: List[str]):
        self._messages = messages

    def build(self, parser: PathElementParser, allKeys: Dict[str, type], cumulativeKeys: Dict[str, type], *,
              fileIgnoreRegEx: Optional[re.Pattern], dirIgnoreRegEx: Optional[re.Pattern]
              ) -> PathElementHandler:
        # Docstring inherited from BuilderNode.
        message = "; ".join(self._messages) if self._messages else None
        return SkipHandler(parser=parser, isForFiles=False, message=message)

    def prune(self) -> Tuple[BuilderNode, List[str], bool]:
        # Docstring inherited from BuilderNode.
        return self, self._messages, True


class BuilderDuplicateInputs(BuilderNode):
    """A `BuilderNode` that represents a collection of `BuilderInput` instances
    that all have the same template.
    """
    def __init__(self, old: BuilderInput, new: BuilderInput):
        self._children = []
        if isinstance(old, BuilderDuplicateInputs):
            self._children.extend(old._children)
        else:
            self._children.append(old)
        self._children.append(new)
        self._messages = []  # populated in prune()

    def build(self, parser: PathElementParser, allKeys: Dict[str, type], cumulativeKeys: Dict[str, type], *,
              fileIgnoreRegEx: Optional[re.Pattern], dirIgnoreRegEx: Optional[re.Pattern]
              ) -> PathElementHandler:
        # Docstring inherited from BuilderNode.
        message = "; ".join(self._messages) if self._messages else None
        return SkipHandler(parser=parser, isForFiles=False, message=message)

    def prune(self) -> Tuple[BuilderNode, List[str], bool]:
        # Docstring inherited from BuilderNode.
        unprunable = []
        newChildren = []
        for child in self._children:
            newChild, childMessages, toPruneChild = child.prune()
            if toPruneChild:
                self._messages.extend(childMessages)
            else:
                unprunable.append(newChild)
            newChildren.append(newChild)
        self._children = newChildren
        if len(unprunable) == 0:
            # All children are just skips, so we can prune this node if we
            # remember their messages.
            return self, self._messages, True
        elif len(unprunable) == 1 and not self._messages:
            # Exactly one child is a target, and the others were ignored with
            # no warning messages. Tell parent node to just use that child,
            # so if we see any matching files, we just assume they're for that
            # target.
            return unprunable[0], [], False
        else:
            # Multiple targets or skips with messages, which means we won't
            # know how to handle any matching files. Replace any messages we
            # have with a single message that combines them all as well as
            # any target dataset types that they are ambiguous with.
            nested = [f"{c.datasetType.name} (target)" for c in unprunable]
            nested.extend(self._messages)
            self._messages = [f"ambiguous match: [{', '.join(nested)}]"]
            return self, self._messages, True
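
# Illustrative note (not part of the original module): given two inputs that
# share a hypothetical template like "%(filter)s/psf-%(visit)d.fits",
# BuilderDuplicateInputs.prune() resolves them as follows:
#
#   * all children are prunable skips      -> (self, combined messages, True)
#   * one target, only silent skips        -> (that target node, [], False)
#   * several targets or skips w/ messages -> (self, ["ambiguous match: [...]"], True)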


class BuilderTree(BuilderNode):
    """A `BuilderNode` that represents a directory.

    This is the only `BuilderNode` class that is not a leaf node. If all
    of its children can be pruned, it is replaced by a `BuilderPrunedTree`
    (which can then be pruned itself). It builds `SubdirectoryHandler`
    instances when not pruned.
    """
    def __init__(self):
        self._children = {}  # Maps template path element to BuilderNode

    def insert(self, level: int, leaf: BuilderInput):
        """Insert an input leaf node into the tree, recursively constructing
        intermediate parents in order to put it at the right level.

        Parameters
        ----------
        level : `int`
            The level ``self`` is at in the larger tree, with zero being the
            repository root. The right level for the leaf is given by the
            length of ``leaf.elements``.
        leaf : `BuilderInput`
            The leaf node to insert.
        """
        nextLevel = level + 1
        element = leaf.elements[level]
        if nextLevel == len(leaf.elements):
            conflict = self._children.get(element)
            if conflict is not None:
                # Sadly, the Gen2 butler has some actual dataset types that
                # use the exact same template.
                leaf = BuilderDuplicateInputs(conflict, leaf)
            self._children[element] = leaf
        else:
            child = self._children.setdefault(element, BuilderTree())
            child.insert(nextLevel, leaf)
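
    # Illustrative note (not part of the original module): for a hypothetical
    # leaf whose template is "%(filter)s/raw-%(visit)d.fits",
    # ``leaf.elements`` is ["%(filter)s", "raw-%(visit)d.fits"]. Calling
    # ``insert(0, leaf)`` on the root creates a nested BuilderTree under the
    # "%(filter)s" element and recurses, so the leaf itself ends up keyed by
    # "raw-%(visit)d.fits" one level down; a second leaf ending in the same
    # element would be wrapped in BuilderDuplicateInputs instead.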

    def fill(self, scanner: DirectoryScanner, allKeys: Dict[str, type], previousKeys: Dict[str, type], *,
             fileIgnoreRegEx: Optional[re.Pattern], dirIgnoreRegEx: Optional[re.Pattern]):
        """Fill a `DirectoryScanner` instance by recursively building all
        child nodes.

        Parameters
        ----------
        scanner : `DirectoryScanner`
            Object to populate.
        allKeys : `dict` [`str`, `type`]
            Mapping from Gen2 data ID key to its value type, covering all keys
            that could be used in any child template.
        previousKeys : `dict` [`str`, `type`], optional
            A dictionary containing key strings and types for Gen2 data ID keys
            that have been extracted from previous path elements of the same
            template.
        fileIgnoreRegEx : `re.Pattern`, optional
            A regular expression pattern that identifies non-dataset files that
            can be ignored, to be applied at all levels of the directory tree.
        dirIgnoreRegEx : `re.Pattern`, optional
            A regular expression pattern that identifies non-dataset
            subdirectories that can be ignored, to be applied at all levels of
            the directory tree.
        """
        if fileIgnoreRegEx is not None:
            scanner.add(IgnoreHandler(fileIgnoreRegEx, isForFiles=True))
        if dirIgnoreRegEx is not None:
            scanner.add(IgnoreHandler(dirIgnoreRegEx, isForFiles=False))
        for template, child in self._children.items():
            parser = PathElementParser(template, allKeys, previousKeys=previousKeys)
            cumulativeKeys = previousKeys.copy()
            cumulativeKeys.update(parser.keys)
            scanner.add(child.build(parser, allKeys, cumulativeKeys, fileIgnoreRegEx=fileIgnoreRegEx,
                                    dirIgnoreRegEx=dirIgnoreRegEx))

    def prune(self) -> Tuple[BuilderNode, List[str], bool]:
        # Docstring inherited from BuilderNode.
        toPruneThis = True
        newChildren = {}
        messages = []
        # Recursively prune children.
        for template, child in list(self._children.items()):
            newChild, childMessages, toPruneChild = child.prune()
            newChildren[template] = newChild
            messages.extend(childMessages)
            if not toPruneChild:
                toPruneThis = False
        self._children = newChildren
        if toPruneThis:
            return BuilderPrunedTree(messages), messages, True
        else:
            return self, [], False

    def build(self, parser: PathElementParser, allKeys: Dict[str, type], cumulativeKeys: Dict[str, type], *,
              fileIgnoreRegEx: Optional[re.Pattern], dirIgnoreRegEx: Optional[re.Pattern]
              ) -> PathElementHandler:
        # Docstring inherited from BuilderNode.
        built = SubdirectoryHandler(parser)
        self.fill(built.scanner, allKeys, cumulativeKeys, fileIgnoreRegEx=fileIgnoreRegEx,
                  dirIgnoreRegEx=dirIgnoreRegEx)
        return built
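

# Illustrative usage sketch (not part of the original module): the real wiring
# lives in the `RepoWalker` class, but the builder classes above can be
# exercised on their own. The template, keys, and message below are
# hypothetical.
def _exampleSkipOnlyTree() -> BuilderNode:
    """Sketch: build and prune a tiny tree holding one skipped dataset type."""
    tree = BuilderTree()
    skip = BuilderSkipInput(
        template=os.path.join("%(filter)s", "psf-%(visit)d.fits"),
        keys={"filter": str, "visit": int},
        message="hypothetical dataset type is not migrated",
    )
    # Leaves are inserted at level 0 (the repository root); insert() creates
    # the intermediate BuilderTree node for the "%(filter)s" directory element.
    tree.insert(0, skip)
    # Because every leaf here is a skip, prune() offers a BuilderPrunedTree
    # replacement that remembers the warning message.
    replacement, messages, canPrune = tree.prune()
    assert canPrune and messages == ["hypothetical dataset type is not migrated"]
    return replacement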