Coverage for python/lsst/obs/base/gen2to3/repoWalker/builders.py: 36%
137 statements
« prev ^ index » next coverage.py v6.4.1, created at 2022-06-28 02:15 -0700
« prev ^ index » next coverage.py v6.4.1, created at 2022-06-28 02:15 -0700
1# This file is part of obs_base.
2#
3# Developed for the LSST Data Management System.
4# This product includes software developed by the LSST Project
5# (http://www.lsst.org).
6# See the COPYRIGHT file at the top-level directory of this distribution
7# for details of code ownership.
8#
9# This program is free software: you can redistribute it and/or modify
10# it under the terms of the GNU General Public License as published by
11# the Free Software Foundation, either version 3 of the License, or
12# (at your option) any later version.
13#
14# This program is distributed in the hope that it will be useful,
15# but WITHOUT ANY WARRANTY; without even the implied warranty of
16# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
17# GNU General Public License for more details.
18#
19# You should have received a copy of the GNU General Public License
20# along with this program. If not, see <http://www.gnu.org/licenses/>.
21"""Classes used in `RepoWalker` construction.
23The objects here form a temporary tree that is pruned and then transformed
24into a similar tree of `PathElementHandler` instances. See `BuilderNode`
25method documentation for more information.
26"""
27from __future__ import annotations
29__all__ = ["BuilderSkipInput", "BuilderTargetInput", "BuilderTree"]
31import os
32import re
33from abc import ABC, abstractmethod
34from typing import Any, Dict, List, Optional, Tuple
36from lsst.daf.butler import DatasetType, DimensionUniverse, FormatterParameter, Progress, StorageClass
38from ..translators import TranslatorFactory
39from .handlers import IgnoreHandler, SkipHandler, SubdirectoryHandler, TargetFileHandler
40from .parser import PathElementParser
41from .scanner import DirectoryScanner, PathElementHandler
class BuilderNode(ABC):
    """Common interface for every node of the temporary tree from which a
    `RepoWalker` is constructed.

    Concrete nodes are first pruned (`prune`) and then converted into
    `PathElementHandler` instances (`build`).
    """

    @abstractmethod
    def prune(self) -> Tuple[BuilderNode, List[str], bool]:
        """Try to prune this node, along with its children, from the tree.

        Returns
        -------
        replacement : `BuilderNode`
            The node that results from recursively pruning any children;
            in many cases this is simply ``self``.
        messages : `list` [`str`]
            Warning messages a parent node should log when it encounters a
            matching path element, in the event that this node is pruned.
        prune : `bool`
            `True` when this node is allowed to be pruned from the tree.
            That does not guarantee it will be: it may correspond to a path
            element that should be skipped while its siblings should not.
        """
        raise NotImplementedError()

    @abstractmethod
    def build(
        self,
        parser: PathElementParser,
        allKeys: Dict[str, type],
        cumulativeKeys: Dict[str, type],
        *,
        fileIgnoreRegEx: Optional[re.Pattern],
        dirIgnoreRegEx: Optional[re.Pattern],
    ) -> PathElementHandler:
        """Convert this build-tree node into the `PathElementHandler` that
        corresponds to it, recursing into any children.

        This must only be called after `prune`.

        Parameters
        ----------
        parser : `PathElementParser`
            Object that matches the path element the new handler is
            responsible for and extracts a (partial) Gen2 data ID from it.
        allKeys : `dict` [`str`, `type`]
            Mapping from Gen2 data ID key to the type of its value. It holds
            every key that the given parser may extract, and possibly more.
        cumulativeKeys : `dict` [`str`, `type`]
            Key strings and value types for the Gen2 data ID keys extracted
            from earlier path elements of this template, including the ones
            extracted by ``parser``.
        fileIgnoreRegEx : `re.Pattern` or `None`
            Pattern identifying non-dataset files that can be ignored, to be
            applied at all levels of the directory tree.
        dirIgnoreRegEx : `re.Pattern` or `None`
            Pattern identifying non-dataset subdirectories that can be
            ignored, to be applied at all levels of the directory tree.

        Returns
        -------
        handler : `PathElementHandler`
            The newly constructed handler.
        """
        raise NotImplementedError()
class BuilderInput(BuilderNode):
    """Intermediate base class for the `BuilderNode` types that are handed
    directly to a `RepoWalker` as inputs; each generally corresponds to
    exactly one Gen2 dataset type.

    Parameters
    ----------
    template : `str`
        The full Gen2 template to match (the whole path, not the template
        of a single path element).
    keys : `dict` [`str`, `type`]
        Mapping from Gen2 data ID key to the type of its value.
    """

    template: str
    """The full Gen2 template to match (`str`).
    """

    keys: Dict[str, type]
    """Mapping from Gen2 data ID key to the type of its value
    (`dict` [`str`, `type`]).
    """

    elements: List[str]
    """The individual path elements (file or directory levels) that make up
    `template` (`list` of `str`).
    """

    def __init__(self, template: str, keys: Dict[str, type]):
        self.template = template
        self.keys = keys
        # One entry per directory/file level of the template.
        self.elements = template.split(os.path.sep)
class BuilderSkipInput(BuilderInput):
    """A `RepoWalker` input declaring that matching files are to be skipped,
    optionally emitting a warning message.

    BuilderSkipInputs can be pruned. When they survive pruning they build
    `SkipHandler` instances.

    Parameters
    ----------
    template : `str`
        The full Gen2 template to match (the whole path, not the template
        of a single path element).
    keys : `dict` [`str`, `type`]
        Mapping from Gen2 data ID key to the type of its value.
    message : `str`, optional
        When not `None`, a warning message to print either when a matching
        file is encountered or when a directory that may contain such files
        is skipped.
    isForFiles : `bool`, optional
        If `True` (the default) the handler should be run on files;
        otherwise it should be run on directories.
    """

    def __init__(
        self, template: str, keys: Dict[str, type], message: Optional[str] = None, *, isForFiles: bool = True
    ):
        super().__init__(template=template, keys=keys)
        self._isForFiles = isForFiles
        self._message = message

    def prune(self) -> Tuple[BuilderNode, List[str], bool]:
        # Docstring inherited from BuilderNode.
        # A skip is always prunable; it contributes its message, if any.
        warnings = [] if self._message is None else [self._message]
        return self, warnings, True

    def build(
        self,
        parser: PathElementParser,
        allKeys: Dict[str, type],
        cumulativeKeys: Dict[str, type],
        *,
        fileIgnoreRegEx: Optional[re.Pattern],
        dirIgnoreRegEx: Optional[re.Pattern],
    ) -> PathElementHandler:
        # Docstring inherited from BuilderNode.
        handler = SkipHandler(parser=parser, isForFiles=self._isForFiles, message=self._message)
        return handler
class BuilderTargetInput(BuilderInput):
    """A `RepoWalker` input matching the files of a dataset type that we
    want to extract.

    BuilderTargetInputs are never prunable, and they build an instance of
    the target handler (`TargetFileHandler` unless overridden).

    Parameters
    ----------
    datasetTypeName : `str`
        Name of the dataset type.
    template : `str`
        Full Gen2 filename template.
    keys : `dict` [`str`, `type`]
        Mapping from Gen2 data ID key to the type of its value.
    storageClass : `StorageClass`
        `StorageClass` of the Gen3 dataset type.
    universe : `DimensionUniverse`
        All candidate dimensions for the Gen3 dataset type.
    formatter : `lsst.daf.butler.Formatter` or `str`, optional
        A Gen3 formatter class or its fully-qualified name.
    translatorFactory : `TranslatorFactory`
        Object used to construct data ID translators.
    targetHandler : `PathElementHandler`, optional
        Override for the target handler of this dataset type. It is called
        with ``parser``, ``translator``, ``datasetType`` and ``formatter``
        keyword arguments when the handler is built.
    **kwargs:
        Additional keyword arguments forwarded to
        ``translatorFactory.makeMatching`` along with ``datasetTypeName``
        and ``keys``.
    """

    datasetType: DatasetType
    """The Gen3 dataset type extracted by the handler this object builds
    (`lsst.daf.butler.DatasetType`).
    """

    def __init__(
        self,
        *,
        datasetTypeName: str,
        template: str,
        keys: Dict[str, type],
        storageClass: StorageClass,
        universe: DimensionUniverse,
        formatter: FormatterParameter,
        translatorFactory: TranslatorFactory,
        targetHandler: Optional[PathElementHandler] = None,
        **kwargs: Any,
    ):
        # Drop any trailing [%HDU] identifier (seen in e.g. DECAM Community
        # Pipeline products); only the text before it is a path template.
        pathTemplate = template.partition("[%(")[0]
        super().__init__(template=pathTemplate, keys=keys)
        translator = translatorFactory.makeMatching(datasetTypeName, keys, **kwargs)
        self._translator = translator
        # Datasets whose Gen2 data ID carries "calibDate" are calibrations.
        isCalibration = "calibDate" in keys
        self.datasetType = DatasetType(
            datasetTypeName,
            dimensions=translator.dimensionNames,
            storageClass=storageClass,
            universe=universe,
            isCalibration=isCalibration,
        )
        self._formatter = formatter
        self._handler = TargetFileHandler if targetHandler is None else targetHandler

    def prune(self) -> Tuple[BuilderNode, List[str], bool]:
        # Docstring inherited from BuilderNode.
        # Targets must always be kept and never carry warning messages.
        return self, [], False

    def build(
        self,
        parser: PathElementParser,
        allKeys: Dict[str, type],
        cumulativeKeys: Dict[str, type],
        *,
        fileIgnoreRegEx: Optional[re.Pattern],
        dirIgnoreRegEx: Optional[re.Pattern],
    ) -> PathElementHandler:
        # Docstring inherited from BuilderNode.
        makeHandler = self._handler
        return makeHandler(
            parser=parser,
            translator=self._translator,
            datasetType=self.datasetType,
            formatter=self._formatter,
        )
class BuilderPrunedTree(BuilderNode):
    """A `BuilderNode` standing for a subdirectory that is to be skipped; it
    is what a `BuilderTree` turns into when every one of its children could
    be pruned.

    BuilderPrunedTrees can themselves be pruned. When they are not, they
    build `SkipHandler` instances that apply to directories.

    Parameters
    ----------
    messages : `list` [`str`]
        Warning messages to print when the handler produced by this builder
        matches a subdirectory.
    """

    def __init__(self, messages: List[str]):
        self._messages = messages

    def prune(self) -> Tuple[BuilderNode, List[str], bool]:
        # Docstring inherited from BuilderNode.
        return self, self._messages, True

    def build(
        self,
        parser: PathElementParser,
        allKeys: Dict[str, type],
        cumulativeKeys: Dict[str, type],
        *,
        fileIgnoreRegEx: Optional[re.Pattern],
        dirIgnoreRegEx: Optional[re.Pattern],
    ) -> PathElementHandler:
        # Docstring inherited from BuilderNode.
        # Merge all warnings into one string; no warnings means no message.
        combined = None
        if self._messages:
            combined = "; ".join(self._messages)
        return SkipHandler(parser=parser, isForFiles=False, message=combined)
class BuilderDuplicateInputs(BuilderNode):
    """A `BuilderNode` that represents a collection of `BuilderInput`
    instances that all have the same template.

    Parameters
    ----------
    old : `BuilderNode`
        The node already registered for the template. If it is itself a
        `BuilderDuplicateInputs`, its children are adopted directly so that
        duplicates are kept in one flat list instead of being nested.
    new : `BuilderInput`
        The newly inserted input that uses the same template.
    """

    def __init__(self, old: BuilderNode, new: BuilderInput):
        self._children: List[BuilderNode] = []
        if isinstance(old, BuilderDuplicateInputs):
            self._children.extend(old._children)
        else:
            self._children.append(old)
        self._children.append(new)
        self._messages: List[str] = []  # populated in prune()

    def build(
        self,
        parser: PathElementParser,
        allKeys: Dict[str, type],
        cumulativeKeys: Dict[str, type],
        *,
        fileIgnoreRegEx: Optional[re.Pattern],
        dirIgnoreRegEx: Optional[re.Pattern],
    ) -> PathElementHandler:
        # Docstring inherited from BuilderNode.
        # NOTE(review): the handler is built with isForFiles=False although
        # duplicates are inserted as leaf inputs - confirm this is intended.
        message = "; ".join(self._messages) if self._messages else None
        return SkipHandler(parser=parser, isForFiles=False, message=message)

    def prune(self) -> Tuple[BuilderNode, List[str], bool]:
        # Docstring inherited from BuilderNode.
        unprunable = []
        newChildren = []
        # Collect into a fresh list so that calling prune() more than once
        # does not duplicate (or nest) messages gathered by an earlier call.
        messages: List[str] = []
        for child in self._children:
            newChild, childMessages, toPruneChild = child.prune()
            if toPruneChild:
                messages.extend(childMessages)
            else:
                unprunable.append(newChild)
            # Keep the pruned replacement of every child, prunable or not.
            newChildren.append(newChild)
        self._children = newChildren
        self._messages = messages
        if not unprunable:
            # All children are just skips, so we can prune this node if we
            # remember their messages.
            return self, self._messages, True
        if len(unprunable) == 1 and not messages:
            # Exactly one child is a target, and the others were ignored with
            # no warning messages. Tell parent node to just use that child,
            # so if we see any matching files, we just assume they're for
            # that target.
            return unprunable[0], [], False
        # Multiple targets or skips with messages, which means we won't
        # know how to handle any matching files. Replace any messages we
        # have with a single message that combines them all as well as
        # any target dataset types that they are ambiguous with.
        nested = [f"{c.datasetType.name} (target)" for c in unprunable]
        nested.extend(messages)
        self._messages = [f"ambiguous match: [{', '.join(nested)}]"]
        return self, self._messages, True
class BuilderTree(BuilderNode):
    """A `BuilderNode` standing for a directory.

    Among the `BuilderNode` classes this is the only one that is not a leaf.
    When all of its children are prunable it is replaced by a
    `BuilderPrunedTree` (which may in turn be pruned). When it is not
    pruned it builds `SubdirectoryHandler` instances.

    Parameters
    ----------
    progress : `Progress`, optional
        Object used to report incremental progress.
    """

    MAX_PROGRESS_LEVEL: int = 2
    """Deepest directory level at which progress bars are created.
    """

    def __init__(self, progress: Optional[Progress] = None):
        self.progress = progress
        # Keyed by the template of one path element; values are BuilderNodes.
        self._children = {}

    def insert(self, level: int, leaf: BuilderInput):
        """Add an input leaf node to the tree, recursively creating whatever
        intermediate parents are needed to place it at the right level.

        Parameters
        ----------
        level : `int`
            The level of ``self`` within the larger tree, where zero is the
            repository root. The proper level for the leaf follows from the
            length of ``leaf.elements``.
        leaf : `BuilderInput`
            The leaf node to insert.
        """
        element = leaf.elements[level]
        nextLevel = level + 1
        if nextLevel != len(leaf.elements):
            # This element is an intermediate directory: descend into its
            # subtree, creating the subtree first if it does not exist yet.
            if element not in self._children:
                progress = self.progress if nextLevel <= self.MAX_PROGRESS_LEVEL else None
                self._children[element] = BuilderTree(progress)
            self._children[element].insert(nextLevel, leaf)
            return
        # This element is the final one, so the leaf belongs right here.
        existing = self._children.get(element)
        if existing is not None:
            # Sadly, the Gen2 butler has some actual dataset types that
            # use the exact same template.
            leaf = BuilderDuplicateInputs(existing, leaf)
        self._children[element] = leaf

    def fill(
        self,
        scanner: DirectoryScanner,
        allKeys: Dict[str, type],
        previousKeys: Dict[str, type],
        *,
        fileIgnoreRegEx: Optional[re.Pattern],
        dirIgnoreRegEx: Optional[re.Pattern],
    ):
        """Populate a `DirectoryScanner` by recursively building every
        child node.

        Parameters
        ----------
        scanner : `DirectoryScanner`
            Object to populate.
        allKeys : `dict` [`str`, `type`]
            Mapping from Gen2 data ID key to its value type, covering all
            keys that could be used in any child template.
        previousKeys : `dict` [`str`, `type`]
            Key strings and value types for the Gen2 data ID keys already
            extracted from earlier path elements of the same template.
        fileIgnoreRegEx : `re.Pattern` or `None`
            Pattern identifying non-dataset files that can be ignored, to be
            applied at all levels of the directory tree.
        dirIgnoreRegEx : `re.Pattern` or `None`
            Pattern identifying non-dataset subdirectories that can be
            ignored, to be applied at all levels of the directory tree.
        """
        # The ignore handlers are added to the scanner before any child
        # handler.
        if fileIgnoreRegEx is not None:
            scanner.add(IgnoreHandler(fileIgnoreRegEx, isForFiles=True))
        if dirIgnoreRegEx is not None:
            scanner.add(IgnoreHandler(dirIgnoreRegEx, isForFiles=False))
        for template, child in self._children.items():
            parser = PathElementParser(template, allKeys, previousKeys=previousKeys)
            # Keys known once this element is parsed: everything from the
            # earlier elements plus whatever this parser extracts.
            cumulativeKeys = previousKeys.copy()
            cumulativeKeys.update(parser.keys)
            handler = child.build(
                parser,
                allKeys,
                cumulativeKeys,
                fileIgnoreRegEx=fileIgnoreRegEx,
                dirIgnoreRegEx=dirIgnoreRegEx,
            )
            scanner.add(handler)

    def prune(self) -> Tuple[BuilderNode, List[str], bool]:
        # Docstring inherited from BuilderNode.
        prunedChildren = {}
        collected = []
        allPrunable = True
        # Prune every child, keeping each replacement under its template.
        for template, child in self._children.items():
            replacement, childMessages, childPrunable = child.prune()
            prunedChildren[template] = replacement
            collected.extend(childMessages)
            if not childPrunable:
                allPrunable = False
        self._children = prunedChildren
        if not allPrunable:
            # At least one child must be kept, so this directory stays too.
            return self, [], False
        return BuilderPrunedTree(collected), collected, True

    def build(
        self,
        parser: PathElementParser,
        allKeys: Dict[str, type],
        cumulativeKeys: Dict[str, type],
        *,
        fileIgnoreRegEx: Optional[re.Pattern],
        dirIgnoreRegEx: Optional[re.Pattern],
    ) -> PathElementHandler:
        # Docstring inherited from BuilderNode.
        handler = SubdirectoryHandler(parser, progress=self.progress)
        # The subdirectory's own scanner receives handlers for our children.
        self.fill(
            handler.scanner,
            allKeys,
            cumulativeKeys,
            fileIgnoreRegEx=fileIgnoreRegEx,
            dirIgnoreRegEx=dirIgnoreRegEx,
        )
        return handler