Coverage for python/lsst/obs/base/gen2to3/repoWalker/walker.py: 28%
38 statements
« prev ^ index » next coverage.py v7.2.1, created at 2023-03-12 01:53 -0800
« prev ^ index » next coverage.py v7.2.1, created at 2023-03-12 01:53 -0800
1# This file is part of obs_base.
2#
3# Developed for the LSST Data Management System.
4# This product includes software developed by the LSST Project
5# (http://www.lsst.org).
6# See the COPYRIGHT file at the top-level directory of this distribution
7# for details of code ownership.
8#
9# This program is free software: you can redistribute it and/or modify
10# it under the terms of the GNU General Public License as published by
11# the Free Software Foundation, either version 3 of the License, or
12# (at your option) any later version.
13#
14# This program is distributed in the hope that it will be useful,
15# but WITHOUT ANY WARRANTY; without even the implied warranty of
16# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
17# GNU General Public License for more details.
18#
19# You should have received a copy of the GNU General Public License
20# along with this program. If not, see <http://www.gnu.org/licenses/>.
21"""High-level interface to the Gen2 repository-walking functionality defined
22by this package.
23"""
24from __future__ import annotations
26__all__ = ["RepoWalker"]
28from collections import defaultdict
29import logging
30import re
31from typing import (
32 Callable,
33 ClassVar,
34 Dict,
35 Iterable,
36 List,
37 Mapping,
38 Optional,
39 Union,
40)
42from lsst.daf.butler import (
43 DataCoordinate,
44 DatasetType,
45 FileDataset,
46 Progress,
47)
48from .builders import BuilderTargetInput, BuilderSkipInput, BuilderTree
49from .scanner import DirectoryScanner
class RepoWalker:
    """An object that recursively walks a Gen2 data repository tree, extracting
    Gen3 `FileDataset` objects and warning about unrecognized or unconvertable
    Gen2 datasets.

    Parameters
    ----------
    inputs : `~collections.abc.Iterable` of `Target` or `Skip`
        Structs that indicate dataset types to be extracted (`Target`) or
        explicitly skipped (`Skip`).  Skips may include a warning message to
        log when matching entries are encountered.
    fileIgnoreRegEx : `re.Pattern`, optional
        A regular expression pattern that identifies non-dataset files that
        can be ignored, to be applied at all levels of the directory tree.
    dirIgnoreRegEx : `re.Pattern`, optional
        A regular expression pattern that identifies non-dataset subdirectories
        that can be ignored, to be applied at all levels of the directory tree.
    log : `logging.Logger`, optional
        Logger for warnings and diagnostic information.
    progress : `Progress`, optional
        Object to use to report incremental progress.

    Raises
    ------
    ValueError
        Raised if two inputs declare the same template key with different
        types.
    """
    def __init__(self, inputs: Iterable[Union[Target, Skip]], *,
                 fileIgnoreRegEx: Optional[re.Pattern] = None,
                 dirIgnoreRegEx: Optional[re.Pattern] = None,
                 log: Optional[logging.Logger] = None,
                 progress: Optional[Progress] = None):
        super().__init__()
        if log is None:
            log = logging.getLogger("obs.base.gen2to3.TranslatorFactory")
        self.log = log
        tree = BuilderTree(progress)
        # Accumulate the union of all template keys, checking that no key is
        # declared with two different types by different inputs.
        allKeys: Dict[str, type] = {}
        for leaf in inputs:
            tree.insert(0, leaf)
            for key, dtype in leaf.keys.items():
                if allKeys.setdefault(key, dtype) != dtype:
                    raise ValueError(f"Multiple types for key '{key}': {dtype} "
                                     f"(from {leaf.template}) vs. {allKeys[key]}.")
        tree, messages, pruned = tree.prune()
        if not pruned:
            self._scanner = DirectoryScanner(log=self.log)
            tree.fill(self._scanner, allKeys, {}, fileIgnoreRegEx=fileIgnoreRegEx,
                      dirIgnoreRegEx=dirIgnoreRegEx)
        else:
            # Nothing to do; just remember this for later to avoid disturbing
            # higher-level code with the fact that walk() will be a no-op.
            self._scanner = None

    Target: ClassVar[type] = BuilderTargetInput
    """An input struct type whose instances represent a dataset type to be
    extracted (`type`).
    """

    Skip: ClassVar[type] = BuilderSkipInput
    """An input struct type whose instances represent a dataset type to be
    explicitly skipped.
    """

    def walk(self, root: str, *,
             predicate: Optional[Callable[[DataCoordinate], bool]] = None
             ) -> Mapping[DatasetType, Mapping[Optional[str], List[FileDataset]]]:
        """Walk a Gen2 repository root to extract Gen3 `FileDataset` instances
        from it.

        Parameters
        ----------
        root : `str`
            Absolute path to the repository root.
        predicate : `~collections.abc.Callable`, optional
            If not `None`, a callable that returns `True` if a `DataCoordinate`
            is consistent with what we want to extract.  If ``predicate``
            returns `False`, the file or directory that data ID was extracted
            from will not be processed, even if it includes target dataset
            types.

        Returns
        -------
        datasets : `defaultdict` [`DatasetType`, `defaultdict` ]
            Extracted datasets, grouped by Gen3 `DatasetType`.  Nested dict
            keys are "CALIBDATE" strings (for calibration datasets) or `None`
            (otherwise).  Nested dict values are lists of `FileDataset`.
        """
        # The parameter is documented as optional and the body handles None,
        # so give it an actual default instead of forcing callers to pass
        # predicate=None explicitly.
        if predicate is None:
            def predicate(dataId: DataCoordinate) -> bool:
                return True
        datasets = defaultdict(lambda: defaultdict(list))
        if self._scanner is not None:
            self._scanner.scan(root, datasets, predicate=predicate)
        return datasets