Coverage for python/lsst/obs/base/gen2to3/repoWalker/walker.py : 32%

Hot-keys on this page
r m x p toggle line displays
j k next/prev highlighted chunk
0 (zero) top of page
1 (one) first highlighted chunk
1# This file is part of obs_base.
2#
3# Developed for the LSST Data Management System.
4# This product includes software developed by the LSST Project
5# (http://www.lsst.org).
6# See the COPYRIGHT file at the top-level directory of this distribution
7# for details of code ownership.
8#
9# This program is free software: you can redistribute it and/or modify
10# it under the terms of the GNU General Public License as published by
11# the Free Software Foundation, either version 3 of the License, or
12# (at your option) any later version.
13#
14# This program is distributed in the hope that it will be useful,
15# but WITHOUT ANY WARRANTY; without even the implied warranty of
16# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
17# GNU General Public License for more details.
18#
19# You should have received a copy of the GNU General Public License
20# along with this program. If not, see <http://www.gnu.org/licenses/>.
21"""High-level interface to the Gen2 repository-walking functionality defined
22by this package.
23"""
24from __future__ import annotations
26__all__ = ["RepoWalker"]
28from collections import defaultdict
29import re
30from typing import (
31 Callable,
32 ClassVar,
33 Dict,
34 Iterable,
35 List,
36 Mapping,
37 Optional,
38 Union,
39)
41from lsst.log import Log
42from lsst.daf.butler import (
43 DataCoordinate,
44 DatasetType,
45 FileDataset,
46)
47from .builders import BuilderTargetInput, BuilderSkipInput, BuilderTree
48from .scanner import DirectoryScanner
class RepoWalker:
    """An object that recursively walks a Gen2 data repository tree, extracting
    Gen3 `FileDataset` objects and warning about unrecognized or unconvertable
    Gen2 datasets.

    Parameters
    ----------
    inputs : `~collections.abc.Iterable` of `Target` or `Skip`
        Structs that indicate dataset types to be extracted (`Target`) or
        explicitly skipped (`Skip`).  Skips may include a warning message to
        log when matching entries are encountered.
    fileIgnoreRegEx : `re.Pattern`, optional
        A regular expression pattern that identifies non-dataset files that
        can be ignored, to be applied at all levels of the directory tree.
    dirIgnoreRegEx : `re.Pattern`, optional
        A regular expression pattern that identifies non-dataset subdirectories
        that can be ignored, to be applied at all levels of the directory tree.

    Raises
    ------
    ValueError
        Raised if two inputs declare the same key with different types.
    """

    def __init__(self, inputs: Iterable[Union[Target, Skip]], *,
                 fileIgnoreRegEx: Optional[re.Pattern] = None,
                 dirIgnoreRegEx: Optional[re.Pattern] = None):
        super().__init__()
        tree = BuilderTree()
        # Union of the keys declared by all inputs, used to check that a key
        # name is never associated with two different types.
        allKeys: Dict[str, type] = {}
        for leaf in inputs:
            tree.insert(0, leaf)
            for key, dtype in leaf.keys.items():
                # setdefault() records the first type seen for each key and
                # returns it on later encounters, so any mismatch shows up
                # as an inequality here.
                if allKeys.setdefault(key, dtype) != dtype:
                    raise ValueError(f"Multiple types for key '{key}': {dtype} "
                                     f"(from {leaf.template}) vs. {allKeys[key]}.")
        # The second element returned by prune() (messages) is not needed by
        # this constructor, so it is discarded.
        tree, _, pruned = tree.prune()
        if not pruned:
            self._scanner = DirectoryScanner()
            tree.fill(self._scanner, allKeys, {}, fileIgnoreRegEx=fileIgnoreRegEx,
                      dirIgnoreRegEx=dirIgnoreRegEx)
        else:
            # Nothing to do; just remember this for later to avoid disturbing
            # higher-level code with the fact that walk() will be a no-op.
            self._scanner = None

    Target: ClassVar[type] = BuilderTargetInput
    """An input struct type whose instances represent a dataset type to be
    extracted (`type`).
    """

    Skip: ClassVar[type] = BuilderSkipInput
    """An input struct type whose instances represent a dataset type to be
    explicitly skipped.
    """

    def walk(self, root: str, *, log: Log,
             predicate: Optional[Callable[[DataCoordinate], bool]] = None
             ) -> Mapping[DatasetType, List[FileDataset]]:
        """Walk a Gen2 repository root to extract Gen3 `FileDataset` instances
        from it.

        Parameters
        ----------
        root : `str`
            Absolute path to the repository root.
        log : `Log`
            Logger for warnings and diagnostic information.
        predicate : `~collections.abc.Callable`, optional
            If not `None`, a callable that returns `True` if a `DataCoordinate`
            is consistent with what we want to extract.  If ``predicate``
            returns `False`, the file or directory that data ID was extracted
            from will not be processed, even if it includes target dataset
            types.  If `None` (the default), every data ID is accepted.

        Returns
        -------
        datasets : `defaultdict` [`DatasetType`, `list`[`FileDataset`]]
            Extracted datasets, grouped by Gen3 `DatasetType`.  Empty if the
            constructor determined there was nothing to scan.
        """
        if predicate is None:
            # Substitute an accept-everything predicate so the scanner can
            # always call it unconditionally.
            def predicate(dataId: DataCoordinate) -> bool:
                return True
        datasets = defaultdict(list)
        # The scanner is None when the constructor found nothing to do, in
        # which case this method is a no-op that returns an empty mapping.
        if self._scanner is not None:
            self._scanner.scan(root, datasets, log=log, predicate=predicate)
        return datasets