Coverage for python/lsst/obs/base/gen2to3/repoWalker/walker.py: 30%
Shortcuts on this page
r m x p toggle line displays
j k next/prev highlighted chunk
0 (zero) top of page
1 (one) first highlighted chunk
Shortcuts on this page
r m x p toggle line displays
j k next/prev highlighted chunk
0 (zero) top of page
1 (one) first highlighted chunk
1# This file is part of obs_base.
2#
3# Developed for the LSST Data Management System.
4# This product includes software developed by the LSST Project
5# (http://www.lsst.org).
6# See the COPYRIGHT file at the top-level directory of this distribution
7# for details of code ownership.
8#
9# This program is free software: you can redistribute it and/or modify
10# it under the terms of the GNU General Public License as published by
11# the Free Software Foundation, either version 3 of the License, or
12# (at your option) any later version.
13#
14# This program is distributed in the hope that it will be useful,
15# but WITHOUT ANY WARRANTY; without even the implied warranty of
16# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
17# GNU General Public License for more details.
18#
19# You should have received a copy of the GNU General Public License
20# along with this program. If not, see <http://www.gnu.org/licenses/>.
21"""High-level interface to the Gen2 repository-walking functionality defined
22by this package.
23"""
24from __future__ import annotations
26__all__ = ["RepoWalker"]
28import logging
29import re
30from collections import defaultdict
31from typing import Callable, ClassVar, Dict, Iterable, List, Mapping, Optional, Union
33from lsst.daf.butler import DataCoordinate, DatasetType, FileDataset, Progress
35from .builders import BuilderSkipInput, BuilderTargetInput, BuilderTree
36from .scanner import DirectoryScanner
39class RepoWalker:
40 """An object that recursively walks a Gen2 data repository tree, extracting
41 Gen3 `FileDataset` objects and warning about unrecognized or unconvertable
42 Gen2 datasets.
44 Parameters
45 ----------
46 inputs : `~collections.abc.Iterable` of `Target` or `Skip`
47 Structs that indicate dataset types to be extracted (`Target`) or
48 explicitly skipped (`Skip`). Skips may include a warning message to
49 log when matching entries are encountered.
50 fileIgnoreRegEx : `re.Pattern`, optional
51 A regular expression pattern that identifies non-dataset files that
52 can be ignored, to be applied at all levels of the directory tree.
53 dirIgnoreRegEx : `re.Pattern`, optional
54 A regular expression pattern that identifies non-dataset subdirectories
55 that can be ignored, to be applied at all levels of the directory tree.
56 log : `Log`, optional
57 Logger for warnings and diagnostic information.
58 progress : `Progress`, optional
59 Object to use to report incremental progress.
60 """
62 def __init__(
63 self,
64 inputs: Iterable[Union[Target, Skip]],
65 *,
66 fileIgnoreRegEx: Optional[re.Pattern] = None,
67 dirIgnoreRegEx: Optional[re.Pattern] = None,
68 log: Optional[logging.Logger] = None,
69 progress: Optional[Progress] = None,
70 ):
71 super().__init__()
72 if log is None:
73 log = logging.getLogger("lsst.obs.base.gen2to3.repoWalker")
74 self.log = log
75 tree = BuilderTree(progress)
76 allKeys: Dict[str, type] = {}
77 for leaf in inputs:
78 tree.insert(0, leaf)
79 for key, dtype in leaf.keys.items():
80 if allKeys.setdefault(key, dtype) != dtype:
81 raise ValueError(
82 f"Multiple types for key '{key}': {dtype} "
83 f"(from {leaf.template}) vs. {allKeys[key]}."
84 )
85 tree, messages, pruned = tree.prune()
86 if not pruned:
87 self._scanner = DirectoryScanner(log=self.log)
88 tree.fill(
89 self._scanner, allKeys, {}, fileIgnoreRegEx=fileIgnoreRegEx, dirIgnoreRegEx=dirIgnoreRegEx
90 )
91 else:
92 # Nothing to do; just remember this for later to avoid disturbing
93 # higher-level code with the fact that walk() will be a no-op.
94 self._scanner = None
96 Target: ClassVar[type] = BuilderTargetInput
97 """An input struct type whose instances represent a dataset type to be
98 extracted (`type`).
99 """
101 Skip: ClassVar[type] = BuilderSkipInput
102 """An input struct type whose instances represent a dataset type to be
103 explicitly skipped.
104 """
106 def walk(
107 self, root: str, *, predicate: Optional[Callable[[DataCoordinate], bool]]
108 ) -> Mapping[DatasetType, Mapping[Optional[str], List[FileDataset]]]:
109 """Walk a Gen2 repository root to extract Gen3 `FileDataset` instances
110 from it.
112 Parameters
113 ----------
114 root : `str`
115 Absolute path to the repository root.
116 predicate : `~collections.abc.Callable`, optional
117 If not `None`, a callable that returns `True` if a `DataCoordinate`
118 is consistent with what we want to extract. If ``predicate``
119 returns `False`, the file or directory that data ID was extracted
120 from will not be processed, even if it includes target dataset
121 types.
123 Returns
124 -------
125 datasets : `defaultdict` [`DatasetType`, `defaultdict` ]
126 Extracted datasets, grouped by Gen3 `DatasetType`. Nested dict
127 keys are "CALIBDATE" strings (for calibration datasets) or `None`
128 (otherwise). Nested dict values are lists of `FileDataset`.
129 """
130 if predicate is None:
132 def predicate(dataId: DataCoordinate) -> bool:
133 return True
135 datasets = defaultdict(lambda: defaultdict(list))
136 if self._scanner is not None:
137 self._scanner.scan(root, datasets, predicate=predicate)
138 return datasets