Coverage for python/lsst/obs/base/gen2to3/repoWalker/scanner.py: 31%
71 statements
« prev ^ index » next coverage.py v7.2.1, created at 2023-03-12 01:53 -0800
« prev ^ index » next coverage.py v7.2.1, created at 2023-03-12 01:53 -0800
1# This file is part of obs_base.
2#
3# Developed for the LSST Data Management System.
4# This product includes software developed by the LSST Project
5# (http://www.lsst.org).
6# See the COPYRIGHT file at the top-level directory of this distribution
7# for details of code ownership.
8#
9# This program is free software: you can redistribute it and/or modify
10# it under the terms of the GNU General Public License as published by
11# the Free Software Foundation, either version 3 of the License, or
12# (at your option) any later version.
13#
14# This program is distributed in the hope that it will be useful,
15# but WITHOUT ANY WARRANTY; without even the implied warranty of
16# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
17# GNU General Public License for more details.
18#
19# You should have received a copy of the GNU General Public License
20# along with this program. If not, see <http://www.gnu.org/licenses/>.
21"""Interfaces and common code for recursively scanning directories for Gen2
22dataset files.
24The `PathElementHandler` ABC is defined here instead of ``handlers.py`` for
25dependency reasons: `DirectoryScanner` uses the ABC, while its concrete
26 implementations use `DirectoryScanner`.
27"""
28from __future__ import annotations
30__all__ = ["PathElementHandler", "DirectoryScanner"]
32from abc import ABC, abstractmethod
33import bisect
34import logging
35import os
36from typing import (
37 Callable,
38 Iterator,
39 List,
40 Mapping,
41 Optional,
42 Tuple,
43)
45from lsst.daf.butler import (
46 DataCoordinate,
47 DatasetType,
48 FileDataset,
49 Progress,
50)
class PathElementHandler(ABC):
    """An interface for objects that handle a single path element (directory or
    file) in a Gen2 data repository.

    Handlers are registered with a `DirectoryScanner` instance, which tries
    each of them in turn on every element it encounters in a directory until
    one reports a match.
    """

    __slots__ = ("lastDataId2", "log")

    # Gen2 data ID accumulated while processing parent levels of the
    # directory tree; calling code must reset it whenever a new parent
    # directory is entered, before invoking `__call__`.
    lastDataId2: dict

    # Logger for all diagnostic messages.  Assigned by
    # `DirectoryScanner.add`, which avoids threading a logger through every
    # subclass constructor.
    log: logging.Logger

    def __init__(self):
        self.lastDataId2 = {}

    @abstractmethod
    def isForFiles(self) -> bool:
        """Report which kind of path element this handler processes.

        Returns
        -------
        Return `True` if this handler is for file entries, or `False` if it
        is for directories.
        """
        raise NotImplementedError()

    @abstractmethod
    def __call__(self, path: str, name: str,
                 datasets: Mapping[DatasetType, Mapping[Optional[str], List[FileDataset]]], *,
                 predicate: Callable[[DataCoordinate], bool]) -> bool:
        """Apply the handler to a single path element.

        Parameters
        ----------
        path : `str`
            Full path of the file or directory.
        name : `str`
            Local name of the file or directory within its parent directory.
        datasets : `dict` [`DatasetType`, `dict` ]
            Dictionary that found datasets should be added to.  Nested dicts
            are keyed by either `None` (for most datasets) or a `str`
            "CALIBDATE" for calibration datasets.
        predicate : `~collections.abc.Callable`
            A callable taking a single `DataCoordinate` argument and returning
            `bool`, indicating whether that (Gen3) data ID represents one
            that should be included in the scan.

        Returns
        -------
        matched : `bool`
            `True` if this handler was a match for the given path and no other
            handlers need to be tried on it, `False` otherwise.
        """
        raise NotImplementedError()

    @property
    @abstractmethod
    def rank(self) -> int:
        """A rough indication of how flexible this handler is in terms of the
        path element names it can match (`int`).

        Handlers that match a constant path element should always return zero.
        """
        raise NotImplementedError()

    def translate(self, dataId2: dict, *, partial: bool = False
                  ) -> Tuple[Optional[DataCoordinate], Optional[str]]:
        """Translate the given data ID from Gen2 to Gen3.

        The default implementation returns ``(None, None)``.  Subclasses that
        are able to translate data IDs should override this method.

        Parameters
        ----------
        dataId2 : `dict`
            Gen2 data ID.
        partial : `bool`, optional
            If `True` (`False` is default) this is a partial data ID for some
            dataset, and missing keys are expected.

        Returns
        -------
        dataId3 : `lsst.daf.butler.DataCoordinate` or `None`
            A Gen3 data ID, or `None` if this handler cannot translate data
            IDs.
        calibDate : `str` or `None`
            A Gen2 calibration "CALIBDATE" value, or `None` if there was no
            such value in the template.
        """
        return (None, None)

    def __lt__(self, other: PathElementHandler):
        """Order handlers by rank so that less flexible handlers sort first,
        reducing the chance that a flexible handler matches something it
        should not.
        """
        return self.rank < other.rank
class DirectoryScanner:
    """An object that uses `PathElementHandler` instances to process the files
    and subdirectories in a directory tree.

    Parameters
    ----------
    log : `logging.Logger`, optional
        Log to use to report warnings and debug information.
    progress : `Progress`, optional
        Object to use to report incremental progress.
    """
    def __init__(self, log: Optional[logging.Logger] = None, progress: Optional[Progress] = None):
        self._files = []
        self._subdirectories = []
        if log is None:
            log = logging.getLogger("obs.base.gen2to3.walker")
        self.log = log
        self.progress = progress

    __slots__ = ("_files", "_subdirectories", "log", "progress")

    def add(self, handler: PathElementHandler):
        """Add a new handler to the scanner.

        Parameters
        ----------
        handler : `PathElementHandler`
            The handler to be added.
        """
        handler.log = self.log
        # Handler lists are kept sorted by rank (via PathElementHandler's
        # __lt__) so less flexible handlers are tried first.
        if handler.isForFiles():
            bisect.insort(self._files, handler)
        else:
            bisect.insort(self._subdirectories, handler)

    def __iter__(self) -> Iterator[PathElementHandler]:
        """Iterate over all handlers (file handlers first, then directory
        handlers).
        """
        yield from self._files
        yield from self._subdirectories

    def scan(self, path: str, datasets: Mapping[DatasetType, Mapping[Optional[str], List[FileDataset]]], *,
             predicate: Callable[[DataCoordinate], bool]):
        """Process a directory.

        Parameters
        ----------
        path : `str`
            Full path to the directory to be processed.
        datasets : `dict` [`DatasetType`, `dict` ]
            Dictionary that found datasets should be added to.  Nested dicts
            are keyed by either `None` (for most datasets) or a `str`
            "CALIBDATE" for calibration datasets.
        predicate : `~collections.abc.Callable`
            A callable taking a single `DataCoordinate` argument and returning
            `bool`, indicating whether that (Gen3) data ID represents one
            that should be included in the scan.
        """
        with os.scandir(path) as iterator:
            unrecognized = []
            deferred = []
            for entry in iterator:
                if entry.is_file():
                    handlers = self._files
                elif entry.is_dir():
                    handlers = self._subdirectories
                else:
                    # Neither a regular file nor a directory (e.g. a broken
                    # symlink); nothing we can do with it.
                    continue
                if self.progress is None:
                    # No progress reporting; look for a matching handler
                    # with an immediate depth-first search.
                    for handler in handlers:
                        if handler(entry.path, entry.name, datasets, predicate=predicate):
                            break
                    else:
                        unrecognized.append(entry.name)
                else:
                    # Caller wants progress reporting, but we won't know how
                    # many entries we'll have until we're done scanning.  So
                    # we save them in a list and process them together later
                    # (essentially breadth-first search at this level).
                    deferred.append((entry.path, entry.name, handlers))
            if self.progress is not None:
                # Loop through the previously-saved entries and process them.
                for filepath, filename, handlers in self.progress.wrap(deferred, desc=f"Scanning {path}"):
                    for handler in handlers:
                        if handler(filepath, filename, datasets, predicate=predicate):
                            break
                    else:
                        # Bug fix: this previously appended ``entry.name`` —
                        # a stale variable from the scandir loop — recording
                        # the wrong entry (and raising NameError for an empty
                        # directory).
                        unrecognized.append(filename)
            if unrecognized:
                self.log.warning("Skipped unrecognized entries in %s: %s", path, unrecognized)