Coverage for python/lsst/obs/base/gen2to3/repoWalker/scanner.py: 34%
72 statements
« prev ^ index » next coverage.py v6.4.1, created at 2022-06-05 02:40 -0700
« prev ^ index » next coverage.py v6.4.1, created at 2022-06-05 02:40 -0700
1# This file is part of obs_base.
2#
3# Developed for the LSST Data Management System.
4# This product includes software developed by the LSST Project
5# (http://www.lsst.org).
6# See the COPYRIGHT file at the top-level directory of this distribution
7# for details of code ownership.
8#
9# This program is free software: you can redistribute it and/or modify
10# it under the terms of the GNU General Public License as published by
11# the Free Software Foundation, either version 3 of the License, or
12# (at your option) any later version.
13#
14# This program is distributed in the hope that it will be useful,
15# but WITHOUT ANY WARRANTY; without even the implied warranty of
16# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
17# GNU General Public License for more details.
18#
19# You should have received a copy of the GNU General Public License
20# along with this program. If not, see <http://www.gnu.org/licenses/>.
21"""Interfaces and common code for recursively scanning directories for Gen2
22dataset files.
24The `PathElementHandler` ABC is defined here instead of ``handlers.py`` for
25dependency reasons: `DirectoryScanner` uses the ABC, while its concrete
26 implementations use `DirectoryScanner`.
27"""
28from __future__ import annotations
30__all__ = ["PathElementHandler", "DirectoryScanner"]
32import bisect
33import logging
34import os
35from abc import ABC, abstractmethod
36from typing import Callable, Iterator, List, Mapping, Optional, Tuple
38from lsst.daf.butler import DataCoordinate, DatasetType, FileDataset, Progress
class PathElementHandler(ABC):
    """An interface for objects that handle a single path element (directory or
    file) in a Gen2 data repository.

    Handlers are added to a `DirectoryScanner` instance, which then calls them
    until one succeeds when it processes each element in a directory.
    """

    def __init__(self):
        # Start with an empty Gen2 data ID; calling code resets this whenever
        # a new parent directory is entered (see `lastDataId2` below).
        self.lastDataId2 = {}

    __slots__ = ("lastDataId2", "log")

    @abstractmethod
    def isForFiles(self) -> bool:
        """Report what kind of path element this object handles.

        Returns
        -------
        Return `True` if this handler is for file entries, or `False` if it
        is for directories.
        """
        raise NotImplementedError()

    @abstractmethod
    def __call__(
        self,
        path: str,
        name: str,
        datasets: Mapping[DatasetType, Mapping[Optional[str], List[FileDataset]]],
        *,
        predicate: Callable[[DataCoordinate], bool],
    ) -> bool:
        """Apply the handler to a file path.

        Parameters
        ----------
        path : `str`
            Full path of the file or directory.
        name : `str`
            Local name of the file or directory within its parent directory.
        datasets : `dict` [`DatasetType`, `dict` ]
            Dictionary that found datasets should be added to.  Nested dicts
            are keyed by either `None` (for most datasets) or a `str`
            "CALIBDATE" for calibration datasets.
        predicate : `~collections.abc.Callable`
            A callable taking a single `DataCoordinate` argument and returning
            `bool`, indicating whether that (Gen3) data ID represents one
            that should be included in the scan.

        Returns
        -------
        matched : `bool`
            `True` if this handler was a match for the given path and no other
            handlers need to be tried on it, `False` otherwise.
        """
        raise NotImplementedError()

    @property
    @abstractmethod
    def rank(self) -> int:
        """Return a rough indication of how flexible this handler is in terms
        of the path element names it can match.

        Handlers that match a constant path element should always return zero.
        """
        raise NotImplementedError()

    def translate(
        self, dataId2: dict, *, partial: bool = False
    ) -> Tuple[Optional[DataCoordinate], Optional[str]]:
        """Translate the given data ID from Gen2 to Gen3.

        The default implementation returns `None`.  Subclasses that are able
        to translate data IDs should override this method.

        Parameters
        ----------
        dataId2 : `dict`
            Gen2 data ID.
        partial : `bool`, optional
            If `True` (`False` is default) this is a partial data ID for some
            dataset, and missing keys are expected.

        Returns
        -------
        dataId3 : `lsst.daf.butler.DataCoordinate` or `None`
            A Gen3 data ID, or `None` if this handler cannot translate data
            IDs.
        calibDate : `str` or `None`
            A Gen2 calibration "CALIBDATE" value, or `None` if there was no
            such value in the template.
        """
        return None, None

    def __lt__(self, other: PathElementHandler):
        """Handlers are sorted by rank to reduce the possibility that more
        flexible handlers will have a chance to match something they shouldn't.
        """
        return self.rank < other.rank

    lastDataId2: dict
    """The Gen2 data ID obtained by processing parent levels in the directory
    tree.

    This attribute should be reset by calling code whenever a new parent
    directory is entered, before invoking `__call__`.
    """

    log: logging.Logger
    """A logger to use for all diagnostic messages (`logging.Logger`).

    This attribute is set on a handler in `DirectoryScanner.add`; this avoids
    needing to forward one through all subclass constructors.
    """
class DirectoryScanner:
    """An object that uses `PathElementHandler` instances to process the files
    and subdirectories in a directory tree.

    Parameters
    ----------
    log : `logging.Logger`, optional
        Log to use to report warnings and debug information.
    progress : `Progress`, optional
        Object to use to report incremental progress.
    """

    def __init__(self, log: Optional[logging.Logger] = None, progress: Optional[Progress] = None):
        self._files = []
        self._subdirectories = []
        if log is None:
            log = logging.getLogger("lsst.obs.base.gen2to3.repoWalker")
        self.log = log
        self.progress = progress

    __slots__ = ("_files", "_subdirectories", "log", "progress")

    def add(self, handler: PathElementHandler):
        """Add a new handler to the scanner.

        Parameters
        ----------
        handler : `PathElementHandler`
            The handler to be added.
        """
        handler.log = self.log
        # Keep handlers sorted by rank so less-flexible handlers are tried
        # first (see `PathElementHandler.__lt__`).
        if handler.isForFiles():
            bisect.insort(self._files, handler)
        else:
            bisect.insort(self._subdirectories, handler)

    def __iter__(self) -> Iterator[PathElementHandler]:
        """Iterate over all handlers."""
        yield from self._files
        yield from self._subdirectories

    def scan(
        self,
        path: str,
        datasets: Mapping[DatasetType, Mapping[Optional[str], List[FileDataset]]],
        *,
        predicate: Callable[[DataCoordinate], bool],
    ):
        """Process a directory.

        Parameters
        ----------
        path : `str`
            Full path to the directory to be processed.
        datasets : `dict` [`DatasetType`, `list` ]
            Dictionary that found datasets should be added to.  Nested lists
            elements are tuples of `FileDataset` and an optional "CALIBDATE"
            `str` value (for calibration datasets only).
        predicate : `~collections.abc.Callable`
            A callable taking a single `DataCoordinate` argument and returning
            `bool`, indicating whether that (Gen3) data ID represents one
            that should be included in the scan.
        """
        unrecognized = []
        recognized = []
        with os.scandir(path) as iterator:
            for entry in iterator:
                if entry.is_file():
                    handlers = self._files
                elif entry.is_dir():
                    handlers = self._subdirectories
                else:
                    # Skip symlinks to nonexistent targets, sockets, etc.
                    continue
                if self.progress is None:
                    # No progress reporting; look for a matching handler
                    # with an immediate depth-first search.
                    for handler in handlers:
                        if handler(entry.path, entry.name, datasets, predicate=predicate):
                            break
                    else:
                        unrecognized.append(entry.name)
                else:
                    # Caller wants progress reporting, but we won't know how
                    # many entries we'll have until we're done scanning.  So we
                    # save them in a list and process them together later
                    # (essentially breadth-first search at this level).
                    recognized.append((entry.path, entry.name, handlers))
        if self.progress is not None:
            # Loop through the previously-saved entries and process them.
            # This runs after the scandir context exits so the directory
            # handle is released during (potentially slow) handling.
            for filepath, filename, handlers in self.progress.wrap(recognized, desc=f"Scanning {path}"):
                for handler in handlers:
                    if handler(filepath, filename, datasets, predicate=predicate):
                        break
                else:
                    # Fixed: previously appended ``entry.name``, a stale
                    # variable left over from the scandir loop (and undefined
                    # for an empty directory).
                    unrecognized.append(filename)
        if unrecognized:
            self.log.warning("Skipped unrecognized entries in %s: %s", path, unrecognized)