Coverage for python/lsst/obs/base/gen2to3/repoWalker/scanner.py : 37%

Hot-keys on this page
r m x p toggle line displays
j k next/prev highlighted chunk
0 (zero) top of page
1 (one) first highlighted chunk
1# This file is part of obs_base.
2#
3# Developed for the LSST Data Management System.
4# This product includes software developed by the LSST Project
5# (http://www.lsst.org).
6# See the COPYRIGHT file at the top-level directory of this distribution
7# for details of code ownership.
8#
9# This program is free software: you can redistribute it and/or modify
10# it under the terms of the GNU General Public License as published by
11# the Free Software Foundation, either version 3 of the License, or
12# (at your option) any later version.
13#
14# This program is distributed in the hope that it will be useful,
15# but WITHOUT ANY WARRANTY; without even the implied warranty of
16# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
17# GNU General Public License for more details.
18#
19# You should have received a copy of the GNU General Public License
20# along with this program. If not, see <http://www.gnu.org/licenses/>.
21"""Interfaces and common code for recursively scanning directories for Gen2
22dataset files.
24The `PathElementHandler` ABC is defined here instead of ``handlers.py`` for
25dependency reasons: `DirectoryScanner` uses the ABC, while its concrete
26implementations use `DirectoryScanner`.
27"""
28from __future__ import annotations
30__all__ = ["PathElementHandler", "DirectoryScanner"]
32from abc import ABC, abstractmethod
33import bisect
34import os
35from typing import (
36 Callable,
37 Iterator,
38 List,
39 Mapping,
40 Optional,
41)
43from lsst.log import Log
44from lsst.daf.butler import (
45 DataCoordinate,
46 DatasetType,
47 FileDataset,
48)
class PathElementHandler(ABC):
    """An interface for objects that handle a single path element (directory or
    file) in a Gen2 data repository.

    Handlers are added to a `DirectoryScanner` instance, which then calls them
    until one succeeds when it processes each element in a directory.
    """
    def __init__(self):
        self.lastDataId2 = {}

    __slots__ = ("lastDataId2", "log")

    @abstractmethod
    def isForFiles(self) -> bool:
        """Report what kind of path element this object handles.

        Returns
        -------
        isForFiles : `bool`
            `True` if this handler is for file entries, or `False` if it
            is for directories.
        """
        raise NotImplementedError()

    @abstractmethod
    def __call__(self, path: str, name: str, datasets: Mapping[DatasetType, List[FileDataset]], *,
                 predicate: Callable[[DataCoordinate], bool]) -> bool:
        """Apply the handler to a file path.

        Parameters
        ----------
        path : `str`
            Full path of the file or directory.
        name : `str`
            Local name of the file or directory within its parent directory.
        datasets : `dict` [`DatasetType`, `list` [`FileDataset`] ]
            Dictionary that found datasets should be added to.
        predicate : `~collections.abc.Callable`
            A callable taking a single `DataCoordinate` argument and returning
            `bool`, indicating whether that (Gen3) data ID represents one
            that should be included in the scan.

        Returns
        -------
        matched : `bool`
            `True` if this handler was a match for the given path and no other
            handlers need to be tried on it, `False` otherwise.
        """
        raise NotImplementedError()

    @property
    @abstractmethod
    def rank(self) -> int:
        """Return a rough indication of how flexible this handler is in terms
        of the path element names it can match.

        Handlers that match a constant path element should always return zero.
        """
        raise NotImplementedError()

    def translate(self, dataId2: dict, *, partial: bool = False) -> Optional[DataCoordinate]:
        """Translate the given data ID from Gen2 to Gen3.

        The default implementation returns `None`.  Subclasses that are able
        to translate data IDs should override this method.

        Parameters
        ----------
        dataId2 : `dict`
            Gen2 data ID.
        partial : `bool`, optional
            If `True` (`False` is default) this is a partial data ID for some
            dataset, and missing keys are expected.

        Returns
        -------
        dataId3 : `lsst.daf.butler.DataCoordinate` or `None`
            A Gen3 data ID, or `None` if this handler cannot translate data
            IDs.
        """
        return None

    def __lt__(self, other: PathElementHandler):
        """Handlers are sorted by rank to reduce the possibility that more
        flexible handlers will have a chance to match something they shouldn't.

        Comparison against anything that is not a `PathElementHandler`
        returns `NotImplemented`, so Python falls back to the other operand
        (and ultimately raises `TypeError`) instead of failing on a missing
        ``rank`` attribute.
        """
        if not isinstance(other, PathElementHandler):
            return NotImplemented
        return self.rank < other.rank

    lastDataId2: dict
    """The Gen2 data ID obtained by processing parent levels in the directory
    tree.

    This attribute should be reset by calling code whenever a new parent
    directory is entered, before invoking `__call__`.
    """

    log: Log
    """A logger to use for all diagnostic messages (`lsst.log.Log`).

    This attribute is set on a handler in `DirectoryScanner.add`; this avoids
    needing to forward one through all subclass constructors.
    """
class DirectoryScanner:
    """An object that uses `PathElementHandler` instances to process the files
    and subdirectories in a directory tree.

    Handlers for files and handlers for directories are kept in two separate
    lists, each maintained in sorted order (as defined by the handlers' own
    ``<`` comparison) so that they are tried in that order during a scan.

    Parameters
    ----------
    log : `Log`, optional
        Log to use to report warnings and debug information.  If `None`, the
        ``"obs.base.gen2to3.walker"`` logger is used.
    """
    def __init__(self, log: Optional[Log] = None):
        self._files = []
        self._subdirectories = []
        if log is None:
            log = Log.getLogger("obs.base.gen2to3.walker")
        self.log = log

    __slots__ = ("_files", "_subdirectories", "log")

    def add(self, handler: PathElementHandler):
        """Add a new handler to the scanner.

        The scanner's log is attached to the handler as its ``log``
        attribute before the handler is inserted.

        Parameters
        ----------
        handler : `PathElementHandler`
            The handler to be added.
        """
        handler.log = self.log
        # Route the handler to the list matching the kind of entry it handles;
        # insort keeps that list ordered so scan() tries handlers in order.
        target = self._files if handler.isForFiles() else self._subdirectories
        bisect.insort(target, handler)

    def __iter__(self) -> Iterator[PathElementHandler]:
        """Iterate over all handlers, file handlers first and then
        subdirectory handlers.
        """
        yield from self._files
        yield from self._subdirectories

    def scan(self, path: str, datasets: Mapping[DatasetType, List[FileDataset]], *,
             predicate: Callable[[DataCoordinate], bool]):
        """Process a directory.

        Each entry in the directory is offered to the handlers for its kind
        (file or directory) in order until one reports a match.  Entries that
        no handler matches are collected and reported in a single warning.
        Entries that are neither files nor directories are ignored without
        a warning.

        Parameters
        ----------
        path : `str`
            Full path to the directory to be processed.
        datasets : `dict` [`DatasetType`, `list` [`FileDataset`] ]
            Dictionary that found datasets should be added to.
        predicate : `~collections.abc.Callable`
            A callable taking a single `DataCoordinate` argument and returning
            `bool`, indicating whether that (Gen3) data ID represents one
            that should be included in the scan.
        """
        unrecognized = []
        # Use the scandir iterator as a context manager so the underlying
        # directory handle is released promptly even if a handler raises,
        # rather than lingering until garbage collection.
        with os.scandir(path) as entries:
            for entry in entries:
                if entry.is_file():
                    handlers = self._files
                elif entry.is_dir():
                    handlers = self._subdirectories
                else:
                    continue
                # any() short-circuits, so handlers after the first match are
                # never invoked for this entry.
                if not any(handler(entry.path, entry.name, datasets, predicate=predicate)
                           for handler in handlers):
                    unrecognized.append(entry.name)
        if unrecognized:
            self.log.warn("Skipped unrecognized entries in %s: %s", path, unrecognized)