Coverage for python/lsst/obs/base/gen2to3/repoWalker/scanner.py : 39%

Hot-keys on this page
r m x p toggle line displays
j k next/prev highlighted chunk
0 (zero) top of page
1 (one) first highlighted chunk
1# This file is part of obs_base.
2#
3# Developed for the LSST Data Management System.
4# This product includes software developed by the LSST Project
5# (http://www.lsst.org).
6# See the COPYRIGHT file at the top-level directory of this distribution
7# for details of code ownership.
8#
9# This program is free software: you can redistribute it and/or modify
10# it under the terms of the GNU General Public License as published by
11# the Free Software Foundation, either version 3 of the License, or
12# (at your option) any later version.
13#
14# This program is distributed in the hope that it will be useful,
15# but WITHOUT ANY WARRANTY; without even the implied warranty of
16# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
17# GNU General Public License for more details.
18#
19# You should have received a copy of the GNU General Public License
20# along with this program. If not, see <http://www.gnu.org/licenses/>.
21"""Interfaces and common code for recursively scanning directories for Gen2
22dataset files.
24The `PathElementHandler` ABC is defined here instead of ``handlers.py`` for
25dependency reasons: `DirectoryScanner` uses the ABC, while its concrete
26implementations use `DirectoryScanner`.
27"""
28from __future__ import annotations
30__all__ = ["PathElementHandler", "DirectoryScanner"]
32from abc import ABC, abstractmethod
33import bisect
34import os
35from typing import (
36 Callable,
37 Iterator,
38 List,
39 Mapping,
40 Optional,
41)
43from lsst.log import Log
44from lsst.daf.butler import (
45 DataCoordinate,
46 DatasetType,
47 FileDataset,
48)
class PathElementHandler(ABC):
    """An interface for objects that handle a single path element (directory or
    file) in a Gen2 data repository.

    Handlers are added to a `DirectoryScanner` instance, which then calls them
    until one succeeds when it processes each element in a directory.
    """
    def __init__(self):
        # Subclasses that define their own ``__init__`` must call this one so
        # that the slot below is always populated.
        self.lastDataId2 = {}

    __slots__ = ("lastDataId2",)

    @abstractmethod
    def isForFiles(self) -> bool:
        """Report what kind of path element this object handles.

        Returns
        -------
        isForFiles : `bool`
            `True` if this handler is for file entries, or `False` if it
            is for directories.
        """
        raise NotImplementedError()

    @abstractmethod
    def __call__(self, path: str, name: str, datasets: Mapping[DatasetType, List[FileDataset]], *,
                 log: Log, predicate: Callable[[DataCoordinate], bool]) -> bool:
        """Apply the handler to a file or directory path.

        Parameters
        ----------
        path : `str`
            Full path of the file or directory.
        name : `str`
            Local name of the file or directory within its parent directory.
        datasets : `dict` [`DatasetType`, `list` [`FileDataset`] ]
            Dictionary that found datasets should be added to.
        log : `Log`
            Log to use to report warnings and debug information.  This is a
            required keyword-only argument.
        predicate : `~collections.abc.Callable`
            A callable taking a single `DataCoordinate` argument and returning
            `bool`, indicating whether that (Gen3) data ID represents one
            that should be included in the scan.

        Returns
        -------
        matched : `bool`
            `True` if this handler was a match for the given path and no other
            handlers need to be tried on it, `False` otherwise.
        """
        raise NotImplementedError()

    @property
    @abstractmethod
    def rank(self) -> int:
        """Return a rough indication of how flexible this handler is in terms
        of the path element names it can match.

        Handlers that match a constant path element should always return zero.
        """
        raise NotImplementedError()

    def translate(self, dataId2: dict, *, partial: bool = False, log: Log) -> Optional[DataCoordinate]:
        """Translate the given data ID from Gen2 to Gen3.

        The default implementation returns `None`.  Subclasses that are able
        to translate data IDs should override this method.

        Parameters
        ----------
        dataId2 : `dict`
            Gen2 data ID.
        partial : `bool`, optional
            If `True` (`False` is default) this is a partial data ID for some
            dataset, and missing keys are expected.
        log : `Log`
            Log to use to report warnings and debug information.  This is a
            required keyword-only argument.

        Returns
        -------
        dataId3 : `lsst.daf.butler.DataCoordinate` or `None`
            A Gen3 data ID, or `None` if this handler cannot translate data
            IDs.
        """
        return None

    def __lt__(self, other: PathElementHandler) -> bool:
        """Handlers are sorted by rank to reduce the possibility that more
        flexible handlers will have a chance to match something they shouldn't.
        """
        return self.rank < other.rank

    lastDataId2: dict
    """The Gen2 data ID obtained by processing parent levels in the directory
    tree.

    This attribute should be reset by calling code whenever a new parent
    directory is entered, before invoking `__call__`.
    """
class DirectoryScanner:
    """An object that uses `PathElementHandler` instances to process the files
    and subdirectories in a directory tree.
    """
    def __init__(self):
        # Both lists are kept sorted (via ``bisect.insort`` in `add`) using
        # the handlers' own ordering.
        self._files = []
        self._subdirectories = []

    __slots__ = ("_files", "_subdirectories")

    def add(self, handler: PathElementHandler) -> None:
        """Add a new handler to the scanner.

        Parameters
        ----------
        handler : `PathElementHandler`
            The handler to be added.  It is inserted into the list of file
            handlers or the list of directory handlers, according to
            ``handler.isForFiles()``, at its sorted position.
        """
        if handler.isForFiles():
            bisect.insort(self._files, handler)
        else:
            bisect.insort(self._subdirectories, handler)

    def __iter__(self) -> Iterator[PathElementHandler]:
        """Iterate over all handlers: file handlers first, then directory
        handlers.
        """
        yield from self._files
        yield from self._subdirectories

    def scan(self, path: str, datasets: Mapping[DatasetType, List[FileDataset]], *,
             log: Log, predicate: Callable[[DataCoordinate], bool]) -> None:
        """Process a directory.

        Each entry is offered to the handlers of the matching kind, in order,
        until one returns a true value.  Entries that are neither files nor
        directories are ignored.  Names of entries that no handler accepts
        are reported in a single warning once the directory has been read.

        Parameters
        ----------
        path : `str`
            Full path to the directory to be processed.
        datasets : `dict` [`DatasetType`, `list` [`FileDataset`] ]
            Dictionary that found datasets should be added to.
        log : `Log`
            Log to use to report warnings and debug information.  This is a
            required keyword-only argument.
        predicate : `~collections.abc.Callable`
            A callable taking a single `DataCoordinate` argument and returning
            `bool`, indicating whether that (Gen3) data ID represents one
            that should be included in the scan.
        """
        unrecognized = []
        # Use the scandir iterator as a context manager so the underlying
        # directory handle is released promptly even if a handler raises,
        # rather than lingering until garbage collection.
        with os.scandir(path) as entries:
            for entry in entries:
                if entry.is_file():
                    handlers = self._files
                elif entry.is_dir():
                    handlers = self._subdirectories
                else:
                    continue
                for handler in handlers:
                    if handler(entry.path, entry.name, datasets, log=log, predicate=predicate):
                        break
                else:
                    # No handler claimed this entry.
                    unrecognized.append(entry.name)
        if unrecognized:
            log.warn("Skipped unrecognized entries in %s: %s", path, unrecognized)