lsst.obs.base  19.0.0-51-gb87bce2+1
scanner.py
Go to the documentation of this file.
1 # This file is part of obs_base.
2 #
3 # Developed for the LSST Data Management System.
4 # This product includes software developed by the LSST Project
5 # (http://www.lsst.org).
6 # See the COPYRIGHT file at the top-level directory of this distribution
7 # for details of code ownership.
8 #
9 # This program is free software: you can redistribute it and/or modify
10 # it under the terms of the GNU General Public License as published by
11 # the Free Software Foundation, either version 3 of the License, or
12 # (at your option) any later version.
13 #
14 # This program is distributed in the hope that it will be useful,
15 # but WITHOUT ANY WARRANTY; without even the implied warranty of
16 # MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
17 # GNU General Public License for more details.
18 #
19 # You should have received a copy of the GNU General Public License
20 # along with this program. If not, see <http://www.gnu.org/licenses/>.
21 """Interfaces and common code for recursively scanning directories for Gen2
22 dataset files.
23 
24 The `PathElementHandler` ABC is defined here instead of ``handlers.py`` for
25 dependency reasons: `DirectoryScanner` uses the ABC, while its concrete
26 implementations use `DirectoryScanner`.
27 """
28 from __future__ import annotations
29 
30 __all__ = ["PathElementHandler", "DirectoryScanner"]
31 
32 from abc import ABC, abstractmethod
33 import bisect
34 import os
35 from typing import (
36  Callable,
37  Iterator,
38  List,
39  Mapping,
40  Optional,
41 )
42 
43 from lsst.log import Log
44 from lsst.daf.butler import (
45  DataCoordinate,
46  DatasetType,
47  FileDataset,
48 )
49 
50 
class PathElementHandler(ABC):
    """An interface for objects that handle a single path element (directory or
    file) in a Gen2 data repository.

    Handlers are added to a `DirectoryScanner` instance, which then calls them
    until one succeeds when it processes each element in a directory.
    """
    def __init__(self):
        self.lastDataId2 = {}

    __slots__ = ("lastDataId2",)

    @abstractmethod
    def isForFiles(self) -> bool:
        """Report what kind of path element this object handles.

        Returns
        -------
        isForFiles : `bool`
            `True` if this handler is for file entries, or `False` if it
            is for directories.
        """
        raise NotImplementedError()

    @abstractmethod
    def __call__(self, path: str, name: str, datasets: Mapping[DatasetType, List[FileDataset]], *,
                 log: Log, predicate: Callable[[DataCoordinate], bool]) -> bool:
        """Apply the handler to a file or directory path.

        Parameters
        ----------
        path : `str`
            Full path of the file or directory.
        name : `str`
            Local name of the file or directory within its parent directory.
        datasets : `dict` [`DatasetType`, `list` [`FileDataset`] ]
            Dictionary that found datasets should be added to.
        log : `Log`
            Log to use to report warnings and debug information.
            Keyword-only and required.
        predicate : `~collections.abc.Callable`
            A callable taking a single `DataCoordinate` argument and returning
            `bool`, indicating whether that (Gen3) data ID represents one
            that should be included in the scan.

        Returns
        -------
        matched : `bool`
            `True` if this handler was a match for the given path and no other
            handlers need to be tried on it, `False` otherwise.
        """
        raise NotImplementedError()

    @property
    @abstractmethod
    def rank(self) -> int:
        """Return a rough indication of how flexible this handler is in terms
        of the path element names it can match.

        Handlers that match a constant path element should always return zero.
        """
        raise NotImplementedError()

    def translate(self, dataId2: dict, *, partial: bool = False, log: Log) -> Optional[DataCoordinate]:
        """Translate the given data ID from Gen2 to Gen3.

        The default implementation returns `None`.  Subclasses that are able
        to translate data IDs should override this method.

        Parameters
        ----------
        dataId2 : `dict`
            Gen2 data ID.
        partial : `bool`, optional
            If `True` (`False` is default) this is a partial data ID for some
            dataset, and missing keys are expected.
        log : `Log`
            Log to use to report warnings and debug information.
            Keyword-only and required.

        Returns
        -------
        dataId3 : `lsst.daf.butler.DataCoordinate` or `None`
            A Gen3 data ID, or `None` if this handler cannot translate data
            IDs.
        """
        return None

    def __lt__(self, other: PathElementHandler) -> bool:
        """Order handlers by `rank`.

        Handlers are sorted by rank to reduce the possibility that more
        flexible handlers will have a chance to match something they
        shouldn't.

        Returns
        -------
        result : `bool` or `NotImplemented`
            `True` if this handler's rank is strictly lower than ``other``'s;
            `NotImplemented` if ``other`` is not a `PathElementHandler`.
        """
        # Follow the rich-comparison protocol: defer to the other operand
        # (and ultimately a TypeError) rather than failing with an
        # AttributeError on a missing ``rank``.
        if not isinstance(other, PathElementHandler):
            return NotImplemented
        return self.rank < other.rank

    lastDataId2: dict
    """The Gen2 data ID obtained by processing parent levels in the directory
    tree.

    This attribute should be reset by calling code whenever a new parent
    directory is entered, before invoking `__call__`.
    """
150 
class DirectoryScanner:
    """An object that uses `PathElementHandler` instances to process the files
    and subdirectories in a directory tree.

    Handlers for files and handlers for directories are kept in two separate
    lists, each kept sorted by the handlers' own ordering (``__lt__``).
    """
    def __init__(self):
        self._files = []
        self._subdirectories = []

    __slots__ = ("_files", "_subdirectories")

    def add(self, handler: PathElementHandler):
        """Add a new handler to the scanner.

        The handler is inserted into the file or subdirectory list (as
        reported by ``handler.isForFiles()``) at its sorted position.

        Parameters
        ----------
        handler : `PathElementHandler`
            The handler to be added.
        """
        if handler.isForFiles():
            bisect.insort(self._files, handler)
        else:
            bisect.insort(self._subdirectories, handler)

    def __iter__(self) -> Iterator[PathElementHandler]:
        """Iterate over all handlers: file handlers first, then subdirectory
        handlers, each group in sorted order.
        """
        yield from self._files
        yield from self._subdirectories

    def scan(self, path: str, datasets: Mapping[DatasetType, List[FileDataset]], *,
             log: Log, predicate: Callable[[DataCoordinate], bool]):
        """Process a directory.

        Each entry in the directory is offered to the matching group of
        handlers (file or subdirectory) in sorted order until one returns a
        true value.  Entries that no handler accepts are collected and
        reported in a single warning; entries that are neither files nor
        directories are ignored.

        Parameters
        ----------
        path : `str`
            Full path to the directory to be processed.
        datasets : `dict` [`DatasetType`, `list` [`FileDataset`] ]
            Dictionary that found datasets should be added to.
        log : `Log`
            Log to use to report warnings and debug information.
            Keyword-only and required.
        predicate : `~collections.abc.Callable`
            A callable taking a single `DataCoordinate` argument and returning
            `bool`, indicating whether that (Gen3) data ID represents one
            that should be included in the scan.
        """
        unrecognized = []
        # Use the scandir iterator as a context manager so the directory
        # handle is closed promptly even if a handler raises mid-scan.
        with os.scandir(path) as entries:
            for entry in entries:
                if entry.is_file():
                    handlers = self._files
                elif entry.is_dir():
                    handlers = self._subdirectories
                else:
                    continue
                for handler in handlers:
                    if handler(entry.path, entry.name, datasets, log=log, predicate=predicate):
                        break
                else:
                    # No handler claimed this entry.
                    unrecognized.append(entry.name)
        if unrecognized:
            log.warn("Skipped unrecognized entries in %s: %s", path, unrecognized)
lsst.obs.base.gen2to3.repoWalker.scanner.DirectoryScanner
Definition: scanner.py:151
lsst.obs.base.gen2to3.repoWalker.scanner.PathElementHandler
Definition: scanner.py:51
lsst.obs.base.gen2to3.repoWalker.scanner.DirectoryScanner.__init__
def __init__(self)
Definition: scanner.py:155
lsst.obs.base.gen2to3.repoWalker.scanner.PathElementHandler.lastDataId2
lastDataId2
Definition: scanner.py:59
lsst.obs.base.gen2to3.repoWalker.scanner.PathElementHandler.__init__
def __init__(self)
Definition: scanner.py:58
lsst.obs.base.gen2to3.repoWalker.scanner.PathElementHandler.translate
Optional[DataCoordinate] translate(self, dict dataId2, *bool partial=False, Log log)
Definition: scanner.py:112
lsst.obs.base.gen2to3.repoWalker.scanner.PathElementHandler.__lt__
def __lt__(self, PathElementHandler other)
Definition: scanner.py:136
lsst.obs.base.gen2to3.repoWalker.scanner.DirectoryScanner._files
_files
Definition: scanner.py:156
lsst.obs.base.gen2to3.repoWalker.scanner.DirectoryScanner.__iter__
Iterator[PathElementHandler] __iter__(self)
Definition: scanner.py:174
lsst.obs.base.gen2to3.repoWalker.scanner.PathElementHandler.__call__
bool __call__(self, str path, str name, Mapping[DatasetType, List[FileDataset]] datasets, *Log log, Callable[[DataCoordinate], bool] predicate)
Definition: scanner.py:75
lsst.obs.base.gen2to3.repoWalker.scanner.DirectoryScanner.scan
def scan(self, str path, Mapping[DatasetType, List[FileDataset]] datasets, *Log log, Callable[[DataCoordinate], bool] predicate)
Definition: scanner.py:180
lsst.obs.base.gen2to3.repoWalker.scanner.PathElementHandler.rank
int rank(self)
Definition: scanner.py:104
lsst.obs.base.gen2to3.repoWalker.scanner.PathElementHandler.isForFiles
bool isForFiles(self)
Definition: scanner.py:64
lsst.obs.base.gen2to3.repoWalker.scanner.DirectoryScanner.add
def add(self, PathElementHandler handler)
Definition: scanner.py:161
lsst.obs.base.gen2to3.repoWalker.scanner.DirectoryScanner._subdirectories
_subdirectories
Definition: scanner.py:157