Coverage for python/lsst/obs/base/gen2to3/repoWalker/scanner.py: 34%

Shortcuts on this page

r m x p   toggle line displays

j k   next/prev highlighted chunk

0   (zero) top of page

1   (one) first highlighted chunk

71 statements  

1# This file is part of obs_base. 

2# 

3# Developed for the LSST Data Management System. 

4# This product includes software developed by the LSST Project 

5# (http://www.lsst.org). 

6# See the COPYRIGHT file at the top-level directory of this distribution 

7# for details of code ownership. 

8# 

9# This program is free software: you can redistribute it and/or modify 

10# it under the terms of the GNU General Public License as published by 

11# the Free Software Foundation, either version 3 of the License, or 

12# (at your option) any later version. 

13# 

14# This program is distributed in the hope that it will be useful, 

15# but WITHOUT ANY WARRANTY; without even the implied warranty of 

16# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the 

17# GNU General Public License for more details. 

18# 

19# You should have received a copy of the GNU General Public License 

20# along with this program. If not, see <http://www.gnu.org/licenses/>. 

21"""Interfaces and common code for recursively scanning directories for Gen2 

22dataset files. 

23 

24The `PathElementHandler` ABC is defined here instead of ``handlers.py`` for 

25dependency reasons: `DirectoryScanner` uses the ABC, while its concrete 

26implementations use `DirectoryScanner`. 

27""" 

28from __future__ import annotations 

29 

30__all__ = ["PathElementHandler", "DirectoryScanner"] 

31 

32from abc import ABC, abstractmethod 

33import bisect 

34import logging 

35import os 

36from typing import ( 

37 Callable, 

38 Iterator, 

39 List, 

40 Mapping, 

41 Optional, 

42 Tuple, 

43) 

44 

45from lsst.daf.butler import ( 

46 DataCoordinate, 

47 DatasetType, 

48 FileDataset, 

49 Progress, 

50) 

51 

52 

class PathElementHandler(ABC):
    """Abstract base class for callables that try to recognize one path
    element (a file or a directory) of a Gen2 data repository.

    A `DirectoryScanner` holds a collection of handlers; for each element it
    finds in a directory it invokes them one after another and stops at the
    first one that reports a match.
    """

    __slots__ = ("lastDataId2", "log")

    def __init__(self):
        self.lastDataId2 = dict()

    @abstractmethod
    def isForFiles(self) -> bool:
        """Report which kind of path element this object handles.

        Returns
        -------
        isForFiles : `bool`
            `True` if this handler is for file entries, `False` if it is for
            directories.
        """
        raise NotImplementedError()

    @abstractmethod
    def __call__(self, path: str, name: str,
                 datasets: Mapping[DatasetType, Mapping[Optional[str], List[FileDataset]]], *,
                 predicate: Callable[[DataCoordinate], bool]) -> bool:
        """Try to handle a single file or directory.

        Parameters
        ----------
        path : `str`
            Full path of the file or directory.
        name : `str`
            Name of the file or directory relative to its parent directory.
        datasets : `dict` [`DatasetType`, `dict` ]
            Dictionary to which any datasets found should be added.  The
            nested dictionaries are keyed by `None` (for most datasets) or by
            a `str` "CALIBDATE" value (for calibration datasets).
        predicate : `~collections.abc.Callable`
            Callable that accepts one `DataCoordinate` and returns a `bool`
            telling whether that (Gen3) data ID should be included in the
            scan.

        Returns
        -------
        matched : `bool`
            `True` if this handler matched the given path, so no further
            handlers need to be tried on it; `False` otherwise.
        """
        raise NotImplementedError()

    @property
    @abstractmethod
    def rank(self) -> int:
        """Rough measure of how flexible this handler is about the path
        element names it can match (`int`).

        A handler that matches one constant path element should always
        return zero.
        """
        raise NotImplementedError()

    def translate(self, dataId2: dict, *, partial: bool = False
                  ) -> Tuple[Optional[DataCoordinate], Optional[str]]:
        """Convert a Gen2 data ID into a Gen3 data ID.

        This base implementation performs no translation and returns a pair
        of `None` values; subclasses that know how to translate data IDs
        should override it.

        Parameters
        ----------
        dataId2 : `dict`
            Gen2 data ID.
        partial : `bool`, optional
            If `True` (default is `False`), ``dataId2`` is only a partial
            data ID for some dataset and missing keys are expected.

        Returns
        -------
        dataId3 : `lsst.daf.butler.DataCoordinate` or `None`
            The Gen3 data ID, or `None` if this handler cannot translate
            data IDs.
        calibDate : `str` or `None`
            The Gen2 calibration "CALIBDATE" value, or `None` if the template
            had no such value.
        """
        return (None, None)

    def __lt__(self, other: PathElementHandler):
        """Order handlers by `rank`, so that less flexible handlers sort
        first and more flexible ones are less likely to get a chance to match
        something they should not.
        """
        return self.rank < other.rank

    lastDataId2: dict
    """Gen2 data ID accumulated while processing the parent levels of the
    directory tree (`dict`).

    Calling code should reset this attribute each time a new parent directory
    is entered, before it invokes `__call__`.
    """

    log: logging.Logger
    """Logger for all diagnostic messages (`logging.Logger`).

    `DirectoryScanner.add` sets this attribute on each handler, so it does
    not have to be forwarded through every subclass constructor.
    """

161 

162 

class DirectoryScanner:
    """An object that uses `PathElementHandler` instances to process the files
    and subdirectories in a directory tree.

    Parameters
    ----------
    log : `logging.Logger`, optional
        Log to use to report warnings and debug information.  If `None`, the
        ``"obs.base.gen2to3.walker"`` logger is used.
    progress : `Progress`, optional
        Object to use to report incremental progress.  If `None`, no progress
        is reported and entries are handled as soon as they are found.
    """

    __slots__ = ("_files", "_subdirectories", "log", "progress")

    def __init__(self, log: Optional[logging.Logger] = None, progress: Optional[Progress] = None):
        # Handlers are kept in two separate lists, each sorted by rank.
        self._files = []
        self._subdirectories = []
        if log is None:
            log = logging.getLogger("obs.base.gen2to3.walker")
        self.log = log
        self.progress = progress

    def add(self, handler: PathElementHandler):
        """Add a new handler to the scanner.

        The scanner's logger is attached to the handler, and the handler is
        inserted into the file or subdirectory list (as reported by its
        ``isForFiles`` method) at the position given by its sort order.

        Parameters
        ----------
        handler : `PathElementHandler`
            The handler to be added.
        """
        handler.log = self.log
        if handler.isForFiles():
            bisect.insort(self._files, handler)
        else:
            bisect.insort(self._subdirectories, handler)

    def __iter__(self) -> Iterator[PathElementHandler]:
        """Iterate over all handlers, file handlers first and then
        subdirectory handlers.
        """
        yield from self._files
        yield from self._subdirectories

    @staticmethod
    def _dispatch(handlers, path: str, name: str, datasets, predicate) -> bool:
        """Try ``handlers`` in order on one entry; return `True` as soon as
        one of them matches, or `False` if none does.
        """
        return any(handler(path, name, datasets, predicate=predicate) for handler in handlers)

    def scan(self, path: str, datasets: Mapping[DatasetType, Mapping[Optional[str], List[FileDataset]]], *,
             predicate: Callable[[DataCoordinate], bool]):
        """Process a directory.

        Entries that are neither regular files nor directories are ignored.
        Entries that no handler matches are collected and reported in a
        single warning on ``self.log``.

        Parameters
        ----------
        path : `str`
            Full path to the directory to be processed.
        datasets : `dict` [`DatasetType`, `dict` ]
            Dictionary that found datasets should be added to.  It is passed
            unchanged to every handler; nested dicts are keyed by either
            `None` (for most datasets) or a `str` "CALIBDATE" value (for
            calibration datasets).
        predicate : `~collections.abc.Callable`
            A callable taking a single `DataCoordinate` argument and returning
            `bool`, indicating whether that (Gen3) data ID represents one
            that should be included in the scan.
        """
        unrecognized = []
        deferred = []
        with os.scandir(path) as iterator:
            for entry in iterator:
                if entry.is_file():
                    handlers = self._files
                elif entry.is_dir():
                    handlers = self._subdirectories
                else:
                    continue
                if self.progress is None:
                    # No progress reporting; look for a matching handler
                    # right away (depth-first search).
                    if not self._dispatch(handlers, entry.path, entry.name, datasets, predicate):
                        unrecognized.append(entry.name)
                else:
                    # Caller wants progress reporting, but the number of
                    # entries is unknown until the scan finishes.  Save plain
                    # strings (not the `os.DirEntry`) and process them
                    # together afterwards (essentially breadth-first search
                    # at this level).
                    deferred.append((entry.path, entry.name, handlers))
        if self.progress is not None:
            # The directory handle is already closed here; only the saved
            # path and name strings are needed from now on.
            for filepath, filename, handlers in self.progress.wrap(deferred, desc=f"Scanning {path}"):
                if not self._dispatch(handlers, filepath, filename, datasets, predicate):
                    # Record the entry that actually failed to match, not the
                    # stale loop variable left over from the scandir loop.
                    unrecognized.append(filename)
        if unrecognized:
            self.log.warning("Skipped unrecognized entries in %s: %s", path, unrecognized)