Coverage for python/lsst/obs/base/gen2to3/repoWalker/scanner.py: 31%

72 statements  

coverage.py v6.5.0, created at 2022-12-08 14:44 -0800

# This file is part of obs_base.
#
# Developed for the LSST Data Management System.
# This product includes software developed by the LSST Project
# (http://www.lsst.org).
# See the COPYRIGHT file at the top-level directory of this distribution
# for details of code ownership.
#
# This program is free software: you can redistribute it and/or modify
# it under the terms of the GNU General Public License as published by
# the Free Software Foundation, either version 3 of the License, or
# (at your option) any later version.
#
# This program is distributed in the hope that it will be useful,
# but WITHOUT ANY WARRANTY; without even the implied warranty of
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
# GNU General Public License for more details.
#
# You should have received a copy of the GNU General Public License
# along with this program. If not, see <http://www.gnu.org/licenses/>.
"""Interfaces and common code for recursively scanning directories for Gen2
dataset files.

The `PathElementHandler` ABC is defined here instead of in ``handlers.py`` for
dependency reasons: `DirectoryScanner` uses the ABC, while its concrete
implementations use `DirectoryScanner`.
"""
from __future__ import annotations

__all__ = ["PathElementHandler", "DirectoryScanner"]

import bisect
import logging
import os
from abc import ABC, abstractmethod
from typing import Callable, Iterator, List, Mapping, Optional, Tuple

from lsst.daf.butler import DataCoordinate, DatasetType, FileDataset, Progress


class PathElementHandler(ABC):
    """An interface for objects that handle a single path element (directory
    or file) in a Gen2 data repository.

    Handlers are added to a `DirectoryScanner` instance, which tries them in
    turn on each element of a directory until one succeeds.
    """

    def __init__(self):
        self.lastDataId2 = {}

    __slots__ = ("lastDataId2", "log")

    @abstractmethod
    def isForFiles(self) -> bool:
        """Report what kind of path element this object handles.

        Returns
        -------
        `True` if this handler is for file entries, or `False` if it is for
        directories.
        """
        raise NotImplementedError()

    @abstractmethod
    def __call__(
        self,
        path: str,
        name: str,
        datasets: Mapping[DatasetType, Mapping[Optional[str], List[FileDataset]]],
        *,
        predicate: Callable[[DataCoordinate], bool],
    ) -> bool:
        """Apply the handler to a file path.

        Parameters
        ----------
        path : `str`
            Full path of the file or directory.
        name : `str`
            Local name of the file or directory within its parent directory.
        datasets : `dict` [`DatasetType`, `dict`]
            Dictionary that found datasets should be added to. Nested dicts
            are keyed by either `None` (for most datasets) or a "CALIBDATE"
            `str` for calibration datasets.
        predicate : `~collections.abc.Callable`
            A callable taking a single `DataCoordinate` argument and returning
            `bool`, indicating whether that (Gen3) data ID represents one
            that should be included in the scan.

        Returns
        -------
        matched : `bool`
            `True` if this handler was a match for the given path and no other
            handlers need to be tried on it, `False` otherwise.
        """
        raise NotImplementedError()

    @property
    @abstractmethod
    def rank(self) -> int:
        """Return a rough indication of how flexible this handler is in terms
        of the path element names it can match.

        Handlers that match a constant path element should always return zero.
        """
        raise NotImplementedError()

    def translate(
        self, dataId2: dict, *, partial: bool = False
    ) -> Tuple[Optional[DataCoordinate], Optional[str]]:
        """Translate the given data ID from Gen2 to Gen3.

        The default implementation returns `None`. Subclasses that are able
        to translate data IDs should override this method.

        Parameters
        ----------
        dataId2 : `dict`
            Gen2 data ID.
        partial : `bool`, optional
            If `True` (default is `False`), this is a partial data ID for some
            dataset, and missing keys are expected.

        Returns
        -------
        dataId3 : `lsst.daf.butler.DataCoordinate` or `None`
            A Gen3 data ID, or `None` if this handler cannot translate data
            IDs.
        calibDate : `str` or `None`
            A Gen2 calibration "CALIBDATE" value, or `None` if there was no
            such value in the template.
        """
        return None, None

    def __lt__(self, other: PathElementHandler):
        """Handlers are sorted by rank so that less flexible handlers are
        tried first, reducing the chance that a more flexible handler matches
        something it shouldn't.
        """
        return self.rank < other.rank

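    # For illustration (the handler classes named here are hypothetical): if
    # a constant-name handler has rank 0 and a template-matching handler has
    # rank 2, the constant-name handler compares as "less than" the template
    # one, so ``bisect.insort`` in `DirectoryScanner.add` keeps it ahead of
    # the more flexible handler and it is tried first during scanning.
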

    lastDataId2: dict
    """The Gen2 data ID obtained by processing parent levels in the directory
    tree.

    This attribute should be reset by calling code whenever a new parent
    directory is entered, before invoking `__call__`.
    """

    log: logging.Logger
    """A logger to use for all diagnostic messages (`logging.Logger`).

    This attribute is set on a handler in `DirectoryScanner.add`; this avoids
    needing to forward one through all subclass constructors.
    """

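# Illustrative sketch only (not part of the original module): a minimal
# concrete PathElementHandler that claims every file it is offered and
# records nothing. The class name and its behaviour are hypothetical; the
# real concrete handlers are defined in handlers.py.
class _ExampleIgnoreFileHandler(PathElementHandler):
    """Match any file entry and ignore it (illustration only)."""

    def isForFiles(self) -> bool:
        # This handler applies to files rather than directories.
        return True

    @property
    def rank(self) -> int:
        # Very flexible (matches any name), so use a high rank so that more
        # specific handlers are tried first.
        return 100

    def __call__(
        self,
        path: str,
        name: str,
        datasets: Mapping[DatasetType, Mapping[Optional[str], List[FileDataset]]],
        *,
        predicate: Callable[[DataCoordinate], bool],
    ) -> bool:
        # Report a match so no other handler is tried, but add nothing to
        # ``datasets``.
        return True
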

class DirectoryScanner:
    """An object that uses `PathElementHandler` instances to process the files
    and subdirectories in a directory tree.

    Parameters
    ----------
    log : `logging.Logger`, optional
        Log to use to report warnings and debug information.
    progress : `Progress`, optional
        Object to use to report incremental progress.
    """

    def __init__(self, log: Optional[logging.Logger] = None, progress: Optional[Progress] = None):
        self._files = []
        self._subdirectories = []
        if log is None:
            log = logging.getLogger("lsst.obs.base.gen2to3.repoWalker")
        self.log = log
        self.progress = progress

    __slots__ = ("_files", "_subdirectories", "log", "progress")

    def add(self, handler: PathElementHandler):
        """Add a new handler to the scanner.

        Parameters
        ----------
        handler : `PathElementHandler`
            The handler to be added.
        """
        handler.log = self.log
        if handler.isForFiles():
            bisect.insort(self._files, handler)
        else:
            bisect.insort(self._subdirectories, handler)

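    # Usage sketch for ``add`` (illustration only; the handler classes named
    # here are hypothetical stand-ins for concrete handlers built elsewhere
    # in the repoWalker package):
    #
    #     scanner = DirectoryScanner()
    #     scanner.add(TemplateFileHandler())       # rank > 0, tried later
    #     scanner.add(ConstantNameFileHandler())   # rank 0, tried first
    #     assert list(scanner)[0].rank == 0
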

    def __iter__(self) -> Iterator[PathElementHandler]:
        """Iterate over all handlers."""
        yield from self._files
        yield from self._subdirectories

    def scan(
        self,
        path: str,
        datasets: Mapping[DatasetType, Mapping[Optional[str], List[FileDataset]]],
        *,
        predicate: Callable[[DataCoordinate], bool],
    ):
        """Process a directory.

        Parameters
        ----------
        path : `str`
            Full path to the directory to be processed.
        datasets : `dict` [`DatasetType`, `dict`]
            Dictionary that found datasets should be added to. Nested dicts
            are keyed by either `None` (for most datasets) or a "CALIBDATE"
            `str` for calibration datasets.
        predicate : `~collections.abc.Callable`
            A callable taking a single `DataCoordinate` argument and returning
            `bool`, indicating whether that (Gen3) data ID represents one
            that should be included in the scan.
        """
        with os.scandir(path) as iterator:
            unrecognized = []
            recognized = []
            for entry in iterator:
                if entry.is_file():
                    handlers = self._files
                elif entry.is_dir():
                    handlers = self._subdirectories
                else:
                    continue
                if self.progress is None:
                    # No progress reporting; look for a matching handler
                    # with an immediate depth-first search.
                    for handler in handlers:
                        if handler(entry.path, entry.name, datasets, predicate=predicate):
                            break
                    else:
                        unrecognized.append(entry.name)
                else:
                    # Caller wants progress reporting, but we won't know how
                    # many entries we'll have until we're done scanning. So we
                    # save them in a list and process them all together later
                    # (essentially breadth-first search at this level).
                    recognized.append((entry.path, entry.name, handlers))
            if self.progress is not None:
                # Loop through the previously-recognized entries and process
                # them.
                for filepath, filename, handlers in self.progress.wrap(recognized, desc=f"Scanning {path}"):
                    for handler in handlers:
                        if handler(filepath, filename, datasets, predicate=predicate):
                            break
                    else:
                        unrecognized.append(filename)
            if unrecognized:
                self.log.warning("Skipped unrecognized entries in %s: %s", path, unrecognized)
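

# End-to-end usage sketch (illustration only; the repository path and the
# ``datasets`` mapping shown here are hypothetical, and the handler used is
# the `_ExampleIgnoreFileHandler` sketch above rather than a real handler
# from handlers.py).
if __name__ == "__main__":
    from collections import defaultdict

    scanner = DirectoryScanner(log=logging.getLogger("example"))
    scanner.add(_ExampleIgnoreFileHandler())

    # Found datasets accumulate into nested mappings keyed by DatasetType and
    # then by an optional "CALIBDATE" string, matching the annotation on
    # ``scan``.
    datasets: dict = defaultdict(lambda: defaultdict(list))

    # Accept every Gen3 data ID; a real predicate might filter on dimension
    # values instead.
    scanner.scan("/path/to/gen2/repo", datasets, predicate=lambda dataId: True)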