Hide keyboard shortcuts

Hot-keys on this page

r m x p   toggle line displays

j k   next/prev highlighted chunk

0   (zero) top of page

1   (one) first highlighted chunk

1# This file is part of obs_base. 

2# 

3# Developed for the LSST Data Management System. 

4# This product includes software developed by the LSST Project 

5# (http://www.lsst.org). 

6# See the COPYRIGHT file at the top-level directory of this distribution 

7# for details of code ownership. 

8# 

9# This program is free software: you can redistribute it and/or modify 

10# it under the terms of the GNU General Public License as published by 

11# the Free Software Foundation, either version 3 of the License, or 

12# (at your option) any later version. 

13# 

14# This program is distributed in the hope that it will be useful, 

15# but WITHOUT ANY WARRANTY; without even the implied warranty of 

16# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the 

17# GNU General Public License for more details. 

18# 

19# You should have received a copy of the GNU General Public License 

20# along with this program. If not, see <http://www.gnu.org/licenses/>. 

21"""Interfaces and common code for recursively scanning directories for Gen2 

22dataset files. 

23 

24The `PathElementHandler` ABC is defined here instead of ``handlers.py`` for 

25dependency reasons: `DirectoryScanner` uses the ABC, while its concrete 

26implementations use `DirectoryScanner`.

27""" 

28from __future__ import annotations 

29 

30__all__ = ["PathElementHandler", "DirectoryScanner"] 

31 

32from abc import ABC, abstractmethod 

33import bisect 

34import os 

35from typing import ( 

36 Callable, 

37 Iterator, 

38 List, 

39 Mapping, 

40 Optional, 

41 Tuple, 

42) 

43 

44from lsst.log import Log 

45from lsst.daf.butler import ( 

46 DataCoordinate, 

47 DatasetType, 

48 FileDataset, 

49) 

50 

51 

class PathElementHandler(ABC):
    """Abstract interface for objects that process one path element (a file
    or a directory) of a Gen2 data repository.

    Handlers are registered with a `DirectoryScanner`, which tries them one
    after another on each entry of a directory until one of them reports a
    match.
    """

    __slots__ = ("lastDataId2", "log")

    lastDataId2: dict
    """The Gen2 data ID accumulated while processing the parent levels of the
    directory tree.

    Calling code should reset this attribute each time a new parent
    directory is entered, before invoking `__call__`.
    """

    log: Log
    """Logger for all diagnostic messages (`lsst.log.Log`).

    `DirectoryScanner.add` assigns this attribute, so it does not have to be
    forwarded through every subclass constructor.
    """

    def __init__(self):
        # Start with an empty Gen2 data ID; ``log`` stays unset until the
        # handler is added to a scanner.
        self.lastDataId2 = {}

    @abstractmethod
    def isForFiles(self) -> bool:
        """Report which kind of path element this object handles.

        Returns
        -------
        isForFiles : `bool`
            `True` if this handler is for file entries, `False` if it is for
            directories.
        """
        raise NotImplementedError()

    @abstractmethod
    def __call__(self, path: str, name: str,
                 datasets: Mapping[DatasetType, Mapping[Optional[str], List[FileDataset]]], *,
                 predicate: Callable[[DataCoordinate], bool]) -> bool:
        """Apply the handler to a single file or directory.

        Parameters
        ----------
        path : `str`
            Full path of the file or directory.
        name : `str`
            Local name of the file or directory within its parent directory.
        datasets : `dict` [`DatasetType`, `dict` ]
            Dictionary that found datasets should be added to.  Nested dicts
            are keyed by either `None` (for most datasets) or a `str`
            "CALIBDATE" for calibration datasets.
        predicate : `~collections.abc.Callable`
            A callable taking a single `DataCoordinate` argument and
            returning `bool`, indicating whether that (Gen3) data ID
            represents one that should be included in the scan.

        Returns
        -------
        matched : `bool`
            `True` if this handler was a match for the given path and no
            other handlers need to be tried on it, `False` otherwise.
        """
        raise NotImplementedError()

    @property
    @abstractmethod
    def rank(self) -> int:
        """A rough measure of how flexible this handler is with respect to
        the path element names it can match (`int`).

        Handlers that match a constant path element should always return
        zero.
        """
        raise NotImplementedError()

    def translate(self, dataId2: dict, *, partial: bool = False
                  ) -> Tuple[Optional[DataCoordinate], Optional[str]]:
        """Translate the given data ID from Gen2 to Gen3.

        This base implementation performs no translation and returns a pair
        of `None` values.  Subclasses that are able to translate data IDs
        should override it.

        Parameters
        ----------
        dataId2 : `dict`
            Gen2 data ID.
        partial : `bool`, optional
            If `True` (`False` is default) this is a partial data ID for
            some dataset, and missing keys are expected.

        Returns
        -------
        dataId3 : `lsst.daf.butler.DataCoordinate` or `None`
            A Gen3 data ID, or `None` if this handler cannot translate data
            IDs.
        calibDate : `str` or `None`
            A Gen2 calibration "CALIBDATE" value, or `None` if there was no
            such value in the template.
        """
        return (None, None)

    def __lt__(self, other: PathElementHandler):
        """Order handlers by `rank`.

        Sorting by rank reduces the possibility that a more flexible handler
        gets the chance to match something it should not.
        """
        return self.rank < other.rank

160 

161 

class DirectoryScanner:
    """An object that uses `PathElementHandler` instances to process the files
    and subdirectories in a directory tree.

    Parameters
    ----------
    log : `Log`, optional
        Log to use to report warnings and debug information.  If not
        provided, the ``obs.base.gen2to3.walker`` logger is used.
    """

    __slots__ = ("_files", "_subdirectories", "log")

    def __init__(self, log: Optional[Log] = None):
        # Handlers are kept in two separate lists, one per kind of directory
        # entry; `add` keeps each list sorted.
        self._files = []
        self._subdirectories = []
        if log is None:
            log = Log.getLogger("obs.base.gen2to3.walker")
        self.log = log

    def add(self, handler: PathElementHandler):
        """Add a new handler to the scanner.

        The scanner's logger is attached to the handler, which is then
        inserted into the file or subdirectory handler list (according to
        ``handler.isForFiles()``) at the position that keeps the list
        sorted.

        Parameters
        ----------
        handler : `PathElementHandler`
            The handler to be added.
        """
        handler.log = self.log
        if handler.isForFiles():
            bisect.insort(self._files, handler)
        else:
            bisect.insort(self._subdirectories, handler)

    def __iter__(self) -> Iterator[PathElementHandler]:
        """Iterate over all handlers: file handlers first, then subdirectory
        handlers.
        """
        yield from self._files
        yield from self._subdirectories

    def scan(self, path: str, datasets: Mapping[DatasetType, Mapping[Optional[str], List[FileDataset]]], *,
             predicate: Callable[[DataCoordinate], bool]):
        """Process a directory.

        Each entry of the directory is offered to the handlers registered
        for its kind, in their sorted order, until one of them returns a
        true value.  Entries that are neither files nor directories are
        ignored.  Entries that no handler accepts are reported together in a
        single warning.  This method does not itself descend into
        subdirectories.

        Parameters
        ----------
        path : `str`
            Full path to the directory to be processed.
        datasets : `dict` [`DatasetType`, `dict` ]
            Dictionary that found datasets should be added to; it is passed
            unchanged to every handler.  Nested dicts are keyed by either
            `None` (for most datasets) or a `str` "CALIBDATE" for calibration
            datasets.
        predicate : `~collections.abc.Callable`
            A callable taking a single `DataCoordinate` argument and returning
            `bool`, indicating whether that (Gen3) data ID represents one
            that should be included in the scan.
        """
        unrecognized = []
        # The context manager closes the directory handle even when a
        # handler raises part-way through the iteration.
        with os.scandir(path) as entries:
            for entry in entries:
                if entry.is_file():
                    handlers = self._files
                elif entry.is_dir():
                    handlers = self._subdirectories
                else:
                    continue
                for handler in handlers:
                    if handler(entry.path, entry.name, datasets, predicate=predicate):
                        break
                else:
                    # No handler claimed this entry.
                    unrecognized.append(entry.name)
        if unrecognized:
            self.log.warn("Skipped unrecognized entries in %s: %s", path, unrecognized)