Hide keyboard shortcuts

Hot-keys on this page

r m x p   toggle line displays

j k   next/prev highlighted chunk

0   (zero) top of page

1   (one) first highlighted chunk

1# This file is part of obs_base. 

2# 

3# Developed for the LSST Data Management System. 

4# This product includes software developed by the LSST Project 

5# (http://www.lsst.org). 

6# See the COPYRIGHT file at the top-level directory of this distribution 

7# for details of code ownership. 

8# 

9# This program is free software: you can redistribute it and/or modify 

10# it under the terms of the GNU General Public License as published by 

11# the Free Software Foundation, either version 3 of the License, or 

12# (at your option) any later version. 

13# 

14# This program is distributed in the hope that it will be useful, 

15# but WITHOUT ANY WARRANTY; without even the implied warranty of 

16# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the 

17# GNU General Public License for more details. 

18# 

19# You should have received a copy of the GNU General Public License 

20# along with this program. If not, see <http://www.gnu.org/licenses/>. 

21"""Interfaces and common code for recursively scanning directories for Gen2 

22dataset files. 

23 

24The `PathElementHandler` ABC is defined here instead of ``handlers.py`` for 

25dependency reasons: `DirectoryScanner` uses the ABC, while its concrete 

26implementations use `DirectoryScanner`. 

27""" 

28from __future__ import annotations 

29 

30__all__ = ["PathElementHandler", "DirectoryScanner"] 

31 

32from abc import ABC, abstractmethod 

33import bisect 

34import os 

35from typing import ( 

36 Callable, 

37 Iterator, 

38 List, 

39 Mapping, 

40 Optional, 

41) 

42 

43from lsst.log import Log 

44from lsst.daf.butler import ( 

45 DataCoordinate, 

46 DatasetType, 

47 FileDataset, 

48) 

49 

50 

class PathElementHandler(ABC):
    """Interface for objects that process one path element (a file or a
    directory) of a Gen2 data repository.

    Handlers are registered with a `DirectoryScanner` instance, which invokes
    them one after another on each entry of a directory until one of them
    reports a match.
    """

    __slots__ = ("lastDataId2",)

    lastDataId2: dict
    """The Gen2 data ID obtained by processing parent levels in the directory
    tree.

    Calling code should reset this attribute whenever a new parent directory
    is entered, before invoking `__call__`.
    """

    def __init__(self):
        # Every handler starts out with an empty Gen2 data ID.
        self.lastDataId2 = dict()

    @abstractmethod
    def isForFiles(self) -> bool:
        """Report what kind of path element this object handles.

        Returns
        -------
        isForFiles : `bool`
            `True` if this handler is for file entries, `False` if it is for
            directories.
        """
        raise NotImplementedError()

    @abstractmethod
    def __call__(self, path: str, name: str, datasets: Mapping[DatasetType, List[FileDataset]], *,
                 log: Log, predicate: Callable[[DataCoordinate], bool]) -> bool:
        """Apply the handler to a single file or directory.

        Parameters
        ----------
        path : `str`
            Full path of the file or directory.
        name : `str`
            Local name of the file or directory within its parent directory.
        datasets : `dict` [`DatasetType`, `list` [`FileDataset`] ]
            Dictionary that found datasets should be added to.
        log : `Log`
            Log to use to report warnings and debug information.
        predicate : `~collections.abc.Callable`
            A callable taking a single `DataCoordinate` argument and returning
            `bool`, indicating whether that (Gen3) data ID represents one
            that should be included in the scan.

        Returns
        -------
        matched : `bool`
            `True` if this handler was a match for the given path and no other
            handlers need to be tried on it, `False` otherwise.
        """
        raise NotImplementedError()

    @property
    @abstractmethod
    def rank(self) -> int:
        """Return a rough indication of how flexible this handler is in terms
        of the path element names it can match.

        Handlers that match a constant path element should always return zero.
        """
        raise NotImplementedError()

    def translate(self, dataId2: dict, *, partial: bool = False, log: Log) -> Optional[DataCoordinate]:
        """Translate the given data ID from Gen2 to Gen3.

        This base implementation always returns `None`; subclasses that are
        able to translate data IDs should override it.

        Parameters
        ----------
        dataId2 : `dict`
            Gen2 data ID.
        partial : `bool`, optional
            If `True` (`False` is default) this is a partial data ID for some
            dataset, and missing keys are expected.
        log : `Log`
            Log to use to report warnings and debug information.

        Returns
        -------
        dataId3 : `lsst.daf.butler.DataCoordinate` or `None`
            A Gen3 data ID, or `None` if this handler cannot translate data
            IDs.
        """
        return None

    def __lt__(self, other: PathElementHandler):
        """Order handlers by ascending `rank`.

        Sorting by rank reduces the possibility that more flexible handlers
        will have a chance to match something they shouldn't.
        """
        ownRank, otherRank = self.rank, other.rank
        return ownRank < otherRank

149 

150 

class DirectoryScanner:
    """An object that uses `PathElementHandler` instances to process the files
    and subdirectories in a directory tree.

    Handlers for files and handlers for directories are kept in two separate
    lists.  Each list is kept sorted using the handlers' own ``<`` ordering,
    and handlers are tried in that order.
    """

    __slots__ = ("_files", "_subdirectories")

    def __init__(self):
        self._files: List[PathElementHandler] = []
        self._subdirectories: List[PathElementHandler] = []

    def add(self, handler: PathElementHandler) -> None:
        """Add a new handler to the scanner.

        Parameters
        ----------
        handler : `PathElementHandler`
            The handler to be added.  It is inserted into the file or the
            directory handler list, depending on ``handler.isForFiles()``,
            at the position that keeps that list sorted.
        """
        if handler.isForFiles():
            bisect.insort(self._files, handler)
        else:
            bisect.insort(self._subdirectories, handler)

    def __iter__(self) -> Iterator[PathElementHandler]:
        """Iterate over all handlers: file handlers first, then directory
        handlers.
        """
        yield from self._files
        yield from self._subdirectories

    def scan(self, path: str, datasets: Mapping[DatasetType, List[FileDataset]], *,
             log: Log, predicate: Callable[[DataCoordinate], bool]) -> None:
        """Process a directory.

        Each entry of the directory is offered to the handlers of the
        matching kind (file or directory) in order, stopping at the first
        handler that returns a true value.  Entries that are neither files
        nor directories are ignored.  Names of entries that no handler
        accepted are reported in a single warning after the directory has
        been processed.

        Parameters
        ----------
        path : `str`
            Full path to the directory to be processed.
        datasets : `dict` [`DatasetType`, `list` [`FileDataset`] ]
            Dictionary that found datasets should be added to.
        log : `Log`
            Log to use to report warnings and debug information.
        predicate : `~collections.abc.Callable`
            A callable taking a single `DataCoordinate` argument and returning
            `bool`, indicating whether that (Gen3) data ID represents one
            that should be included in the scan.
        """
        unrecognized = []
        # Use the scandir iterator as a context manager so that the directory
        # handle is closed deterministically, even when a handler raises
        # before the iterator has been exhausted.
        with os.scandir(path) as entries:
            for entry in entries:
                if entry.is_file():
                    handlers = self._files
                elif entry.is_dir():
                    handlers = self._subdirectories
                else:
                    continue
                for handler in handlers:
                    if handler(entry.path, entry.name, datasets, log=log, predicate=predicate):
                        break
                else:
                    # No handler claimed this entry.
                    unrecognized.append(entry.name)
        if unrecognized:
            log.warn("Skipped unrecognized entries in %s: %s", path, unrecognized)