Hide keyboard shortcuts

Hot-keys on this page

r m x p   toggle line displays

j k   next/prev highlighted chunk

0   (zero) top of page

1   (one) first highlighted chunk

1

2

3

4

5

6

7

8

9

10

11

12

13

14

15

16

17

18

19

20

21

22

23

24

25

26

27

28

29

30

31

32

33

34

35

36

37

38

39

40

41

42

43

44

45

46

47

48

49

50

51

52

53

54

55

56

57

58

59

60

61

62

63

64

65

66

67

68

69

70

71

72

73

74

75

76

77

78

79

80

81

82

83

84

85

86

87

88

89

90

91

92

93

94

95

96

97

98

99

100

101

102

103

104

105

106

107

108

109

110

111

112

113

114

115

116

117

118

119

120

121

122

123

124

125

126

127

128

129

130

131

132

133

134

135

136

137

138

139

140

141

142

143

144

145

146

147

148

149

150

151

152

153

154

155

156

157

158

159

160

161

162

163

164

165

166

167

168

169

170

171

172

173

174

175

176

177

178

179

180

181

182

183

184

185

186

187

188

189

190

191

192

193

194

195

196

197

198

199

200

201

202

203

204

205

206

207

208

209

210

211

# This file is part of obs_base. 

# 

# Developed for the LSST Data Management System. 

# This product includes software developed by the LSST Project 

# (http://www.lsst.org). 

# See the COPYRIGHT file at the top-level directory of this distribution 

# for details of code ownership. 

# 

# This program is free software: you can redistribute it and/or modify 

# it under the terms of the GNU General Public License as published by 

# the Free Software Foundation, either version 3 of the License, or 

# (at your option) any later version. 

# 

# This program is distributed in the hope that it will be useful, 

# but WITHOUT ANY WARRANTY; without even the implied warranty of 

# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the 

# GNU General Public License for more details. 

# 

# You should have received a copy of the GNU General Public License 

# along with this program. If not, see <http://www.gnu.org/licenses/>. 

"""Interfaces and common code for recursively scanning directories for Gen2 

dataset files. 

 

The `PathElementHandler` ABC is defined here instead of ``handlers.py`` for 

dependency reasons: `DirectoryScanner` uses the ABC, while its concrete 

implementations use `DirectoryScanner`.

""" 

from __future__ import annotations 

 

# Names exported by a star-import of this module.
__all__ = ["PathElementHandler", "DirectoryScanner"] 

 

from abc import ABC, abstractmethod 

import bisect 

import os 

from typing import ( 

Callable, 

Iterator, 

List, 

Mapping, 

Optional, 

) 

 

from lsst.log import Log 

from lsst.daf.butler import ( 

DataCoordinate, 

DatasetType, 

FileDataset, 

) 

 

 

class PathElementHandler(ABC):
    """Interface for objects that handle one path element (a file or a
    directory) in a Gen2 data repository.

    Handlers are added to a `DirectoryScanner` instance; for each element of
    a directory the scanner calls its handlers in turn until one of them
    reports a match.
    """

    __slots__ = ("lastDataId2",)

    lastDataId2: dict
    """The Gen2 data ID obtained by processing parent levels in the directory
    tree.

    Calling code should reset this attribute whenever a new parent directory
    is entered, before invoking `__call__`.
    """

    def __init__(self):
        # Every handler starts out with an empty Gen2 data ID.
        self.lastDataId2 = {}

    @abstractmethod
    def isForFiles(self) -> bool:
        """Report which kind of path element this object handles.

        Returns
        -------
        isForFiles : `bool`
            `True` if this handler is for file entries, `False` if it is for
            directories.
        """
        raise NotImplementedError()

    @abstractmethod
    def __call__(self, path: str, name: str, datasets: Mapping[DatasetType, List[FileDataset]], *,
                 log: Log, predicate: Callable[[DataCoordinate], bool]) -> bool:
        """Apply the handler to a single file or directory path.

        Parameters
        ----------
        path : `str`
            Full path of the file or directory.
        name : `str`
            Local name of the file or directory within its parent directory.
        datasets : `dict` [`DatasetType`, `list` [`FileDataset`] ]
            Dictionary that found datasets should be added to.
        log : `Log`
            Log to use to report warnings and debug information.
        predicate : `~collections.abc.Callable`
            A callable taking a single `DataCoordinate` argument and returning
            `bool`, indicating whether that (Gen3) data ID represents one
            that should be included in the scan.

        Returns
        -------
        matched : `bool`
            `True` if this handler was a match for the given path and no other
            handlers need to be tried on it, `False` otherwise.
        """
        raise NotImplementedError()

    @property
    @abstractmethod
    def rank(self) -> int:
        """A rough indication of how flexible this handler is in terms of the
        path element names it can match.

        Handlers that match a constant path element should always return zero.
        """
        raise NotImplementedError()

    def translate(self, dataId2: dict, *, partial: bool = False, log: Log) -> Optional[DataCoordinate]:
        """Translate the given data ID from Gen2 to Gen3.

        This base implementation always returns `None`; subclasses that are
        able to translate data IDs should override it.

        Parameters
        ----------
        dataId2 : `dict`
            Gen2 data ID.
        partial : `bool`, optional
            If `True` (`False` is default) this is a partial data ID for some
            dataset, and missing keys are expected.
        log : `Log`
            Log to use to report warnings and debug information.

        Returns
        -------
        dataId3 : `lsst.daf.butler.DataCoordinate` or `None`
            A Gen3 data ID, or `None` if this handler cannot translate data
            IDs.
        """
        return None

    def __lt__(self, other: PathElementHandler):
        """Order handlers by `rank`.

        Sorting by rank lets less flexible handlers be tried first, which
        reduces the chance that a more flexible handler matches something it
        should not.
        """
        ownRank = self.rank
        otherRank = other.rank
        return ownRank < otherRank

 

 

class DirectoryScanner:
    """An object that uses `PathElementHandler` instances to process the files
    and subdirectories in a directory tree.

    Handlers for files and handlers for directories are kept in two separate
    lists, each maintained in sorted order (as defined by the handlers' ``<``
    operator) as handlers are added.
    """

    __slots__ = ("_files", "_subdirectories")

    def __init__(self):
        self._files = []
        self._subdirectories = []

    def add(self, handler: PathElementHandler):
        """Add a new handler to the scanner.

        Parameters
        ----------
        handler : `PathElementHandler`
            The handler to be added.  It is inserted into the file or the
            subdirectory handler list according to ``handler.isForFiles()``,
            keeping that list sorted.
        """
        if handler.isForFiles():
            bisect.insort(self._files, handler)
        else:
            bisect.insort(self._subdirectories, handler)

    def __iter__(self) -> Iterator[PathElementHandler]:
        """Iterate over all handlers: file handlers first, then subdirectory
        handlers.
        """
        yield from self._files
        yield from self._subdirectories

    def scan(self, path: str, datasets: Mapping[DatasetType, List[FileDataset]], *,
             log: Log, predicate: Callable[[DataCoordinate], bool]):
        """Process a directory.

        Each file or subdirectory entry found directly in ``path`` is offered
        to the appropriate handlers in order until one returns a true value.
        Entries that are neither files nor directories are ignored, and
        entries that no handler matches are reported in a single warning.

        Parameters
        ----------
        path : `str`
            Full path to the directory to be processed.
        datasets : `dict` [`DatasetType`, `list` [`FileDataset`] ]
            Dictionary that found datasets should be added to.
        log : `Log`
            Log to use to report warnings and debug information.
        predicate : `~collections.abc.Callable`
            A callable taking a single `DataCoordinate` argument and returning
            `bool`, indicating whether that (Gen3) data ID represents one
            that should be included in the scan.
        """
        unrecognized = []
        # Use the scandir iterator as a context manager so the directory
        # handle is released promptly even when a handler raises part-way
        # through the scan, rather than lingering until garbage collection.
        with os.scandir(path) as entries:
            for entry in entries:
                if entry.is_file():
                    handlers = self._files
                elif entry.is_dir():
                    handlers = self._subdirectories
                else:
                    continue
                for handler in handlers:
                    if handler(entry.path, entry.name, datasets, log=log, predicate=predicate):
                        break
                else:
                    # No handler claimed this entry.
                    unrecognized.append(entry.name)
        if unrecognized:
            log.warn("Skipped unrecognized entries in %s: %s", path, unrecognized)