Hide keyboard shortcuts

Hot-keys on this page

r m x p   toggle line displays

j k   next/prev highlighted chunk

0   (zero) top of page

1   (one) first highlighted chunk

1

2

3

4

5

6

7

8

9

10

11

12

13

14

15

16

17

18

19

20

21

22

23

24

25

26

27

28

29

30

31

32

33

34

35

36

37

38

39

40

41

42

43

44

45

46

47

48

49

50

51

52

53

54

55

56

57

58

59

60

61

62

63

64

65

66

67

68

69

70

71

72

73

74

75

76

77

78

79

80

81

82

83

84

85

86

87

88

89

90

91

92

93

94

95

96

97

98

99

100

101

102

103

104

105

106

107

108

109

110

111

112

113

114

115

116

117

118

119

120

121

122

123

124

125

# This file is part of obs_base. 

# 

# Developed for the LSST Data Management System. 

# This product includes software developed by the LSST Project 

# (http://www.lsst.org). 

# See the COPYRIGHT file at the top-level directory of this distribution 

# for details of code ownership. 

# 

# This program is free software: you can redistribute it and/or modify 

# it under the terms of the GNU General Public License as published by 

# the Free Software Foundation, either version 3 of the License, or 

# (at your option) any later version. 

# 

# This program is distributed in the hope that it will be useful, 

# but WITHOUT ANY WARRANTY; without even the implied warranty of 

# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the 

# GNU General Public License for more details. 

# 

# You should have received a copy of the GNU General Public License 

# along with this program. If not, see <http://www.gnu.org/licenses/>. 

"""High-level interface to the Gen2 repository-walking functionality defined 

by this package. 

""" 

from __future__ import annotations 

 

__all__ = ["RepoWalker"] 

 

from collections import defaultdict 

import re 

from typing import ( 

Callable, 

ClassVar, 

Dict, 

Iterable, 

List, 

Mapping, 

Optional, 

Union, 

) 

 

from lsst.log import Log 

from lsst.daf.butler import ( 

DataCoordinate, 

DatasetType, 

FileDataset, 

) 

from .builders import BuilderTargetInput, BuilderSkipInput, BuilderTree 

from .scanner import DirectoryScanner 

 

 

class RepoWalker:
    """An object that recursively walks a Gen2 data repository tree, extracting
    Gen3 `FileDataset` objects and warning about unrecognized or unconvertible
    Gen2 datasets.

    Parameters
    ----------
    inputs : `~collections.abc.Iterable` of `Target` or `Skip`
        Structs that indicate dataset types to be extracted (`Target`) or
        explicitly skipped (`Skip`).  Skips may include a warning message to
        log when matching entries are encountered.
    fileIgnoreRegEx : `re.Pattern`, optional
        A regular expression pattern that identifies non-dataset files that
        can be ignored, to be applied at all levels of the directory tree.
    dirIgnoreRegEx : `re.Pattern`, optional
        A regular expression pattern that identifies non-dataset subdirectories
        that can be ignored, to be applied at all levels of the directory tree.

    Raises
    ------
    ValueError
        Raised if two inputs declare the same data ID key with different
        types.
    RuntimeError
        Raised if pruning the skipped datasets leaves nothing to search for.
    """

    def __init__(self, inputs: Iterable[Union[Target, Skip]], *,
                 fileIgnoreRegEx: Optional[re.Pattern] = None,
                 dirIgnoreRegEx: Optional[re.Pattern] = None):
        super().__init__()
        tree = BuilderTree()
        # Union of the keys declared by every input, used to check that a
        # key name is always associated with the same type.
        allKeys: Dict[str, type] = {}
        for leaf in inputs:
            tree.insert(0, leaf)
            for key, dtype in leaf.keys.items():
                # ``setdefault`` records the first type seen for ``key`` and
                # returns the recorded one, so a mismatch means a conflict.
                if allKeys.setdefault(key, dtype) != dtype:
                    raise ValueError(f"Multiple types for key '{key}': {dtype} "
                                     f"(from {leaf.template}) vs. {allKeys[key]}.")
        tree, messages, pruned = tree.prune()
        if pruned:
            # Fail early rather than build a scanner with nothing to find.
            raise RuntimeError("Nothing to search for after pruning skipped datasets:"
                               f" {'; '.join(messages)}.")
        self._scanner = DirectoryScanner()
        tree.fill(self._scanner, allKeys, {}, fileIgnoreRegEx=fileIgnoreRegEx,
                  dirIgnoreRegEx=dirIgnoreRegEx)

    Target: ClassVar[type] = BuilderTargetInput
    """An input struct type whose instances represent a dataset type to be
    extracted (`type`).
    """

    Skip: ClassVar[type] = BuilderSkipInput
    """An input struct type whose instances represent a dataset type to be
    explicitly skipped.
    """

    def walk(self, root: str, *, log: Log,
             predicate: Optional[Callable[[DataCoordinate], bool]] = None
             ) -> Mapping[DatasetType, List[FileDataset]]:
        """Walk a Gen2 repository root to extract Gen3 `FileDataset` instances
        from it.

        Parameters
        ----------
        root : `str`
            Absolute path to the repository root.
        log : `Log`
            Logger for warnings and diagnostic information.
        predicate : `~collections.abc.Callable`, optional
            If not `None`, a callable that returns `True` if a `DataCoordinate`
            is consistent with what we want to extract.  If ``predicate``
            returns `False`, the file or directory that data ID was extracted
            from will not be processed, even if it includes target dataset
            types.  If `None` (default), every data ID is accepted.

        Returns
        -------
        datasets : `defaultdict` [`DatasetType`, `list`[`FileDataset`]]
            Extracted datasets, grouped by Gen3 `DatasetType`.
        """
        if predicate is None:
            # No filter requested: substitute a predicate that accepts
            # everything so the scanner can call it unconditionally.
            def predicate(dataId: DataCoordinate) -> bool:
                return True
        datasets = defaultdict(list)
        # The scanner populates ``datasets`` in place.
        self._scanner.scan(root, datasets, log=log, predicate=predicate)
        return datasets