Hide keyboard shortcuts

Hot-keys on this page

r m x p   toggle line displays

j k   next/prev highlighted chunk

0   (zero) top of page

1   (one) first highlighted chunk

1# This file is part of obs_base. 

2# 

3# Developed for the LSST Data Management System. 

4# This product includes software developed by the LSST Project 

5# (http://www.lsst.org). 

6# See the COPYRIGHT file at the top-level directory of this distribution 

7# for details of code ownership. 

8# 

9# This program is free software: you can redistribute it and/or modify 

10# it under the terms of the GNU General Public License as published by 

11# the Free Software Foundation, either version 3 of the License, or 

12# (at your option) any later version. 

13# 

14# This program is distributed in the hope that it will be useful, 

15# but WITHOUT ANY WARRANTY; without even the implied warranty of 

16# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the 

17# GNU General Public License for more details. 

18# 

19# You should have received a copy of the GNU General Public License 

20# along with this program. If not, see <http://www.gnu.org/licenses/>. 

21"""High-level interface to the Gen2 repository-walking functionality defined 

22by this package. 

23""" 

24from __future__ import annotations 

25 

26__all__ = ["RepoWalker"] 

27 

28from collections import defaultdict 

29import re 

30from typing import ( 

31 Callable, 

32 ClassVar, 

33 Dict, 

34 Iterable, 

35 List, 

36 Mapping, 

37 Optional, 

38 Union, 

39) 

40 

41from lsst.log import Log 

42from lsst.daf.butler import ( 

43 DataCoordinate, 

44 DatasetType, 

45 FileDataset, 

46) 

47from .builders import BuilderTargetInput, BuilderSkipInput, BuilderTree 

48from .scanner import DirectoryScanner 

49 

50 

class RepoWalker:
    """An object that recursively walks a Gen2 data repository tree, extracting
    Gen3 `FileDataset` objects and warning about unrecognized or unconvertible
    Gen2 datasets.

    Parameters
    ----------
    inputs : `~collections.abc.Iterable` of `Target` or `Skip`
        Structs that indicate dataset types to be extracted (`Target`) or
        explicitly skipped (`Skip`).  Skips may include a warning message to
        log when matching entries are encountered.
    fileIgnoreRegEx : `re.Pattern`, optional
        A regular expression pattern that identifies non-dataset files that
        can be ignored, to be applied at all levels of the directory tree.
    dirIgnoreRegEx : `re.Pattern`, optional
        A regular expression pattern that identifies non-dataset subdirectories
        that can be ignored, to be applied at all levels of the directory tree.
    log : `Log`, optional
        Logger for warnings and diagnostic information.  If `None`, a logger
        is obtained from `Log.getLogger`.

    Raises
    ------
    ValueError
        Raised if two inputs declare the same key with different types.
    """
    def __init__(self, inputs: Iterable[Union[Target, Skip]], *,
                 fileIgnoreRegEx: Optional[re.Pattern] = None,
                 dirIgnoreRegEx: Optional[re.Pattern] = None,
                 log: Optional[Log] = None):
        super().__init__()
        if log is None:
            # NOTE(review): this logger name mentions "TranslatorFactory"
            # rather than this class -- confirm that is intentional.
            log = Log.getLogger("obs.base.gen2to3.TranslatorFactory")
        self.log = log
        tree = BuilderTree()
        # Union of the keys declared by all inputs; every input that uses a
        # given key must agree on its type.
        allKeys: Dict[str, type] = {}
        for leaf in inputs:
            tree.insert(0, leaf)
            for key, dtype in leaf.keys.items():
                # setdefault() records the first type seen for a key and
                # returns the recorded one, so a mismatch means a conflict.
                if allKeys.setdefault(key, dtype) != dtype:
                    raise ValueError(f"Multiple types for key '{key}': {dtype} "
                                     f"(from {leaf.template}) vs. {allKeys[key]}.")
        # The second element returned by prune() (messages) is not used here.
        tree, _messages, pruned = tree.prune()
        if not pruned:
            self._scanner = DirectoryScanner(log=self.log)
            tree.fill(self._scanner, allKeys, {}, fileIgnoreRegEx=fileIgnoreRegEx,
                      dirIgnoreRegEx=dirIgnoreRegEx)
        else:
            # Nothing to do; just remember this for later to avoid disturbing
            # higher-level code with the fact that walk() will be a no-op.
            self._scanner = None

    Target: ClassVar[type] = BuilderTargetInput
    """An input struct type whose instances represent a dataset type to be
    extracted (`type`).
    """

    Skip: ClassVar[type] = BuilderSkipInput
    """An input struct type whose instances represent a dataset type to be
    explicitly skipped.
    """

    def walk(self, root: str, *, predicate: Optional[Callable[[DataCoordinate], bool]] = None
             ) -> Mapping[DatasetType, List[FileDataset]]:
        """Walk a Gen2 repository root to extract Gen3 `FileDataset` instances
        from it.

        Parameters
        ----------
        root : `str`
            Absolute path to the repository root.
        predicate : `~collections.abc.Callable`, optional
            If not `None`, a callable that returns `True` if a `DataCoordinate`
            is consistent with what we want to extract.  If ``predicate``
            returns `False`, the file or directory that data ID was extracted
            from will not be processed, even if it includes target dataset
            types.  If `None` (the default), every data ID is accepted.

        Returns
        -------
        datasets : `defaultdict` [`DatasetType`, `list`[`FileDataset`]]
            Extracted datasets, grouped by Gen3 `DatasetType`.  Empty if this
            walker was constructed with nothing to scan for.
        """
        if predicate is None:
            # Accept everything, so the scanner can always call a predicate.
            def predicate(dataId: DataCoordinate) -> bool:
                return True
        datasets: Dict[DatasetType, List[FileDataset]] = defaultdict(list)
        # _scanner is None when pruning left nothing to look for; in that case
        # the walk is a no-op and the empty mapping is returned.
        if self._scanner is not None:
            self._scanner.scan(root, datasets, predicate=predicate)
        return datasets