Coverage for python/lsst/obs/base/gen2to3/repoWalker/walker.py: 28%

Shortcuts on this page

r m x p   toggle line displays

j k   next/prev highlighted chunk

0   (zero) top of page

1   (one) first highlighted chunk

38 statements  

1# This file is part of obs_base. 

2# 

3# Developed for the LSST Data Management System. 

4# This product includes software developed by the LSST Project 

5# (http://www.lsst.org). 

6# See the COPYRIGHT file at the top-level directory of this distribution 

7# for details of code ownership. 

8# 

9# This program is free software: you can redistribute it and/or modify 

10# it under the terms of the GNU General Public License as published by 

11# the Free Software Foundation, either version 3 of the License, or 

12# (at your option) any later version. 

13# 

14# This program is distributed in the hope that it will be useful, 

15# but WITHOUT ANY WARRANTY; without even the implied warranty of 

16# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the 

17# GNU General Public License for more details. 

18# 

19# You should have received a copy of the GNU General Public License 

20# along with this program. If not, see <http://www.gnu.org/licenses/>. 

21"""High-level interface to the Gen2 repository-walking functionality defined 

22by this package. 

23""" 

24from __future__ import annotations 

25 

26__all__ = ["RepoWalker"] 

27 

28from collections import defaultdict 

29import logging 

30import re 

31from typing import ( 

32 Callable, 

33 ClassVar, 

34 Dict, 

35 Iterable, 

36 List, 

37 Mapping, 

38 Optional, 

39 Union, 

40) 

41 

42from lsst.daf.butler import ( 

43 DataCoordinate, 

44 DatasetType, 

45 FileDataset, 

46 Progress, 

47) 

48from .builders import BuilderTargetInput, BuilderSkipInput, BuilderTree 

49from .scanner import DirectoryScanner 

50 

51 

class RepoWalker:
    """An object that recursively walks a Gen2 data repository tree, extracting
    Gen3 `FileDataset` objects and warning about unrecognized or unconvertable
    Gen2 datasets.

    Parameters
    ----------
    inputs : `~collections.abc.Iterable` of `Target` or `Skip`
        Structs that indicate dataset types to be extracted (`Target`) or
        explicitly skipped (`Skip`).  Skips may include a warning message to
        log when matching entries are encountered.
    fileIgnoreRegEx : `re.Pattern`, optional
        A regular expression pattern that identifies non-dataset files that
        can be ignored, to be applied at all levels of the directory tree.
    dirIgnoreRegEx : `re.Pattern`, optional
        A regular expression pattern that identifies non-dataset subdirectories
        that can be ignored, to be applied at all levels of the directory tree.
    log : `logging.Logger`, optional
        Logger for warnings and diagnostic information.  A default logger is
        created if this is `None`.
    progress : `Progress`, optional
        Object to use to report incremental progress.

    Raises
    ------
    ValueError
        Raised if two inputs declare the same template key with different
        types.
    """
    def __init__(self, inputs: Iterable[Union[Target, Skip]], *,
                 fileIgnoreRegEx: Optional[re.Pattern] = None,
                 dirIgnoreRegEx: Optional[re.Pattern] = None,
                 log: Optional[logging.Logger] = None,
                 progress: Optional[Progress] = None):
        super().__init__()
        if log is None:
            log = logging.getLogger("obs.base.gen2to3.TranslatorFactory")
        self.log = log
        tree = BuilderTree(progress)
        # Union of all template keys seen across all inputs; used to check
        # that a key always maps to the same type regardless of which input
        # declared it.
        allKeys: Dict[str, type] = {}
        for leaf in inputs:
            tree.insert(0, leaf)
            for key, dtype in leaf.keys.items():
                # setdefault returns the already-stored type (or stores and
                # returns dtype); a mismatch means two inputs disagree.
                if allKeys.setdefault(key, dtype) != dtype:
                    raise ValueError(f"Multiple types for key '{key}': {dtype} "
                                     f"(from {leaf.template}) vs. {allKeys[key]}.")
        tree, messages, pruned = tree.prune()
        if not pruned:
            self._scanner = DirectoryScanner(log=self.log)
            tree.fill(self._scanner, allKeys, {}, fileIgnoreRegEx=fileIgnoreRegEx,
                      dirIgnoreRegEx=dirIgnoreRegEx)
        else:
            # Nothing to do; just remember this for later to avoid disturbing
            # higher-level code with the fact that walk() will be a no-op.
            self._scanner = None

    Target: ClassVar[type] = BuilderTargetInput
    """An input struct type whose instances represent a dataset type to be
    extracted (`type`).
    """

    Skip: ClassVar[type] = BuilderSkipInput
    """An input struct type whose instances represent a dataset type to be
    explicitly skipped.
    """

    def walk(self, root: str, *,
             predicate: Optional[Callable[[DataCoordinate], bool]] = None
             ) -> Mapping[DatasetType, Mapping[Optional[str], List[FileDataset]]]:
        """Walk a Gen2 repository root to extract Gen3 `FileDataset` instances
        from it.

        Parameters
        ----------
        root : `str`
            Absolute path to the repository root.
        predicate : `~collections.abc.Callable`, optional
            If not `None`, a callable that returns `True` if a `DataCoordinate`
            is consistent with what we want to extract.  If ``predicate``
            returns `False`, the file or directory that data ID was extracted
            from will not be processed, even if it includes target dataset
            types.

        Returns
        -------
        datasets : `defaultdict` [`DatasetType`, `defaultdict` ]
            Extracted datasets, grouped by Gen3 `DatasetType`.  Nested dict
            keys are "CALIBDATE" strings (for calibration datasets) or `None`
            (otherwise).  Nested dict values are lists of `FileDataset`.
        """
        if predicate is None:
            # Default predicate accepts every data ID.
            def predicate(dataId: DataCoordinate) -> bool:
                return True
        datasets = defaultdict(lambda: defaultdict(list))
        # A pruned tree (no targets survived) leaves _scanner as None, making
        # walk() an intentional no-op that still returns an empty mapping.
        if self._scanner is not None:
            self._scanner.scan(root, datasets, predicate=predicate)
        return datasets