Coverage for python/lsst/obs/base/gen2to3/repoWalker/walker.py: 33%

40 statements  

« prev     ^ index     » next       coverage.py v6.4.1, created at 2022-06-09 03:03 -0700

1# This file is part of obs_base. 

2# 

3# Developed for the LSST Data Management System. 

4# This product includes software developed by the LSST Project 

5# (http://www.lsst.org). 

6# See the COPYRIGHT file at the top-level directory of this distribution 

7# for details of code ownership. 

8# 

9# This program is free software: you can redistribute it and/or modify 

10# it under the terms of the GNU General Public License as published by 

11# the Free Software Foundation, either version 3 of the License, or 

12# (at your option) any later version. 

13# 

14# This program is distributed in the hope that it will be useful, 

15# but WITHOUT ANY WARRANTY; without even the implied warranty of 

16# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the 

17# GNU General Public License for more details. 

18# 

19# You should have received a copy of the GNU General Public License 

20# along with this program. If not, see <http://www.gnu.org/licenses/>. 

21"""High-level interface to the Gen2 repository-walking functionality defined 

22by this package. 

23""" 

24from __future__ import annotations 

25 

26__all__ = ["RepoWalker"] 

27 

28import logging 

29import re 

30from collections import defaultdict 

31from typing import Callable, ClassVar, Dict, Iterable, List, Mapping, Optional, Union 

32 

33from lsst.daf.butler import DataCoordinate, DatasetType, FileDataset, Progress 

34 

35from .builders import BuilderSkipInput, BuilderTargetInput, BuilderTree 

36from .scanner import DirectoryScanner 

37 

38 

class RepoWalker:
    """An object that recursively walks a Gen2 data repository tree, extracting
    Gen3 `FileDataset` objects and warning about unrecognized or unconvertible
    Gen2 datasets.

    Parameters
    ----------
    inputs : `~collections.abc.Iterable` of `Target` or `Skip`
        Structs that indicate dataset types to be extracted (`Target`) or
        explicitly skipped (`Skip`).  Skips may include a warning message to
        log when matching entries are encountered.
    fileIgnoreRegEx : `re.Pattern`, optional
        A regular expression pattern that identifies non-dataset files that
        can be ignored, to be applied at all levels of the directory tree.
    dirIgnoreRegEx : `re.Pattern`, optional
        A regular expression pattern that identifies non-dataset subdirectories
        that can be ignored, to be applied at all levels of the directory tree.
    log : `logging.Logger`, optional
        Logger for warnings and diagnostic information.  If `None`, the
        ``"lsst.obs.base.gen2to3.repoWalker"`` logger is used.
    progress : `Progress`, optional
        Object to use to report incremental progress.

    Raises
    ------
    ValueError
        Raised if two inputs declare the same key with different types.
    """

    def __init__(
        self,
        inputs: Iterable[Union[Target, Skip]],
        *,
        fileIgnoreRegEx: Optional[re.Pattern] = None,
        dirIgnoreRegEx: Optional[re.Pattern] = None,
        log: Optional[logging.Logger] = None,
        progress: Optional[Progress] = None,
    ):
        super().__init__()
        if log is None:
            log = logging.getLogger("lsst.obs.base.gen2to3.repoWalker")
        self.log = log
        tree = BuilderTree(progress)
        # Union of the keys declared by all inputs; every input that uses a
        # given key must agree on its type.
        allKeys: Dict[str, type] = {}
        for leaf in inputs:
            # NOTE(review): the first argument is presumably the starting
            # level in the tree -- confirm against BuilderTree.insert.
            tree.insert(0, leaf)
            for key, dtype in leaf.keys.items():
                # setdefault records the first type seen for a key and returns
                # the recorded one, so a mismatch means conflicting inputs.
                if allKeys.setdefault(key, dtype) != dtype:
                    raise ValueError(
                        f"Multiple types for key '{key}': {dtype} "
                        f"(from {leaf.template}) vs. {allKeys[key]}."
                    )
        # The messages returned by prune() are not used here.
        tree, messages, pruned = tree.prune()
        if not pruned:
            self._scanner = DirectoryScanner(log=self.log)
            tree.fill(
                self._scanner, allKeys, {}, fileIgnoreRegEx=fileIgnoreRegEx, dirIgnoreRegEx=dirIgnoreRegEx
            )
        else:
            # Nothing to do; just remember this for later to avoid disturbing
            # higher-level code with the fact that walk() will be a no-op.
            self._scanner = None

    Target: ClassVar[type] = BuilderTargetInput
    """An input struct type whose instances represent a dataset type to be
    extracted (`type`).
    """

    Skip: ClassVar[type] = BuilderSkipInput
    """An input struct type whose instances represent a dataset type to be
    explicitly skipped (`type`).
    """

    def walk(
        self, root: str, *, predicate: Optional[Callable[[DataCoordinate], bool]] = None
    ) -> Mapping[DatasetType, Mapping[Optional[str], List[FileDataset]]]:
        """Walk a Gen2 repository root to extract Gen3 `FileDataset` instances
        from it.

        Parameters
        ----------
        root : `str`
            Absolute path to the repository root.
        predicate : `~collections.abc.Callable`, optional
            If not `None`, a callable that returns `True` if a `DataCoordinate`
            is consistent with what we want to extract.  If ``predicate``
            returns `False`, the file or directory that data ID was extracted
            from will not be processed, even if it includes target dataset
            types.  If `None` (the default), every data ID is accepted.

        Returns
        -------
        datasets : `defaultdict` [`DatasetType`, `defaultdict` ]
            Extracted datasets, grouped by Gen3 `DatasetType`.  Nested dict
            keys are "CALIBDATE" strings (for calibration datasets) or `None`
            (otherwise).  Nested dict values are lists of `FileDataset`.
            Empty if the walker has nothing to scan.
        """
        if predicate is None:

            def predicate(dataId: DataCoordinate) -> bool:
                # No filtering requested: accept everything.
                return True

        datasets = defaultdict(lambda: defaultdict(list))
        # _scanner is None when the constructor found nothing to do, in which
        # case this method is a no-op that returns the empty mapping.
        if self._scanner is not None:
            self._scanner.scan(root, datasets, predicate=predicate)
        return datasets