Hide keyboard shortcuts

Hot-keys on this page

r m x p   toggle line displays

j k   next/prev highlighted chunk

0   (zero) top of page

1   (one) first highlighted chunk

1# This file is part of obs_base. 

2# 

3# Developed for the LSST Data Management System. 

4# This product includes software developed by the LSST Project 

5# (http://www.lsst.org). 

6# See the COPYRIGHT file at the top-level directory of this distribution 

7# for details of code ownership. 

8# 

9# This program is free software: you can redistribute it and/or modify 

10# it under the terms of the GNU General Public License as published by 

11# the Free Software Foundation, either version 3 of the License, or 

12# (at your option) any later version. 

13# 

14# This program is distributed in the hope that it will be useful, 

15# but WITHOUT ANY WARRANTY; without even the implied warranty of 

16# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the 

17# GNU General Public License for more details. 

18# 

19# You should have received a copy of the GNU General Public License 

20# along with this program. If not, see <http://www.gnu.org/licenses/>. 

21"""High-level interface to the Gen2 repository-walking functionality defined 

22by this package. 

23""" 

24from __future__ import annotations 

25 

26__all__ = ["RepoWalker"] 

27 

28from collections import defaultdict 

29import re 

30from typing import ( 

31 Callable, 

32 ClassVar, 

33 Dict, 

34 Iterable, 

35 List, 

36 Mapping, 

37 Optional, 

38 Union, 

39) 

40 

41from lsst.log import Log 

42from lsst.daf.butler import ( 

43 DataCoordinate, 

44 DatasetType, 

45 FileDataset, 

46) 

47from .builders import BuilderTargetInput, BuilderSkipInput, BuilderTree 

48from .scanner import DirectoryScanner 

49 

50 

class RepoWalker:
    """An object that recursively walks a Gen2 data repository tree, extracting
    Gen3 `FileDataset` objects and warning about unrecognized or unconvertible
    Gen2 datasets.

    Parameters
    ----------
    inputs : `~collections.abc.Iterable` of `Target` or `Skip`
        Structs that indicate dataset types to be extracted (`Target`) or
        explicitly skipped (`Skip`).  Skips may include a warning message to
        log when matching entries are encountered.
    fileIgnoreRegEx : `re.Pattern`, optional
        A regular expression pattern that identifies non-dataset files that
        can be ignored, to be applied at all levels of the directory tree.
    dirIgnoreRegEx : `re.Pattern`, optional
        A regular expression pattern that identifies non-dataset subdirectories
        that can be ignored, to be applied at all levels of the directory tree.

    Raises
    ------
    ValueError
        Raised if two inputs associate the same key with different types.
    RuntimeError
        Raised if ``BuilderTree.prune`` reports that the tree was pruned
        away, i.e. there is nothing left to search for.
    """

    def __init__(self, inputs: Iterable[Union[Target, Skip]], *,
                 fileIgnoreRegEx: Optional[re.Pattern] = None,
                 dirIgnoreRegEx: Optional[re.Pattern] = None):
        super().__init__()
        tree = BuilderTree()
        # Union of the keys of every input, used to detect inputs that
        # disagree about the type of a key they share.
        allKeys: Dict[str, type] = {}
        for leaf in inputs:
            tree.insert(0, leaf)
            for key, dtype in leaf.keys.items():
                # setdefault returns the previously-registered type if there
                # is one, so a mismatch means two inputs conflict on this key.
                if allKeys.setdefault(key, dtype) != dtype:
                    raise ValueError(f"Multiple types for key '{key}': {dtype} "
                                     f"(from {leaf.template}) vs. {allKeys[key]}.")
        tree, messages, pruned = tree.prune()
        if pruned:
            raise RuntimeError("Nothing to search for after pruning skipped datasets:"
                               f" {'; '.join(messages)}.")
        self._scanner = DirectoryScanner()
        tree.fill(self._scanner, allKeys, {}, fileIgnoreRegEx=fileIgnoreRegEx, dirIgnoreRegEx=dirIgnoreRegEx)

    Target: ClassVar[type] = BuilderTargetInput
    """An input struct type whose instances represent a dataset type to be
    extracted (`type`).
    """

    Skip: ClassVar[type] = BuilderSkipInput
    """An input struct type whose instances represent a dataset type to be
    explicitly skipped.
    """

    def walk(self, root: str, *, log: Log,
             predicate: Optional[Callable[[DataCoordinate], bool]] = None
             ) -> Mapping[DatasetType, List[FileDataset]]:
        """Walk a Gen2 repository root to extract Gen3 `FileDataset` instances
        from it.

        Parameters
        ----------
        root : `str`
            Absolute path to the repository root.
        log : `Log`
            Logger for warnings and diagnostic information.
        predicate : `~collections.abc.Callable`, optional
            If not `None`, a callable that returns `True` if a `DataCoordinate`
            is consistent with what we want to extract.  If ``predicate``
            returns `False`, the file or directory that data ID was extracted
            from will not be processed, even if it includes target dataset
            types.  If `None` (default), every data ID is accepted.

        Returns
        -------
        datasets : `defaultdict` [`DatasetType`, `list`[`FileDataset`]]
            Extracted datasets, grouped by Gen3 `DatasetType`.
        """
        if predicate is None:
            # No filter requested: substitute a predicate that accepts
            # everything so the scanner can always call it unconditionally.
            def predicate(dataId: DataCoordinate) -> bool:
                return True
        datasets = defaultdict(list)
        # The scanner populates ``datasets`` in place.
        self._scanner.scan(root, datasets, log=log, predicate=predicate)
        return datasets

125 return datasets