Coverage for python/lsst/pipe/base/pipeline_graph/_dataset_types.py: 47%

78 statements  

« prev     ^ index     » next       coverage.py v7.3.0, created at 2023-08-23 10:31 +0000

1# This file is part of pipe_base. 

2# 

3# Developed for the LSST Data Management System. 

4# This product includes software developed by the LSST Project 

5# (http://www.lsst.org). 

6# See the COPYRIGHT file at the top-level directory of this distribution 

7# for details of code ownership. 

8# 

9# This program is free software: you can redistribute it and/or modify 

10# it under the terms of the GNU General Public License as published by 

11# the Free Software Foundation, either version 3 of the License, or 

12# (at your option) any later version. 

13# 

14# This program is distributed in the hope that it will be useful, 

15# but WITHOUT ANY WARRANTY; without even the implied warranty of 

16# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the 

17# GNU General Public License for more details. 

18# 

19# You should have received a copy of the GNU General Public License 

20# along with this program. If not, see <http://www.gnu.org/licenses/>. 

21from __future__ import annotations 

22 

23__all__ = ("DatasetTypeNode",) 

24 

25import dataclasses 

26from collections.abc import Collection 

27from typing import TYPE_CHECKING, Any 

28 

29import networkx 

30from lsst.daf.butler import DatasetRef, DatasetType, DimensionGraph, Registry, StorageClass 

31from lsst.daf.butler.registry import MissingDatasetTypeError 

32 

33from ._exceptions import DuplicateOutputError 

34from ._nodes import NodeKey, NodeType 

35 

36if TYPE_CHECKING: 

37 from ._edges import ReadEdge, WriteEdge 

38 

39 

@dataclasses.dataclass(frozen=True, eq=False)
class DatasetTypeNode:
    """A pipeline-graph node holding the graph-wide resolution of a dataset
    type.

    Notes
    -----
    Each node stores the single definition of its dataset type shared by the
    whole graph: it always refers to the parent dataset type (never a
    component), and its storage class comes from the data repository's
    registered definition when one exists, falling back to the producing
    task's definition otherwise.

    Instances deliberately do not implement equality: there are several
    reasonable ways to compare these objects and no single obvious winner,
    so comparison is left to the caller.
    """

    dataset_type: DatasetType
    """Graph-wide definition of this dataset type."""

    is_initial_query_constraint: bool
    """Whether this dataset should constrain the initial data ID query in
    QuantumGraph generation.

    Only overall regular inputs can have this set to `True`, and then only
    when none of the input connections used ``deferQueryConstraint=True``.
    """

    is_prerequisite: bool
    """Whether this is a prerequisite input, i.e. one that must already
    exist in the Registry before a graph can be created.
    """

    producing_edge: WriteEdge | None
    """Edge from the task whose output writes this dataset type, or `None`
    if no task in the graph produces it.
    """

    consuming_edges: Collection[ReadEdge]
    """Edges to all tasks that read this dataset type."""

    @classmethod
    def _from_edges(
        cls, key: NodeKey, xgraph: networkx.MultiDiGraph, registry: Registry, previous: DatasetTypeNode | None
    ) -> DatasetTypeNode:
        """Build a dataset type node by resolving all edges that touch it.

        Parameters
        ----------
        key : `NodeKey`
            Named tuple that holds the dataset type and serves as the node
            object in the internal networkx graph.
        xgraph : `networkx.MultiDiGraph`
            The internal networkx graph.
        registry : `lsst.daf.butler.Registry`
            Registry client for the data repository; used only for dataset
            type definitions and the dimension universe.
        previous : `DatasetTypeNode` or `None`
            Previous node for this dataset type, reused when still valid.

        Returns
        -------
        node : `DatasetTypeNode`
            Node consistent with every edge pointing at it and with the data
            repository.
        """
        try:
            resolved = registry.getDatasetType(key.name)
            found_in_registry = True
        except MissingDatasetTypeError:
            resolved = None
            found_in_registry = False
        if previous is not None and previous.dataset_type == resolved:
            # The contributing edges are unchanged (resolutions are cleared
            # whenever edges are added or removed), and the registry
            # definition still matches, so the previous node remains valid.
            return previous
        is_initial_query_constraint = True
        is_prerequisite: bool | None = None
        producer: str | None = None
        producing_edge: WriteEdge | None = None
        # Incoming edges represent the output connections of tasks that
        # write this dataset type; they take precedence over inputs when
        # fixing the graph-wide definition (and hence the storage class we
        # register when using the graph to register dataset types).  Only
        # one producer is allowed, but that rule may not have been checked
        # before now, so this loop runs at most once and raises if a second
        # iteration ever happens.
        for _, _, producing_edge in xgraph.in_edges(key, data="instance"):
            assert producing_edge is not None, "Should only be None if we never loop."
            if producer is not None:
                raise DuplicateOutputError(
                    f"Dataset type {key.name!r} is produced by both {producing_edge.task_label!r} "
                    f"and {producer!r}."
                )
            producer = producing_edge.task_label
            resolved = producing_edge._resolve_dataset_type(resolved, universe=registry.dimensions)
            is_prerequisite = False
            is_initial_query_constraint = False
        consumers: list[str] = []
        # Outgoing edges are the input connections of consuming tasks.
        # Resolve non-component reads before component reads; the sort is
        # stable, so order is otherwise preserved.
        consuming_edges = sorted(
            (edge for _, _, edge in xgraph.out_edges(key, data="instance")),
            key=lambda edge: edge.component is not None,
        )
        for edge in consuming_edges:
            resolved, is_initial_query_constraint, is_prerequisite = edge._resolve_dataset_type(
                current=resolved,
                universe=registry.dimensions,
                is_initial_query_constraint=is_initial_query_constraint,
                is_prerequisite=is_prerequisite,
                is_registered=found_in_registry,
                producer=producer,
                consumers=consumers,
            )
            consumers.append(edge.task_label)
        assert resolved is not None, "Graph structure guarantees at least one edge."
        assert is_prerequisite is not None, "Having at least one edge guarantees is_prerequisite is known."
        return DatasetTypeNode(
            dataset_type=resolved,
            is_initial_query_constraint=is_initial_query_constraint,
            is_prerequisite=is_prerequisite,
            producing_edge=producing_edge,
            consuming_edges=tuple(consuming_edges),
        )

    @property
    def name(self) -> str:
        """Name of the dataset type.

        Always the parent dataset type's name, never a component's.
        """
        return self.dataset_type.name

    @property
    def key(self) -> NodeKey:
        """Key identifying this dataset type in internal and exported
        networkx graphs.
        """
        return NodeKey(NodeType.DATASET_TYPE, self.dataset_type.name)

    @property
    def dimensions(self) -> DimensionGraph:
        """Dimensions of this dataset type."""
        return self.dataset_type.dimensions

    @property
    def storage_class_name(self) -> str:
        """Name of this dataset type's storage class."""
        return self.dataset_type.storageClass_name

    @property
    def storage_class(self) -> StorageClass:
        """Storage class object for this dataset type."""
        return self.dataset_type.storageClass

    def __repr__(self) -> str:
        return "{} ({}, {})".format(self.name, self.storage_class_name, self.dimensions)

    def generalize_ref(self, ref: DatasetRef) -> DatasetRef:
        """Translate a task-specific `~lsst.daf.butler.DatasetRef` into one
        that uses this node's common dataset type.

        Parameters
        ----------
        ref : `lsst.daf.butler.DatasetRef`
            Reference convertible to this node's dataset type: either a
            component whose parent is this node's dataset type, or a
            reference with a compatible storage class.

        Returns
        -------
        ref : `lsst.daf.butler.DatasetRef`
            Reference whose dataset type is exactly this node's.
        """
        parent_ref = ref.makeCompositeRef() if ref.isComponent() else ref
        if parent_ref.datasetType.storageClass_name == self.dataset_type.storageClass_name:
            return parent_ref
        return parent_ref.overrideStorageClass(self.dataset_type.storageClass_name)

    def _to_xgraph_state(self) -> dict[str, Any]:
        """Return this node's attributes as a mapping suitable for use in
        exported networkx graphs.
        """
        return dict(
            dataset_type=self.dataset_type,
            is_initial_query_constraint=self.is_initial_query_constraint,
            is_prerequisite=self.is_prerequisite,
            dimensions=self.dataset_type.dimensions,
            storage_class_name=self.dataset_type.storageClass_name,
            bipartite=NodeType.DATASET_TYPE.bipartite,
        )