# python/lsst/pipe/base/pipeline_graph/_dataset_types.py

# This file is part of pipe_base.
#
# Developed for the LSST Data Management System.
# This product includes software developed by the LSST Project
# (http://www.lsst.org).
# See the COPYRIGHT file at the top-level directory of this distribution
# for details of code ownership.
#
# This software is dual licensed under the GNU General Public License and also
# under a 3-clause BSD license. Recipients may choose which of these licenses
# to use; please see the files gpl-3.0.txt and/or bsd_license.txt,
# respectively. If you choose the GPL option then the following text applies
# (but note that there is still no warranty even if you opt for BSD instead):
#
# This program is free software: you can redistribute it and/or modify
# it under the terms of the GNU General Public License as published by
# the Free Software Foundation, either version 3 of the License, or
# (at your option) any later version.
#
# This program is distributed in the hope that it will be useful,
# but WITHOUT ANY WARRANTY; without even the implied warranty of
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
# GNU General Public License for more details.
#
# You should have received a copy of the GNU General Public License
# along with this program. If not, see <http://www.gnu.org/licenses/>.
from __future__ import annotations

__all__ = ("DatasetTypeNode",)

import dataclasses
from collections.abc import Collection
from typing import TYPE_CHECKING, Any

import networkx
from lsst.daf.butler import DatasetRef, DatasetType, DimensionGraph, Registry, StorageClass
from lsst.daf.butler.registry import MissingDatasetTypeError

from ._exceptions import DuplicateOutputError
from ._nodes import NodeKey, NodeType

if TYPE_CHECKING:
    from ._edges import ReadEdge, WriteEdge


@dataclasses.dataclass(frozen=True, eq=False)
class DatasetTypeNode:
    """A node in a pipeline graph that represents a resolved dataset type.

    Notes
    -----
    A dataset type node represents a common definition of the dataset type
    across the entire graph: it is never a component, and its storage class
    is the registry dataset type's storage class or, if the dataset type is
    not registered, the one defined by the producing task.

    Dataset type nodes are intentionally not equality comparable, since there
    are many different (and useful) ways to compare these objects with no
    clear winner as the most obvious behavior.
    """
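
    # Note on the dataclass flags above: ``frozen=True`` makes instances
    # immutable, and ``eq=False`` deliberately leaves comparison (and hence
    # hashing) at object identity, per the Notes in the docstring.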
    dataset_type: DatasetType
    """Common definition of this dataset type for the graph."""

    is_initial_query_constraint: bool
    """Whether this dataset should be included as a constraint in the initial
    query for data IDs in QuantumGraph generation.

    This is only `True` for dataset types that are overall regular inputs,
    and only if none of those input connections had
    ``deferQueryConstraint=True``.
    """

    is_prerequisite: bool
    """Whether this dataset type is a prerequisite input that must exist in
    the Registry before graph creation.
    """

    producing_edge: WriteEdge | None
    """The edge to the task that produces this dataset type."""

    consuming_edges: Collection[ReadEdge]
    """The edges to tasks that consume this dataset type."""

    @classmethod
    def _from_edges(
        cls, key: NodeKey, xgraph: networkx.MultiDiGraph, registry: Registry, previous: DatasetTypeNode | None
    ) -> DatasetTypeNode:
        """Construct a dataset type node from its edges.

        Parameters
        ----------
        key : `NodeKey`
            Named tuple that holds the dataset type name and serves as the
            node object in the internal networkx graph.
        xgraph : `networkx.MultiDiGraph`
            The internal networkx graph.
        registry : `lsst.daf.butler.Registry`
            Registry client for the data repository. Only used to get
            dataset type definitions and the dimension universe.
        previous : `DatasetTypeNode` or `None`
            Previous node for this dataset type.

        Returns
        -------
        node : `DatasetTypeNode`
            Node consistent with all edges pointing to it and the data
            repository.
        """
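        # Prefer the data repository's existing definition of the dataset
        # type when there is one; the edges below are resolved against it
        # (or build the definition from scratch when it is not registered).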
        try:
            dataset_type = registry.getDatasetType(key.name)
            is_registered = True
        except MissingDatasetTypeError:
            dataset_type = None
            is_registered = False
        if previous is not None and previous.dataset_type == dataset_type:
            # This node was already resolved (with exactly the same edges
            # contributing, since we clear resolutions when edges are added
            # or removed). The only thing that might have changed was the
            # definition in the registry, and it didn't.
            return previous
        is_initial_query_constraint = True
        is_prerequisite: bool | None = None
        producer: str | None = None
        producing_edge: WriteEdge | None = None
        # Iterate over the incoming edges to this node, which represent the
        # output connections of tasks that write this dataset type; these
        # take precedence over the inputs in determining the graph-wide
        # dataset type definition (and hence which storage class we register
        # when using the graph to register dataset types). There should only
        # be one such connection, but we won't necessarily have checked that
        # rule until here; the DuplicateOutputError raised below enforces it,
        # so the loop body can complete at most once.
        for _, _, producing_edge in xgraph.in_edges(key, data="instance"):
            assert producing_edge is not None, "Should only be None if we never loop."
            if producer is not None:
                raise DuplicateOutputError(
                    f"Dataset type {key.name!r} is produced by both {producing_edge.task_label!r} "
                    f"and {producer!r}."
                )
            producer = producing_edge.task_label
            dataset_type = producing_edge._resolve_dataset_type(dataset_type, universe=registry.dimensions)
            is_prerequisite = False
            is_initial_query_constraint = False
        consuming_edge: ReadEdge
        consumers: list[str] = []
        consuming_edges = list(
            consuming_edge for _, _, consuming_edge in xgraph.out_edges(key, data="instance")
        )
        # Put edges that are not component datasets before any edges that
        # are, so the parent dataset type's definition is established before
        # any component edges are resolved against it.
        consuming_edges.sort(key=lambda consuming_edge: consuming_edge.component is not None)
        for consuming_edge in consuming_edges:
            dataset_type, is_initial_query_constraint, is_prerequisite = consuming_edge._resolve_dataset_type(
                current=dataset_type,
                universe=registry.dimensions,
                is_initial_query_constraint=is_initial_query_constraint,
                is_prerequisite=is_prerequisite,
                is_registered=is_registered,
                producer=producer,
                consumers=consumers,
            )
            consumers.append(consuming_edge.task_label)
        assert dataset_type is not None, "Graph structure guarantees at least one edge."
        assert is_prerequisite is not None, "Having at least one edge guarantees is_prerequisite is known."
        return DatasetTypeNode(
            dataset_type=dataset_type,
            is_initial_query_constraint=is_initial_query_constraint,
            is_prerequisite=is_prerequisite,
            producing_edge=producing_edge,
            consuming_edges=tuple(consuming_edges),
        )

    @property
    def name(self) -> str:
        """Name of the dataset type.

        This is always the name of the parent dataset type, never that of a
        component.
        """
        return self.dataset_type.name

    @property
    def key(self) -> NodeKey:
        """Key that identifies this dataset type in internal and exported
        networkx graphs.
        """
        return NodeKey(NodeType.DATASET_TYPE, self.dataset_type.name)

    @property
    def dimensions(self) -> DimensionGraph:
        """Dimensions of the dataset type."""
        return self.dataset_type.dimensions

    @property
    def storage_class_name(self) -> str:
        """String name of the storage class for this dataset type."""
        return self.dataset_type.storageClass_name

    @property
    def storage_class(self) -> StorageClass:
        """Storage class for this dataset type."""
        return self.dataset_type.storageClass

    @property
    def is_calibration(self) -> bool:
        """Whether this dataset type can be included in
        `~lsst.daf.butler.CollectionType.CALIBRATION` collections.
        """
        return self.dataset_type.isCalibration()

    def __repr__(self) -> str:
        return f"{self.name} ({self.storage_class_name}, {self.dimensions})"

    def generalize_ref(self, ref: DatasetRef) -> DatasetRef:
        """Convert a `~lsst.daf.butler.DatasetRef` with the dataset type
        associated with some task to one with the common dataset type defined
        by this node.

        Parameters
        ----------
        ref : `lsst.daf.butler.DatasetRef`
            Reference whose dataset type is convertible to this node's,
            either because it is a component with the node's dataset type as
            its parent, or because it has a compatible storage class.

        Returns
        -------
        ref : `lsst.daf.butler.DatasetRef`
            Reference with exactly this node's dataset type.
        """
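        # First generalize a component ref to its parent composite ref; then
        # patch up any remaining storage class difference with an override.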
        if ref.isComponent():
            ref = ref.makeCompositeRef()
        if ref.datasetType.storageClass_name != self.dataset_type.storageClass_name:
            return ref.overrideStorageClass(self.dataset_type.storageClass_name)
        return ref

    def _to_xgraph_state(self) -> dict[str, Any]:
        """Convert this node's attributes into a dictionary suitable for use
        in exported networkx graphs.
        """
        return {
            "dataset_type": self.dataset_type,
            "is_initial_query_constraint": self.is_initial_query_constraint,
            "is_prerequisite": self.is_prerequisite,
            "dimensions": self.dataset_type.dimensions,
            "storage_class_name": self.dataset_type.storageClass_name,
            "bipartite": NodeType.DATASET_TYPE.bipartite,
        }
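

# Example (illustrative sketch, not part of the module): the dictionary
# returned by ``_to_xgraph_state`` becomes the per-node attribute dict in
# graphs exported from a resolved ``PipelineGraph``, so dataset type nodes
# can be inspected like any other networkx node. Assuming ``pipeline_graph``
# is a resolved ``lsst.pipe.base.pipeline_graph.PipelineGraph`` with a
# ``make_xgraph`` method, something like the following would list the
# storage class of every dataset type node:
#
#     xgraph = pipeline_graph.make_xgraph()
#     for node_key, state in xgraph.nodes(data=True):
#         if node_key.node_type is NodeType.DATASET_TYPE:
#             print(node_key.name, state["storage_class_name"])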