Coverage for python/lsst/pipe/base/pipeline_graph/_dataset_types.py: 50%

76 statements  

coverage.py v7.4.4, created at 2024-04-17 02:45 -0700

# This file is part of pipe_base.
#
# Developed for the LSST Data Management System.
# This product includes software developed by the LSST Project
# (http://www.lsst.org).
# See the COPYRIGHT file at the top-level directory of this distribution
# for details of code ownership.
#
# This software is dual licensed under the GNU General Public License and also
# under a 3-clause BSD license. Recipients may choose which of these licenses
# to use; please see the files gpl-3.0.txt and/or bsd_license.txt,
# respectively. If you choose the GPL option then the following text applies
# (but note that there is still no warranty even if you opt for BSD instead):
#
# This program is free software: you can redistribute it and/or modify
# it under the terms of the GNU General Public License as published by
# the Free Software Foundation, either version 3 of the License, or
# (at your option) any later version.
#
# This program is distributed in the hope that it will be useful,
# but WITHOUT ANY WARRANTY; without even the implied warranty of
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
# GNU General Public License for more details.
#
# You should have received a copy of the GNU General Public License
# along with this program. If not, see <http://www.gnu.org/licenses/>.
from __future__ import annotations

__all__ = ("DatasetTypeNode",)

import dataclasses
from collections.abc import Callable, Collection
from typing import TYPE_CHECKING, Any

import networkx
from lsst.daf.butler import DatasetRef, DatasetType, DimensionGroup, DimensionUniverse, StorageClass

from ._exceptions import DuplicateOutputError
from ._nodes import NodeKey, NodeType

if TYPE_CHECKING:
    from ._edges import ReadEdge, WriteEdge


@dataclasses.dataclass(frozen=True, eq=False)
class DatasetTypeNode:
    """A node in a pipeline graph that represents a resolved dataset type.

    Notes
    -----
    A dataset type node represents a common definition of the dataset type
    across the entire graph - it is never a component, and the storage class
    is the registry dataset type's storage class or (if there isn't one) the
    one defined by the producing task.

    Dataset type nodes are intentionally not equality comparable, since there
    are many different (and useful) ways to compare these objects with no
    clear winner as the most obvious behavior.
    """
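
    # A minimal sketch of comparisons callers can perform themselves, since
    # ``__eq__`` is deliberately disabled (``a`` and ``b`` are hypothetical
    # DatasetTypeNode instances):
    #
    #     a.name == b.name                  # same parent dataset type name
    #     a.dataset_type == b.dataset_type  # same full definition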

    dataset_type: DatasetType
    """Common definition of this dataset type for the graph."""

    is_initial_query_constraint: bool
    """Whether this dataset should be included as a constraint in the initial
    query for data IDs in QuantumGraph generation.

    This is only `True` for dataset types that are overall regular inputs, and
    only if none of those input connections had ``deferQueryConstraint=True``.
    """

    is_prerequisite: bool
    """Whether this dataset type is a prerequisite input that must exist in
    the Registry before graph creation.
    """

    producing_edge: WriteEdge | None
    """The edge to the task that produces this dataset type, or `None` if no
    task in the graph produces it."""

    consuming_edges: Collection[ReadEdge]
    """The edges to tasks that consume this dataset type."""

    @classmethod
    def _from_edges(
        cls,
        key: NodeKey,
        xgraph: networkx.MultiDiGraph,
        get_registered: Callable[[str], DatasetType | None],
        dimensions: DimensionUniverse,
        previous: DatasetTypeNode | None,
    ) -> DatasetTypeNode:
        """Construct a dataset type node from its edges.

        Parameters
        ----------
        key : `NodeKey`
            Named tuple that holds the dataset type name and serves as the
            node object in the internal networkx graph.
        xgraph : `networkx.MultiDiGraph`
            The internal networkx graph.
        get_registered : `~collections.abc.Callable`
            Callable that takes a dataset type name and returns the
            `DatasetType` registered in the data repository, or `None` if it
            is not registered.
        dimensions : `lsst.daf.butler.DimensionUniverse`
            Definitions of all dimensions.
        previous : `DatasetTypeNode` or `None`
            Previous node for this dataset type.

        Returns
        -------
        node : `DatasetTypeNode`
            Node consistent with all edges pointing to it and the data
            repository.
        """
        dataset_type = get_registered(key.name)
        is_registered = dataset_type is not None
        if previous is not None and previous.dataset_type == dataset_type:
            # This node was already resolved (with exactly the same edges
            # contributing, since we clear resolutions when edges are added
            # or removed). The only thing that might have changed was the
            # definition in the registry, and it didn't.
            return previous
        is_initial_query_constraint = True
        is_prerequisite: bool | None = None
        producer: str | None = None
        producing_edge: WriteEdge | None = None
        # Iterate over the incoming edges to this node, which represent the
        # output connections of tasks that write this dataset type; these
        # take precedence over the inputs in determining the graph-wide
        # dataset type definition (and hence which storage class we register
        # when using the graph to register dataset types). There should be
        # only one such connection, but we won't necessarily have checked
        # that rule until here; this loop raises as soon as it sees a second
        # producer, so at most one iteration completes.
        for _, _, producing_edge in xgraph.in_edges(key, data="instance"):
            assert producing_edge is not None, "Should only be None if we never loop."
            if producer is not None:
                raise DuplicateOutputError(
                    f"Dataset type {key.name!r} is produced by both {producing_edge.task_label!r} "
                    f"and {producer!r}."
                )
            producer = producing_edge.task_label
            dataset_type = producing_edge._resolve_dataset_type(dataset_type, universe=dimensions)
            is_prerequisite = False
            is_initial_query_constraint = False
        consuming_edge: ReadEdge
        consumers: list[str] = []
        consuming_edges = list(
            consuming_edge for _, _, consuming_edge in xgraph.out_edges(key, data="instance")
        )
        # Put edges that are not component datasets before any edges that are.
        consuming_edges.sort(key=lambda consuming_edge: consuming_edge.component is not None)
        for consuming_edge in consuming_edges:
            dataset_type, is_initial_query_constraint, is_prerequisite = consuming_edge._resolve_dataset_type(
                current=dataset_type,
                universe=dimensions,
                is_initial_query_constraint=is_initial_query_constraint,
                is_prerequisite=is_prerequisite,
                is_registered=is_registered,
                producer=producer,
                consumers=consumers,
            )
            consumers.append(consuming_edge.task_label)
        assert dataset_type is not None, "Graph structure guarantees at least one edge."
        assert is_prerequisite is not None, "Having at least one edge guarantees is_prerequisite is known."
        return DatasetTypeNode(
            dataset_type=dataset_type,
            is_initial_query_constraint=is_initial_query_constraint,
            is_prerequisite=is_prerequisite,
            producing_edge=producing_edge,
            consuming_edges=tuple(consuming_edges),
        )
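
    # A minimal sketch of the duplicate-output rule enforced above (the task
    # labels and dataset type name are hypothetical): if tasks "isr_a" and
    # "isr_b" both declare an output connection producing "postISRCCD",
    # resolving that node raises:
    #
    #     DuplicateOutputError: Dataset type 'postISRCCD' is produced by
    #     both 'isr_b' and 'isr_a'.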

174 

175 @property 

176 def name(self) -> str: 

177 """Name of the dataset type. 

178 

179 This is always the parent dataset type, never that of a component. 

180 """ 

181 return self.dataset_type.name 

182 

183 @property 

184 def key(self) -> NodeKey: 

185 """Key that identifies this dataset type in internal and exported 

186 networkx graphs. 

187 """ 

188 return NodeKey(NodeType.DATASET_TYPE, self.dataset_type.name) 

189 

190 @property 

191 def dimensions(self) -> DimensionGroup: 

192 """Dimensions of the dataset type.""" 

193 return self.dataset_type.dimensions.as_group() 

194 

195 @property 

196 def storage_class_name(self) -> str: 

197 """String name of the storage class for this dataset type.""" 

198 return self.dataset_type.storageClass_name 

199 

200 @property 

201 def storage_class(self) -> StorageClass: 

202 """Storage class for this dataset type.""" 

203 return self.dataset_type.storageClass 

204 

205 @property 

206 def is_calibration(self) -> bool: 

207 """Whether this dataset type can be included in 

208 `~lsst.daf.butler.CollectionType.CALIBRATION` collections. 

209 """ 

210 return self.dataset_type.isCalibration() 

211 

    def __repr__(self) -> str:
        return f"{self.name} ({self.storage_class_name}, {self.dimensions})"

    def generalize_ref(self, ref: DatasetRef) -> DatasetRef:
        """Convert a `~lsst.daf.butler.DatasetRef` with the dataset type
        associated with some task to one with the common dataset type defined
        by this node.

        Parameters
        ----------
        ref : `lsst.daf.butler.DatasetRef`
            Reference whose dataset type is convertible to this node's,
            either because it is a component with the node's dataset type as
            its parent, or because it has a compatible storage class.

        Returns
        -------
        ref : `lsst.daf.butler.DatasetRef`
            Reference with exactly this node's dataset type.
        """
        if ref.isComponent():
            ref = ref.makeCompositeRef()
        if ref.datasetType.storageClass_name != self.dataset_type.storageClass_name:
            return ref.overrideStorageClass(self.dataset_type.storageClass_name)
        return ref
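
    # A minimal usage sketch (``node`` and ``component_ref`` are
    # hypothetical, and ``component_ref`` is assumed to reference a component
    # whose parent dataset type matches this node's):
    #
    #     parent_ref = node.generalize_ref(component_ref)
    #     assert parent_ref.datasetType == node.dataset_type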

237 

238 def _to_xgraph_state(self) -> dict[str, Any]: 

239 """Convert this node's attributes into a dictionary suitable for use 

240 in exported networkx graphs. 

241 """ 

242 return { 

243 "dataset_type": self.dataset_type, 

244 "is_initial_query_constraint": self.is_initial_query_constraint, 

245 "is_prerequisite": self.is_prerequisite, 

246 "dimensions": self.dataset_type.dimensions, 

247 "storage_class_name": self.dataset_type.storageClass_name, 

248 "bipartite": NodeType.DATASET_TYPE.bipartite, 

249 }
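
# A minimal sketch of reading this state back from an exported networkx
# graph (``exported`` and ``node`` are hypothetical; the attribute names
# match the dict returned by _to_xgraph_state above):
#
#     state = exported.nodes[node.key]
#     state["storage_class_name"], state["bipartite"]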