Coverage for python/lsst/pipe/base/pipeline_graph/_dataset_types.py: 50% of 76 statements
# This file is part of pipe_base.
#
# Developed for the LSST Data Management System.
# This product includes software developed by the LSST Project
# (http://www.lsst.org).
# See the COPYRIGHT file at the top-level directory of this distribution
# for details of code ownership.
#
# This software is dual licensed under the GNU General Public License and also
# under a 3-clause BSD license. Recipients may choose which of these licenses
# to use; please see the files gpl-3.0.txt and/or bsd_license.txt,
# respectively. If you choose the GPL option then the following text applies
# (but note that there is still no warranty even if you opt for BSD instead):
#
# This program is free software: you can redistribute it and/or modify
# it under the terms of the GNU General Public License as published by
# the Free Software Foundation, either version 3 of the License, or
# (at your option) any later version.
#
# This program is distributed in the hope that it will be useful,
# but WITHOUT ANY WARRANTY; without even the implied warranty of
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
# GNU General Public License for more details.
#
# You should have received a copy of the GNU General Public License
# along with this program. If not, see <http://www.gnu.org/licenses/>.
from __future__ import annotations

__all__ = ("DatasetTypeNode",)

import dataclasses
from collections.abc import Callable, Collection
from typing import TYPE_CHECKING, Any

import networkx
from lsst.daf.butler import DatasetRef, DatasetType, DimensionGroup, DimensionUniverse, StorageClass

from ._exceptions import DuplicateOutputError
from ._nodes import NodeKey, NodeType

if TYPE_CHECKING:
    from ._edges import ReadEdge, WriteEdge


@dataclasses.dataclass(frozen=True, eq=False)
class DatasetTypeNode:
    """A node in a pipeline graph that represents a resolved dataset type.

    Notes
    -----
    A dataset type node represents a common definition of the dataset type
    across the entire graph: it is never a component, and its storage class
    is the registry dataset type's storage class or (if the dataset type is
    not registered) the one defined by the producing task.

    Dataset type nodes are intentionally not equality comparable, since there
    are many different (and useful) ways to compare these objects with no
    clear winner as the most obvious behavior.
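
    Examples
    --------
    A sketch of typical access, assuming a resolved `PipelineGraph` named
    ``graph``; the dataset type name is illustrative::

        node = graph.dataset_types["calexp"]
        print(node.name, node.storage_class_name, node.dimensions)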
59 """

    dataset_type: DatasetType
    """Common definition of this dataset type for the graph.
    """

    is_initial_query_constraint: bool
    """Whether this dataset should be included as a constraint in the initial
    query for data IDs in QuantumGraph generation.

    This is only `True` for dataset types that are overall regular inputs, and
    only if none of those input connections had ``deferQueryConstraint=True``.
    """

    is_prerequisite: bool
    """Whether this dataset type is a prerequisite input that must exist in
    the Registry before graph creation.
    """

    producing_edge: WriteEdge | None
    """The edge to the task that produces this dataset type."""

    consuming_edges: Collection[ReadEdge]
    """The edges to tasks that consume this dataset type."""

    @classmethod
    def _from_edges(
        cls,
        key: NodeKey,
        xgraph: networkx.MultiDiGraph,
        get_registered: Callable[[str], DatasetType | None],
        dimensions: DimensionUniverse,
        previous: DatasetTypeNode | None,
    ) -> DatasetTypeNode:
        """Construct a dataset type node from its edges.

        Parameters
        ----------
        key : `NodeKey`
            Named tuple that holds the dataset type name and serves as the
            node object in the internal networkx graph.
        xgraph : `networkx.MultiDiGraph`
            The internal networkx graph.
        get_registered : `~collections.abc.Callable`
            Callable that takes a dataset type name and returns the
            `DatasetType` registered in the data repository, or `None` if it
            is not registered.
        dimensions : `lsst.daf.butler.DimensionUniverse`
            Definitions of all dimensions.
        previous : `DatasetTypeNode` or `None`
            Previous node for this dataset type.

        Returns
        -------
        node : `DatasetTypeNode`
            Node consistent with all edges pointing to it and the data
            repository.
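
        Examples
        --------
        A minimal sketch of a ``get_registered`` callable backed by a butler
        registry; ``butler`` is assumed to be in scope here, and the
        exception's import path may differ across ``daf_butler`` versions::

            from lsst.daf.butler.registry import MissingDatasetTypeError

            def get_registered(name: str) -> DatasetType | None:
                try:
                    return butler.registry.getDatasetType(name)
                except MissingDatasetTypeError:
                    return None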
116 """
        dataset_type = get_registered(key.name)
        is_registered = dataset_type is not None
        if previous is not None and previous.dataset_type == dataset_type:
            # This node was already resolved (with exactly the same edges
            # contributing, since we clear resolutions when edges are added or
            # removed). The only thing that might have changed was the
            # definition in the registry, and it didn't.
            return previous
        is_initial_query_constraint = True
        is_prerequisite: bool | None = None
        producer: str | None = None
        producing_edge: WriteEdge | None = None
        # Iterate over the incoming edges to this node, which represent the
        # output connections of tasks that write this dataset type; these take
        # precedence over the inputs in determining the graph-wide dataset
        # type definition (and hence which storage class we register when
        # using the graph to register dataset types). There should only be one
        # such connection, but this loop is where we finally enforce that
        # rule, raising DuplicateOutputError if a second producer is found.
        for _, _, producing_edge in xgraph.in_edges(key, data="instance"):
            assert producing_edge is not None, "Should only be None if we never loop."
            if producer is not None:
                raise DuplicateOutputError(
                    f"Dataset type {key.name!r} is produced by both {producing_edge.task_label!r} "
                    f"and {producer!r}."
                )
            producer = producing_edge.task_label
            dataset_type = producing_edge._resolve_dataset_type(dataset_type, universe=dimensions)
            is_prerequisite = False
            is_initial_query_constraint = False
        consuming_edge: ReadEdge
        consumers: list[str] = []
        consuming_edges = list(
            consuming_edge for _, _, consuming_edge in xgraph.out_edges(key, data="instance")
        )
        # Put edges that are not component datasets before any edges that are.
        consuming_edges.sort(key=lambda consuming_edge: consuming_edge.component is not None)
        for consuming_edge in consuming_edges:
            dataset_type, is_initial_query_constraint, is_prerequisite = consuming_edge._resolve_dataset_type(
                current=dataset_type,
                universe=dimensions,
                is_initial_query_constraint=is_initial_query_constraint,
                is_prerequisite=is_prerequisite,
                is_registered=is_registered,
                producer=producer,
                consumers=consumers,
            )
            consumers.append(consuming_edge.task_label)
        assert dataset_type is not None, "Graph structure guarantees at least one edge."
        assert is_prerequisite is not None, "Having at least one edge guarantees is_prerequisite is known."
        return DatasetTypeNode(
            dataset_type=dataset_type,
            is_initial_query_constraint=is_initial_query_constraint,
            is_prerequisite=is_prerequisite,
            producing_edge=producing_edge,
            consuming_edges=tuple(consuming_edges),
        )

    @property
    def name(self) -> str:
        """Name of the dataset type.

        This is always the name of the parent dataset type, never that of a
        component.
        """
        return self.dataset_type.name

    @property
    def key(self) -> NodeKey:
        """Key that identifies this dataset type in internal and exported
        networkx graphs.
        """
        return NodeKey(NodeType.DATASET_TYPE, self.dataset_type.name)

    @property
    def dimensions(self) -> DimensionGroup:
        """Dimensions of the dataset type."""
        return self.dataset_type.dimensions.as_group()

    @property
    def storage_class_name(self) -> str:
        """String name of the storage class for this dataset type."""
        return self.dataset_type.storageClass_name

    @property
    def storage_class(self) -> StorageClass:
        """Storage class for this dataset type."""
        return self.dataset_type.storageClass

    @property
    def is_calibration(self) -> bool:
        """Whether this dataset type can be included in
        `~lsst.daf.butler.CollectionType.CALIBRATION` collections.
        """
        return self.dataset_type.isCalibration()

    def __repr__(self) -> str:
        return f"{self.name} ({self.storage_class_name}, {self.dimensions})"

    def generalize_ref(self, ref: DatasetRef) -> DatasetRef:
        """Convert a `~lsst.daf.butler.DatasetRef` with the dataset type
        associated with some task to one with the common dataset type defined
        by this node.

        Parameters
        ----------
        ref : `lsst.daf.butler.DatasetRef`
            Reference whose dataset type is convertible to this node's, either
            because it is a component with the node's dataset type as its
            parent, or because it has a compatible storage class.

        Returns
        -------
        ref : `lsst.daf.butler.DatasetRef`
            Reference with exactly this node's dataset type.
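
        Examples
        --------
        A sketch, assuming ``node`` is the node for a composite dataset type
        and ``component_ref`` is a resolved reference to one of its components
        (both names are illustrative)::

            parent_ref = node.generalize_ref(component_ref)
            assert parent_ref.datasetType == node.dataset_type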
231 """
232 if ref.isComponent():
233 ref = ref.makeCompositeRef()
234 if ref.datasetType.storageClass_name != self.dataset_type.storageClass_name:
235 return ref.overrideStorageClass(self.dataset_type.storageClass_name)
236 return ref

    def _to_xgraph_state(self) -> dict[str, Any]:
        """Convert this node's attributes into a dictionary suitable for use
        in exported networkx graphs.
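
        Examples
        --------
        A sketch of reading these attributes back from an exported graph,
        assuming ``xgraph`` came from an export method such as
        `PipelineGraph.make_xgraph`::

            storage_class = xgraph.nodes[node.key]["storage_class_name"]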
241 """
242 return {
243 "dataset_type": self.dataset_type,
244 "is_initial_query_constraint": self.is_initial_query_constraint,
245 "is_prerequisite": self.is_prerequisite,
246 "dimensions": self.dataset_type.dimensions,
247 "storage_class_name": self.dataset_type.storageClass_name,
248 "bipartite": NodeType.DATASET_TYPE.bipartite,
249 }