# This file is part of pipe_base.
#
# Developed for the LSST Data Management System.
# This product includes software developed by the LSST Project
# (http://www.lsst.org).
# See the COPYRIGHT file at the top-level directory of this distribution
# for details of code ownership.
#
# This software is dual licensed under the GNU General Public License and also
# under a 3-clause BSD license. Recipients may choose which of these licenses
# to use; please see the files gpl-3.0.txt and/or bsd_license.txt,
# respectively. If you choose the GPL option then the following text applies
# (but note that there is still no warranty even if you opt for BSD instead):
#
# This program is free software: you can redistribute it and/or modify
# it under the terms of the GNU General Public License as published by
# the Free Software Foundation, either version 3 of the License, or
# (at your option) any later version.
#
# This program is distributed in the hope that it will be useful,
# but WITHOUT ANY WARRANTY; without even the implied warranty of
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
# GNU General Public License for more details.
#
# You should have received a copy of the GNU General Public License
# along with this program. If not, see <http://www.gnu.org/licenses/>.
from __future__ import annotations

__all__ = ("PipelineGraph",)

import gzip
import itertools
import json
from collections.abc import Callable, Iterable, Iterator, Mapping, Sequence, Set
from typing import TYPE_CHECKING, Any, BinaryIO, Literal, TypeVar, cast

import networkx
import networkx.algorithms.bipartite
import networkx.algorithms.dag
from lsst.daf.butler import DataCoordinate, DataId, DatasetType, DimensionGroup, DimensionUniverse, Registry
from lsst.daf.butler.registry import MissingDatasetTypeError
from lsst.resources import ResourcePath, ResourcePathExpression

from ._dataset_types import DatasetTypeNode
from ._edges import Edge, ReadEdge, WriteEdge
from ._exceptions import (
    DuplicateOutputError,
    EdgesChangedError,
    PipelineDataCycleError,
    PipelineGraphError,
    PipelineGraphExceptionSafetyError,
    UnresolvedGraphError,
)
from ._mapping_views import DatasetTypeMappingView, TaskMappingView
from ._nodes import NodeKey, NodeType
from ._task_subsets import TaskSubset
from ._tasks import TaskImportMode, TaskInitNode, TaskNode, _TaskNodeImportedData

if TYPE_CHECKING:
    from ..config import PipelineTaskConfig
    from ..connections import PipelineTaskConnections
    from ..pipeline import TaskDef
    from ..pipelineTask import PipelineTask


_G = TypeVar("_G", bound=networkx.DiGraph | networkx.MultiDiGraph)


class PipelineGraph:
    """A graph representation of a fully-configured pipeline.

    `PipelineGraph` instances are typically constructed by calling
    `.Pipeline.to_graph`, but in rare cases constructing and then populating
    an empty one may be preferable.

    Parameters
    ----------
    description : `str`, optional
        String description for this pipeline.
    universe : `lsst.daf.butler.DimensionUniverse`, optional
        Definitions for all butler dimensions. If not provided, some
        attributes will not be available until `resolve` is called.
    data_id : `lsst.daf.butler.DataCoordinate` or other data ID, optional
        Data ID that represents a constraint on all quanta generated by this
        pipeline. This typically just holds the instrument constraint included
        in the pipeline definition, if there was one.
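
    Examples
    --------
    A minimal sketch of building a graph by hand; ``MyTask`` stands in for
    any concrete `PipelineTask` subclass and is not defined in this module::

        graph = PipelineGraph(description="example")
        graph.add_task("myTask", MyTask)
        graph.sort()
        print(list(graph.tasks))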
    """

    ###########################################################################
    #
    # Simple Pipeline Graph Inspection Interface:
    #
    # - for inspecting graph structure, not modifying it (except to sort and
    #   resolve);
    #
    # - no NodeKey objects, just string dataset type name and task label keys;
    #
    # - graph structure is represented as a pair of mappings, with methods to
    #   find neighbors and edges of nodes.
    #
    ###########################################################################

    def __init__(
        self,
        *,
        description: str = "",
        universe: DimensionUniverse | None = None,
        data_id: DataId | None = None,
    ) -> None:
        self._init_from_args(
            xgraph=None,
            sorted_keys=None,
            task_subsets=None,
            description=description,
            universe=universe,
            data_id=data_id,
        )

    def __repr__(self) -> str:
        return f"{type(self).__name__}({self.description!r}, tasks={self.tasks!s})"

    @property
    def description(self) -> str:
        """String description for this pipeline."""
        return self._description

    @description.setter
    def description(self, value: str) -> None:
        # Docstring in setter.
        self._description = value

    @property
    def universe(self) -> DimensionUniverse | None:
        """Definitions for all butler dimensions."""
        return self._universe

    @property
    def data_id(self) -> DataCoordinate:
        """Data ID that represents a constraint on all quanta generated from
        this pipeline.

        This may not be available unless `universe` is not `None`.
        """
        return DataCoordinate.standardize(self._raw_data_id, universe=self.universe)

    @property
    def tasks(self) -> TaskMappingView:
        """A mapping view of the tasks in the graph.

        This mapping has `str` task label keys and `TaskNode` values.
        Iteration is topologically and deterministically ordered if and only
        if `sort` has been called since the last modification to the graph.
        """
        return self._tasks

    @property
    def dataset_types(self) -> DatasetTypeMappingView:
        """A mapping view of the dataset types in the graph.

        This mapping has `str` parent dataset type name keys, but only
        provides access to its `DatasetTypeNode` values if `resolve` has been
        called since the last modification involving a task that uses a
        dataset type. See `DatasetTypeMappingView` for details.
        """
        return self._dataset_types

    @property
    def task_subsets(self) -> Mapping[str, TaskSubset]:
        """A mapping of all labeled subsets of tasks.

        Keys are subset labels, values are sets of task labels. See
        `TaskSubset` for more information.

        Use `add_task_subset` to add a new subset. The subsets themselves may
        be modified in-place.
        """
        return self._task_subsets

    @property
    def is_fully_resolved(self) -> bool:
        """Whether all of this graph's nodes are resolved."""
        return self._universe is not None and all(
            self.dataset_types.is_resolved(k) for k in self.dataset_types
        )

    @property
    def is_sorted(self) -> bool:
        """Whether this graph's tasks and dataset types are topologically
        sorted with the exact same deterministic tiebreakers that `sort` would
        apply.

        This may perform (and then discard) a full sort if `has_been_sorted`
        is `False`. If the goal is to obtain a sorted graph, it is better to
        just call `sort` without guarding that with an
        ``if not graph.is_sorted`` check.
        """
        if self._sorted_keys is not None:
            return True
        return all(
            sorted == unsorted
            for sorted, unsorted in zip(
                networkx.lexicographical_topological_sort(self._xgraph), self._xgraph, strict=True
            )
        )

    @property
    def has_been_sorted(self) -> bool:
        """Whether this graph's tasks and dataset types have been
        topologically sorted (with unspecified but deterministic tiebreakers)
        since the last modification to the graph.

        This may return `False` if the graph *happens* to be sorted but `sort`
        was never called, but it is potentially much faster than `is_sorted`,
        which may attempt (and then discard) a full sort if `has_been_sorted`
        is `False`.
        """
        return self._sorted_keys is not None

    def sort(self) -> None:
        """Sort this graph's nodes topologically with deterministic (but
        unspecified) tiebreakers.

        This does nothing if the graph is already known to be sorted.
        """
        if self._sorted_keys is None:
            try:
                sorted_keys: Sequence[NodeKey] = list(networkx.lexicographical_topological_sort(self._xgraph))
            except networkx.NetworkXUnfeasible as err:  # pragma: no cover
                # Shouldn't be possible to get here, because we check for
                # cycles when adding tasks, but we guard against it anyway.
                cycle = networkx.find_cycle(self._xgraph)
                raise PipelineDataCycleError(
                    f"Cycle detected while attempting to sort graph: {cycle}."
                ) from err
            self._reorder(sorted_keys)

    def copy(self) -> PipelineGraph:
        """Return a copy of this graph that copies all mutable state."""
        xgraph = self._xgraph.copy()
        result = PipelineGraph.__new__(PipelineGraph)
        result._init_from_args(
            xgraph,
            self._sorted_keys,
            task_subsets={
                k: TaskSubset(xgraph, v.label, set(v._members), v.description)
                for k, v in self._task_subsets.items()
            },
            description=self._description,
            universe=self.universe,
            data_id=self._raw_data_id,
        )
        return result

    def __copy__(self) -> PipelineGraph:
        # Fully shallow copies are dangerous; we don't want shared mutable
        # state to lead to broken class invariants.
        return self.copy()

    def __deepcopy__(self, memo: dict) -> PipelineGraph:
        # Genuine deep copies are unnecessary, since we should only ever care
        # that mutable state is copied.
        return self.copy()

    def diff_tasks(self, other: PipelineGraph) -> list[str]:
        """Compare two pipeline graphs.

        This only compares graph structure and task classes (including their
        edges). It does *not* compare full configuration (which is subject to
        spurious differences due to import-cache state), dataset type
        resolutions, or sort state.

        Parameters
        ----------
        other : `PipelineGraph`
            Graph to compare to.

        Returns
        -------
        differences : `list` [ `str` ]
            List of string messages describing differences between the
            pipelines. If empty, the graphs have the same tasks and
            connections.
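
        Examples
        --------
        A minimal sketch; ``graph_a`` and ``graph_b`` stand in for any two
        `PipelineGraph` instances::

            for message in graph_a.diff_tasks(graph_b):
                print(message)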
        """
        messages: list[str] = []
        common_labels: Set[str]
        if self.tasks.keys() != other.tasks.keys():
            common_labels = self.tasks.keys() & other.tasks.keys()
            messages.append(
                f"Pipelines have different tasks: A & ~B = {list(self.tasks.keys() - common_labels)}, "
                f"B & ~A = {list(other.tasks.keys() - common_labels)}."
            )
        else:
            common_labels = self.tasks.keys()
        for label in common_labels:
            a = self.tasks[label]
            b = other.tasks[label]
            if a.task_class != b.task_class:
                messages.append(
                    f"Task {label!r} has class {a.task_class_name} in A, but {b.task_class_name} in B."
                )
            messages.extend(a.diff_edges(b))
        return messages

    def producing_edge_of(self, dataset_type_name: str) -> WriteEdge | None:
        """Return the `WriteEdge` that links the producing task to the named
        dataset type.

        Parameters
        ----------
        dataset_type_name : `str`
            Dataset type name. Must not be a component.

        Returns
        -------
        edge : `WriteEdge` or `None`
            Producing edge or `None` if there isn't one in this graph.

        Raises
        ------
        DuplicateOutputError
            Raised if there are multiple tasks defined to produce this dataset
            type. This is only possible if the graph's dataset types are not
            resolved.

        Notes
        -----
        On resolved graphs, it may be slightly more efficient to use::

            graph.dataset_types[dataset_type_name].producing_edge

        but this method works on graphs with unresolved dataset types as well.
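
        Examples
        --------
        A minimal sketch; ``graph`` stands in for a `PipelineGraph` and the
        dataset type name is arbitrary::

            edge = graph.producing_edge_of("some_dataset")
            if edge is not None:
                print(f"{edge.parent_dataset_type_name} is written by {edge.task_label}.")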
        """
        producer: str | None = None
        producing_edge: WriteEdge | None = None
        for _, _, producing_edge in self._xgraph.in_edges(
            NodeKey(NodeType.DATASET_TYPE, dataset_type_name), data="instance"
        ):
            assert producing_edge is not None, "Should only be None if we never loop."
            if producer is not None:
                raise DuplicateOutputError(
                    f"Dataset type {dataset_type_name!r} is produced by both {producing_edge.task_label!r} "
                    f"and {producer!r}."
                )
            # Remember this producer so a second iteration triggers the
            # duplicate-output check above.
            producer = producing_edge.task_label
        return producing_edge

    def consuming_edges_of(self, dataset_type_name: str) -> list[ReadEdge]:
        """Return the `ReadEdge` objects that link the named dataset type to
        the tasks that consume it.

        Parameters
        ----------
        dataset_type_name : `str`
            Dataset type name. Must not be a component.

        Returns
        -------
        edges : `list` [ `ReadEdge` ]
            Edges that connect this dataset type to the tasks that consume it.

        Notes
        -----
        On resolved graphs, it may be slightly more efficient to use::

            graph.dataset_types[dataset_type_name].consuming_edges

        but this method works on graphs with unresolved dataset types as well.
        """
        return [
            edge
            for _, _, edge in self._xgraph.out_edges(
                NodeKey(NodeType.DATASET_TYPE, dataset_type_name), data="instance"
            )
        ]

    def producer_of(self, dataset_type_name: str) -> TaskNode | TaskInitNode | None:
        """Return the `TaskNode` or `TaskInitNode` that writes the given
        dataset type.

        Parameters
        ----------
        dataset_type_name : `str`
            Dataset type name. Must not be a component.

        Returns
        -------
        node : `TaskNode`, `TaskInitNode`, or `None`
            Producing node or `None` if there isn't one in this graph.

        Raises
        ------
        DuplicateOutputError
            Raised if there are multiple tasks defined to produce this dataset
            type. This is only possible if the graph's dataset types are not
            resolved.
        """
        if (producing_edge := self.producing_edge_of(dataset_type_name)) is not None:
            return self._xgraph.nodes[producing_edge.task_key]["instance"]
        return None

    def consumers_of(self, dataset_type_name: str) -> list[TaskNode | TaskInitNode]:
        """Return the `TaskNode` and/or `TaskInitNode` objects that read
        the given dataset type.

        Parameters
        ----------
        dataset_type_name : `str`
            Dataset type name. Must not be a component.

        Returns
        -------
        nodes : `list` [ `TaskNode` or `TaskInitNode` ]
            Task nodes for the tasks that consume this dataset type.

        Notes
        -----
        On resolved graphs, it may be slightly more efficient to use::

            graph.dataset_types[dataset_type_name].consuming_edges

        but this method works on graphs with unresolved dataset types as well.
        """
        return [
            self._xgraph.nodes[consuming_edge.task_key]["instance"]
            for consuming_edge in self.consuming_edges_of(dataset_type_name)
        ]

    def inputs_of(self, task_label: str, init: bool = False) -> dict[str, DatasetTypeNode | None]:
        """Return the dataset types that are inputs to a task.

        Parameters
        ----------
        task_label : `str`
            Label for the task in the pipeline.
        init : `bool`, optional
            If `True`, return init-input dataset types instead of runtime
            (including prerequisite) inputs.

        Returns
        -------
        inputs : `dict` [ `str`, `DatasetTypeNode` or `None` ]
            Dictionary with parent dataset type name keys and either
            `DatasetTypeNode` values (if the dataset type has been resolved)
            or `None` values.

        Notes
        -----
        To get the input edges of a task or task init node (which provide
        information about storage class overrides and components) use::

            graph.tasks[task_label].iter_all_inputs()

        or

            graph.tasks[task_label].init.iter_all_inputs()

        or the various mapping attributes of the `TaskNode` and `TaskInitNode`
        classes.
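
        Examples
        --------
        A minimal sketch; ``graph`` stands in for a `PipelineGraph` with a
        task labeled ``"myTask"``::

            for name, node in graph.inputs_of("myTask").items():
                print(name, "resolved" if node is not None else "unresolved")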
        """
        node: TaskNode | TaskInitNode = self.tasks[task_label] if not init else self.tasks[task_label].init
        return {
            edge.parent_dataset_type_name: self._xgraph.nodes[edge.dataset_type_key]["instance"]
            for edge in node.iter_all_inputs()
        }

    def outputs_of(
        self, task_label: str, init: bool = False, include_automatic_connections: bool = True
    ) -> dict[str, DatasetTypeNode | None]:
        """Return the dataset types that are outputs of a task.

        Parameters
        ----------
        task_label : `str`
            Label for the task in the pipeline.
        init : `bool`, optional
            If `True`, return init-output dataset types instead of runtime
            outputs.
        include_automatic_connections : `bool`, optional
            Whether to include automatic connections such as configs,
            metadata, and logs.

        Returns
        -------
        outputs : `dict` [ `str`, `DatasetTypeNode` or `None` ]
            Dictionary with parent dataset type name keys and either
            `DatasetTypeNode` values (if the dataset type has been resolved)
            or `None` values.

        Notes
        -----
        To get the output edges of a task or task init node (which provide
        information about storage class overrides and components) use::

            graph.tasks[task_label].iter_all_outputs()

        or

            graph.tasks[task_label].init.iter_all_outputs()

        or the various mapping attributes of the `TaskNode` and `TaskInitNode`
        classes.
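
        Examples
        --------
        A minimal sketch; ``graph`` stands in for a `PipelineGraph` with a
        task labeled ``"myTask"``::

            outputs = graph.outputs_of("myTask", include_automatic_connections=False)
            print(sorted(outputs))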
        """
        node: TaskNode | TaskInitNode = self.tasks[task_label] if not init else self.tasks[task_label].init
        iterable = node.iter_all_outputs() if include_automatic_connections else node.outputs.values()
        return {
            edge.parent_dataset_type_name: self._xgraph.nodes[edge.dataset_type_key]["instance"]
            for edge in iterable
        }

    def resolve(
        self,
        registry: Registry | None = None,
        dimensions: DimensionUniverse | None = None,
        dataset_types: Mapping[str, DatasetType] | None = None,
    ) -> None:
        """Resolve all dimensions and dataset types and check them for
        consistency.

        Resolving a graph also causes it to be sorted.

        Parameters
        ----------
        registry : `lsst.daf.butler.Registry`, optional
            Client for the data repository to resolve against. If not
            provided, both ``dimensions`` and ``dataset_types`` must be.
        dimensions : `lsst.daf.butler.DimensionUniverse`, optional
            Definitions for all dimensions.
        dataset_types : `~collections.abc.Mapping` [ `str`, \
                `~lsst.daf.butler.DatasetType` ], optional
            Mapping of dataset types to consider registered.

        Notes
        -----
        The `universe` attribute is set to ``dimensions`` and used to set all
        `TaskNode.dimensions` attributes. Dataset type nodes are resolved by
        first looking for a registry definition, then using the producing
        task's definition, then looking for consistency between all consuming
        task definitions.

        Raises
        ------
        ConnectionTypeConsistencyError
            Raised if a prerequisite input for one task appears as a different
            kind of connection in any other task.
        DuplicateOutputError
            Raised if multiple tasks have the same dataset type as an output.
        IncompatibleDatasetTypeError
            Raised if different tasks have different definitions of a dataset
            type. Different but compatible storage classes are permitted.
        MissingDatasetTypeError
            Raised if a dataset type definition is required to exist in the
            data repository but none was found. This should only occur for
            dataset types that are not produced by a task in the pipeline and
            are consumed with different storage classes or as components by
            tasks in the pipeline.
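
        Examples
        --------
        A minimal sketch; ``butler`` stands in for any
        `lsst.daf.butler.Butler` client::

            graph.resolve(butler.registry)
            assert graph.is_fully_resolved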
        """
        if registry is None and (dimensions is None or dataset_types is None):
            raise PipelineGraphError(
                "Either 'registry' or both 'dimensions' and 'dataset_types' "
                "must be passed to PipelineGraph.resolve."
            )

        get_registered: Callable[[str], DatasetType | None]
        if dataset_types is not None:
            # Ruff seems confused about whether this is used below; it is!
            get_registered = dataset_types.get
        else:
            assert registry is not None

            def get_registered(name: str) -> DatasetType | None:
                try:
                    return registry.getDatasetType(name)
                except MissingDatasetTypeError:
                    return None

        if dimensions is None:
            assert registry is not None
            dimensions = registry.dimensions
        node_key: NodeKey
        updates: dict[NodeKey, TaskNode | DatasetTypeNode] = {}
        for node_key, node_state in self._xgraph.nodes.items():
            match node_key.node_type:
                case NodeType.TASK:
                    task_node: TaskNode = node_state["instance"]
                    new_task_node = task_node._resolved(dimensions)
                    if new_task_node is not task_node:
                        updates[node_key] = new_task_node
                case NodeType.DATASET_TYPE:
                    dataset_type_node: DatasetTypeNode | None = node_state["instance"]
                    new_dataset_type_node = DatasetTypeNode._from_edges(
                        node_key, self._xgraph, get_registered, dimensions, previous=dataset_type_node
                    )
                    # Usage of ``is`` here is intentional; `_from_edges`
                    # returns ``previous=dataset_type_node`` if it can
                    # determine that it doesn't need to change.
                    if new_dataset_type_node is not dataset_type_node:
                        updates[node_key] = new_dataset_type_node
        try:
            for node_key, node_value in updates.items():
                self._xgraph.nodes[node_key]["instance"] = node_value
        except Exception as err:  # pragma: no cover
            # There's no known way to get here, but we want to make it
            # clear it's a big problem if we do.
            raise PipelineGraphExceptionSafetyError(
                "Error during dataset type resolution has left the graph in an inconsistent state."
            ) from err
        self.sort()
        self._universe = dimensions

    ###########################################################################
    #
    # Graph Modification Interface:
    #
    # - methods to add, remove, and replace tasks;
    #
    # - methods to add and remove task subsets.
    #
    # These are all things that are usually done in a Pipeline before making a
    # graph at all, but there may be cases where we want to modify the graph
    # instead. (These are also the methods used to make a graph from a
    # Pipeline, or make a graph from another graph.)
    #
    ###########################################################################

    def add_task(
        self,
        label: str | None,
        task_class: type[PipelineTask],
        config: PipelineTaskConfig | None = None,
        connections: PipelineTaskConnections | None = None,
    ) -> TaskNode:
        """Add a new task to the graph.

        Parameters
        ----------
        label : `str` or `None`
            Label for the task in the pipeline. If `None`,
            `Task._DefaultName` is used.
        task_class : `type` [ `PipelineTask` ]
            Class object for the task.
        config : `PipelineTaskConfig`, optional
            Configuration for the task. If not provided, a default-constructed
            instance of ``task_class.ConfigClass`` is used.
        connections : `PipelineTaskConnections`, optional
            Object that describes the dataset types used by the task. If not
            provided, one will be constructed from the given configuration. If
            provided, it is assumed that ``config`` has already been validated
            and frozen.

        Returns
        -------
        node : `TaskNode`
            The new task node added to the graph.

        Raises
        ------
        ValueError
            Raised if configuration validation failed when constructing
            ``connections``.
        PipelineDataCycleError
            Raised if the graph is cyclic after this addition.
        RuntimeError
            Raised if an unexpected exception (which will be chained) occurred
            at a stage that may have left the graph in an inconsistent state.
            Other exceptions should leave the graph unchanged.

        Notes
        -----
        Checks for dataset type consistency and multiple producers do not
        occur until `resolve` is called, since the resolution depends on both
        the state of the data repository and all contributing tasks.

        Adding new tasks removes any existing resolutions of all dataset types
        it references and marks the graph as unsorted. It is most efficient
        to add all tasks up front and only then resolve and/or sort the graph.
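
        Examples
        --------
        A minimal sketch; ``MyTask`` stands in for any concrete
        `PipelineTask` subclass::

            config = MyTask.ConfigClass()
            node = graph.add_task("myTask", MyTask, config=config)
            assert node.label == "myTask"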
        """
        if label is None:
            label = task_class._DefaultName
        if config is None:
            config = task_class.ConfigClass()
        task_node = TaskNode._from_imported_data(
            key=NodeKey(NodeType.TASK, label),
            init_key=NodeKey(NodeType.TASK_INIT, label),
            data=_TaskNodeImportedData.configure(label, task_class, config, connections),
            universe=self.universe,
        )
        self.add_task_nodes([task_node])
        return task_node

    def add_task_nodes(self, nodes: Iterable[TaskNode], parent: PipelineGraph | None = None) -> None:
        """Add one or more existing task nodes to the graph.

        Parameters
        ----------
        nodes : `~collections.abc.Iterable` [ `TaskNode` ]
            Iterable of task nodes to add. If any tasks have resolved
            dimensions, they must have the same dimension universe as the rest
            of the graph.
        parent : `PipelineGraph`, optional
            If provided, another `PipelineGraph` from which these nodes were
            obtained. Any dataset type nodes already present in ``parent``
            that are referenced by the given tasks will be used in this graph
            if they are not already present, preserving any dataset type
            resolutions present in the parent graph. Adding nodes from a
            parent graph after the graph has its own nodes (e.g. from
            `add_task`) or nodes from a third graph may result in invalid
            dataset type resolutions. It is safest to only use this argument
            when populating an empty graph for the first time.

        Raises
        ------
        PipelineDataCycleError
            Raised if the graph is cyclic after this addition.

        Notes
        -----
        Checks for dataset type consistency and multiple producers do not
        occur until `resolve` is called, since the resolution depends on both
        the state of the data repository and all contributing tasks.

        Adding new tasks removes any existing resolutions of all dataset types
        it references (unless ``parent is not None``) and marks the graph as
        unsorted. It is most efficient to add all tasks up front and only then
        resolve and/or sort the graph.
        """
        node_data: list[tuple[NodeKey, dict[str, Any]]] = []
        edge_data: list[tuple[NodeKey, NodeKey, str, dict[str, Any]]] = []
        for task_node in nodes:
            task_node = task_node._resolved(self._universe)
            node_data.append(
                (task_node.key, {"instance": task_node, "bipartite": task_node.key.node_type.bipartite})
            )
            node_data.append(
                (
                    task_node.init.key,
                    {"instance": task_node.init, "bipartite": task_node.init.key.node_type.bipartite},
                )
            )
            # Convert the edge objects attached to the task node to networkx.
            for read_edge in task_node.init.iter_all_inputs():
                self._append_graph_data_from_edge(node_data, edge_data, read_edge, parent=parent)
            for write_edge in task_node.init.iter_all_outputs():
                self._append_graph_data_from_edge(node_data, edge_data, write_edge, parent=parent)
            for read_edge in task_node.iter_all_inputs():
                self._append_graph_data_from_edge(node_data, edge_data, read_edge, parent=parent)
            for write_edge in task_node.iter_all_outputs():
                self._append_graph_data_from_edge(node_data, edge_data, write_edge, parent=parent)
            # Add a special edge (with no Edge instance) that connects the
            # TaskInitNode to the runtime TaskNode.
            edge_data.append((task_node.init.key, task_node.key, Edge.INIT_TO_TASK_NAME, {"instance": None}))
        if not node_data and not edge_data:
            return
        # Checks and preparation complete; time to start the actual
        # modification, during which it's hard to provide strong exception
        # safety. Start by resetting the sort ordering, if there is one.
        self._reset()
        try:
            self._xgraph.add_nodes_from(node_data)
            self._xgraph.add_edges_from(edge_data)
            if not networkx.algorithms.dag.is_directed_acyclic_graph(self._xgraph):
                cycle = networkx.find_cycle(self._xgraph)
                raise PipelineDataCycleError(f"Cycle detected while adding tasks: {cycle}.")
        except Exception:
            # First try to roll back our changes.
            try:
                self._xgraph.remove_edges_from(edge_data)
                self._xgraph.remove_nodes_from(key for key, _ in node_data)
            except Exception as err:  # pragma: no cover
                # There's no known way to get here, but we want to make it
                # clear it's a big problem if we do.
                raise PipelineGraphExceptionSafetyError(
                    "Error while attempting to revert PipelineGraph modification has left the graph in "
                    "an inconsistent state."
                ) from err
            # Successfully rolled back; raise the original exception.
            raise

    def reconfigure_tasks(
        self,
        *args: tuple[str, PipelineTaskConfig],
        check_edges_unchanged: bool = False,
        assume_edges_unchanged: bool = False,
        **kwargs: PipelineTaskConfig,
    ) -> None:
        """Update the configuration for one or more tasks.

        Parameters
        ----------
        *args : `tuple` [ `str`, `.PipelineTaskConfig` ]
            Positional arguments are each a 2-tuple of task label and new
            config object. Note that the same arguments may also be passed as
            ``**kwargs``, which is usually more readable, but task labels in
            ``*args`` are not required to be valid Python identifiers.
        check_edges_unchanged : `bool`, optional
            If `True`, require the edges (connections) of the modified tasks
            to remain unchanged after the configuration updates, and verify
            that this is the case.
        assume_edges_unchanged : `bool`, optional
            If `True`, the caller declares that the edges (connections) of the
            modified tasks will remain unchanged after the configuration
            updates, and that it is unnecessary to check this.
        **kwargs : `.PipelineTaskConfig`
            New config objects or overrides to apply to copies of the current
            config objects, with task labels as the keywords.

        Raises
        ------
        ValueError
            Raised if ``assume_edges_unchanged`` and ``check_edges_unchanged``
            are both `True`, or if the same task appears twice.
        EdgesChangedError
            Raised if ``check_edges_unchanged=True`` and the edges of a task
            do change.

        Notes
        -----
        If reconfiguring a task causes its edges to change, any dataset type
        nodes connected to that task (not just those whose edges have
        changed!) will be unresolved.
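
        Examples
        --------
        A minimal sketch; ``graph`` has a task labeled ``"myTask"`` whose
        config class is ``MyTask.ConfigClass`` (both hypothetical)::

            new_config = MyTask.ConfigClass()
            graph.reconfigure_tasks(myTask=new_config)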
        """
        new_configs: dict[str, PipelineTaskConfig] = {}
        for task_label, config_update in itertools.chain(args, kwargs.items()):
            if new_configs.setdefault(task_label, config_update) is not config_update:
                raise ValueError(f"Config for {task_label!r} provided more than once.")
        updates = {
            task_label: self.tasks[task_label]._reconfigured(config, rebuild=not assume_edges_unchanged)
            for task_label, config in new_configs.items()
        }
        self._replace_task_nodes(
            updates,
            check_edges_unchanged=check_edges_unchanged,
            assume_edges_unchanged=assume_edges_unchanged,
            message_header=(
                "Unexpected change in edges for task {task_label!r} from original config (A) to "
                "new configs (B):"
            ),
        )

    def remove_tasks(
        self, labels: Iterable[str], drop_from_subsets: bool = True
    ) -> list[tuple[TaskNode, set[str]]]:
        """Remove one or more tasks from the graph.

        Parameters
        ----------
        labels : `~collections.abc.Iterable` [ `str` ]
            Iterable of the labels of the tasks to remove.
        drop_from_subsets : `bool`, optional
            If `True`, drop each removed task from any subset in which it
            currently appears. If `False`, raise `PipelineGraphError` if any
            such subsets exist.

        Returns
        -------
        nodes_and_subsets : `list` [ `tuple` [ `TaskNode`, `set` [ `str` ] ] ]
            List of nodes removed and the labels of task subsets that
            referenced them.

        Raises
        ------
        PipelineGraphError
            Raised if ``drop_from_subsets`` is `False` and the task is still
            part of one or more subsets.

        Notes
        -----
        Removing a task will cause dataset nodes with no other referencing
        tasks to be removed. Any other dataset type nodes referenced by a
        removed task will be reset to an "unresolved" state.
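
        Examples
        --------
        A minimal sketch; ``graph`` stands in for a `PipelineGraph` with a
        task labeled ``"myTask"``::

            for node, subset_labels in graph.remove_tasks(["myTask"]):
                print(f"removed {node.label} from {subset_labels}")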
        """
        task_nodes_and_subsets = []
        dataset_types: set[NodeKey] = set()
        nodes_to_remove = set()
        for label in labels:
            task_node: TaskNode = self._xgraph.nodes[NodeKey(NodeType.TASK, label)]["instance"]
            # Find task subsets that reference this task.
            referencing_subsets = {
                subset_label
                for subset_label, task_subset in self.task_subsets.items()
                if label in task_subset
            }
            if not drop_from_subsets and referencing_subsets:
                raise PipelineGraphError(
                    f"Task {label!r} is still referenced by subset(s) {referencing_subsets}."
                )
            task_nodes_and_subsets.append((task_node, referencing_subsets))
            # Find dataset types referenced by this task.
            dataset_types.update(self._xgraph.predecessors(task_node.key))
            dataset_types.update(self._xgraph.successors(task_node.key))
            dataset_types.update(self._xgraph.predecessors(task_node.init.key))
            dataset_types.update(self._xgraph.successors(task_node.init.key))
            # Since there's an edge between the task and its init node, we'll
            # have added those two nodes here, too, and we don't want that.
            dataset_types.remove(task_node.init.key)
            dataset_types.remove(task_node.key)
            # Mark the task node and its init node for removal from the graph.
            nodes_to_remove.add(task_node.key)
            nodes_to_remove.add(task_node.init.key)
        # Process the referenced datasets to see which ones are orphaned and
        # need to be removed vs. just unresolved.
        nodes_to_unresolve = []
        for dataset_type_key in dataset_types:
            related_tasks = set()
            related_tasks.update(self._xgraph.predecessors(dataset_type_key))
            related_tasks.update(self._xgraph.successors(dataset_type_key))
            related_tasks.difference_update(nodes_to_remove)
            if not related_tasks:
                nodes_to_remove.add(dataset_type_key)
            else:
                nodes_to_unresolve.append(dataset_type_key)
        # Checks and preparation complete; time to start the actual
        # modification, during which it's hard to provide strong exception
        # safety. Start by resetting the sort ordering.
        self._reset()
        try:
            for dataset_type_key in nodes_to_unresolve:
                self._xgraph.nodes[dataset_type_key]["instance"] = None
            for task_node, referencing_subsets in task_nodes_and_subsets:
                for subset_label in referencing_subsets:
                    self._task_subsets[subset_label].remove(task_node.label)
            self._xgraph.remove_nodes_from(nodes_to_remove)
        except Exception as err:  # pragma: no cover
            # There's no known way to get here, but we want to make it
            # clear it's a big problem if we do.
            raise PipelineGraphExceptionSafetyError(
                "Error during task removal has left the graph in an inconsistent state."
            ) from err
        return task_nodes_and_subsets

    def add_task_subset(self, subset_label: str, task_labels: Iterable[str], description: str = "") -> None:
        """Add a label for a set of tasks that are already in the pipeline.

        Parameters
        ----------
        subset_label : `str`
            Label for this set of tasks.
        task_labels : `~collections.abc.Iterable` [ `str` ]
            Labels of the tasks to include in the set. All must already be
            included in the graph.
        description : `str`, optional
            String description to associate with this label.
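
        Examples
        --------
        A minimal sketch, assuming tasks labeled ``"isr"`` and
        ``"characterizeImage"`` are already present in the graph::

            graph.add_task_subset("step1", ["isr", "characterizeImage"])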
        """
        subset = TaskSubset(self._xgraph, subset_label, set(task_labels), description)
        self._task_subsets[subset_label] = subset

    def remove_task_subset(self, subset_label: str) -> None:
        """Remove a labeled set of tasks.

        Parameters
        ----------
        subset_label : `str`
            Label for this set of tasks.
        """
        del self._task_subsets[subset_label]

    ###########################################################################
    #
    # NetworkX Export Interface:
    #
    # - methods to export the PipelineGraph's content (or various subsets
    #   thereof) as NetworkX objects.
    #
    # These are particularly useful when writing tools to visualize the graph,
    # while providing options for which aspects of the graph (tasks, dataset
    # types, or both) to include, since all exported graphs have similar
    # attributes regardless of their structure.
    #
    ###########################################################################

    def make_xgraph(self) -> networkx.MultiDiGraph:
        """Export a networkx representation of the full pipeline graph,
        including both init and runtime edges.

        Returns
        -------
        xgraph : `networkx.MultiDiGraph`
            Directed acyclic graph with parallel edges.

        Notes
        -----
        The returned graph uses `NodeKey` instances for nodes. Parallel edges
        represent the same dataset type appearing in multiple connections for
        the same task, and are hence rare. The connection name is used as the
        edge key to disambiguate those parallel edges.

        Almost all edges connect dataset type nodes to task or task init nodes
        or vice versa, but there is also a special edge that connects each task
        init node to its runtime node. The existence of these edges makes the
        graph not quite bipartite, though its init-only and runtime-only
        subgraphs are bipartite.

        See `TaskNode`, `TaskInitNode`, `DatasetTypeNode`, `ReadEdge`, and
        `WriteEdge` for the descriptive node and edge attributes added.
        """
        return self._transform_xgraph_state(self._xgraph.copy(), skip_edges=False)

    def make_bipartite_xgraph(self, init: bool = False) -> networkx.MultiDiGraph:
        """Return a bipartite networkx representation of just the runtime or
        init-time pipeline graph.

        Parameters
        ----------
        init : `bool`, optional
            If `True` (`False` is default) return the graph of task
            initialization nodes and init input/output dataset types, instead
            of the graph of runtime task nodes and regular
            input/output/prerequisite dataset types.

        Returns
        -------
        xgraph : `networkx.MultiDiGraph`
            Directed acyclic graph with parallel edges.

        Notes
        -----
        The returned graph uses `NodeKey` instances for nodes. Parallel edges
        represent the same dataset type appearing in multiple connections for
        the same task, and are hence rare. The connection name is used as the
        edge key to disambiguate those parallel edges.

        This graph is bipartite because each dataset type node only has edges
        that connect it to a task [init] node, and vice versa.

        See `TaskNode`, `TaskInitNode`, `DatasetTypeNode`, `ReadEdge`, and
        `WriteEdge` for the descriptive node and edge attributes added.
        """
        return self._transform_xgraph_state(
            self._make_bipartite_xgraph_internal(init).copy(), skip_edges=False
        )

    def make_task_xgraph(self, init: bool = False) -> networkx.DiGraph:
        """Return a networkx representation of just the tasks in the pipeline.

        Parameters
        ----------
        init : `bool`, optional
            If `True` (`False` is default) return the graph of task
            initialization nodes, instead of the graph of runtime task nodes.

        Returns
        -------
        xgraph : `networkx.DiGraph`
            Directed acyclic graph with no parallel edges.

        Notes
        -----
        The returned graph uses `NodeKey` instances for nodes. The dataset
        types that link these tasks are not represented at all; edges have no
        attributes, and there are no parallel edges.

        See `TaskNode` and `TaskInitNode` for the descriptive node attributes
        added.
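
        Examples
        --------
        A minimal sketch of using the exported graph with networkx; ``graph``
        stands in for any `PipelineGraph`::

            task_xgraph = graph.make_task_xgraph()
            for key in networkx.topological_sort(task_xgraph):
                print(key.name)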
        """
        bipartite_xgraph = self._make_bipartite_xgraph_internal(init)
        task_keys = [
            key
            for key, bipartite in bipartite_xgraph.nodes(data="bipartite")
            if bipartite == NodeType.TASK.bipartite
        ]
        return self._transform_xgraph_state(
            networkx.algorithms.bipartite.projected_graph(networkx.DiGraph(bipartite_xgraph), task_keys),
            skip_edges=True,
        )

    def make_dataset_type_xgraph(self, init: bool = False) -> networkx.DiGraph:
        """Return a networkx representation of just the dataset types in the
        pipeline.

        Parameters
        ----------
        init : `bool`, optional
            If `True` (`False` is default) return the graph of init input and
            output dataset types, instead of the graph of runtime (input,
            output, prerequisite input) dataset types.

        Returns
        -------
        xgraph : `networkx.DiGraph`
            Directed acyclic graph with no parallel edges.

        Notes
        -----
        The returned graph uses `NodeKey` instances for nodes. The tasks that
        link these dataset types are not represented at all; edges have no
        attributes, and there are no parallel edges.

        See `DatasetTypeNode` for the descriptive node attributes added.
        """
        bipartite_xgraph = self._make_bipartite_xgraph_internal(init)
        dataset_type_keys = [
            key
            for key, bipartite in bipartite_xgraph.nodes(data="bipartite")
            if bipartite == NodeType.DATASET_TYPE.bipartite
        ]
        return self._transform_xgraph_state(
            networkx.algorithms.bipartite.projected_graph(
                networkx.DiGraph(bipartite_xgraph), dataset_type_keys
            ),
            skip_edges=True,
        )

    ###########################################################################
    #
    # Serialization Interface.
    #
    # Serialization of PipelineGraphs is currently experimental and may not be
    # retained in the future. All serialization methods are
    # underscore-prefixed to ensure nobody mistakes them for a stable
    # interface (let alone a stable file format).
    #
    ###########################################################################

    @classmethod
    def _read_stream(
        cls, stream: BinaryIO, import_mode: TaskImportMode = TaskImportMode.REQUIRE_CONSISTENT_EDGES
    ) -> PipelineGraph:
        """Read a serialized `PipelineGraph` from a file-like object.

        Parameters
        ----------
        stream : `BinaryIO`
            File-like object opened for binary reading, containing
            gzip-compressed JSON.
        import_mode : `TaskImportMode`, optional
            Whether to import tasks, and how to reconcile any differences
            between the imported task's connections and those that were
            persisted with the graph. Default is to check that they are the
            same.

        Returns
        -------
        graph : `PipelineGraph`
            Deserialized pipeline graph.

        Raises
        ------
        PipelineGraphReadError
            Raised if the serialized `PipelineGraph` is not self-consistent.
        EdgesChangedError
            Raised if ``import_mode`` is
            `TaskImportMode.REQUIRE_CONSISTENT_EDGES` and the edges of a task
            did change after import and reconfiguration.

        Notes
        -----
        `PipelineGraph` serialization is currently experimental and may be
        removed or significantly changed in the future, with no deprecation
        period.
        """
        from .io import SerializedPipelineGraph

        with gzip.open(stream, "rb") as uncompressed_stream:
            data = json.load(uncompressed_stream)
        serialized_graph = SerializedPipelineGraph.model_validate(data)
        return serialized_graph.deserialize(import_mode)

    @classmethod
    def _read_uri(
        cls,
        uri: ResourcePathExpression,
        import_mode: TaskImportMode = TaskImportMode.REQUIRE_CONSISTENT_EDGES,
    ) -> PipelineGraph:
        """Read a serialized `PipelineGraph` from a file at a URI.

        Parameters
        ----------
        uri : convertible to `lsst.resources.ResourcePath`
            URI to a gzip-compressed JSON file containing a serialized
            pipeline graph.
        import_mode : `TaskImportMode`, optional
            Whether to import tasks, and how to reconcile any differences
            between the imported task's connections and those that were
            persisted with the graph. Default is to check that they are the
            same.

        Returns
        -------
        graph : `PipelineGraph`
            Deserialized pipeline graph.

        Raises
        ------
        PipelineGraphReadError
            Raised if the serialized `PipelineGraph` is not self-consistent.
        EdgesChangedError
            Raised if ``import_mode`` is
            `TaskImportMode.REQUIRE_CONSISTENT_EDGES` and the edges of a task
            did change after import and reconfiguration.

        Notes
        -----
        `PipelineGraph` serialization is currently experimental and may be
        removed or significantly changed in the future, with no deprecation
        period.
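
        Examples
        --------
        A minimal round-trip sketch using these private, experimental helpers
        (the filename is arbitrary)::

            graph._write_uri("pipeline.json.gz")
            roundtripped = PipelineGraph._read_uri("pipeline.json.gz")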
        """
        uri = ResourcePath(uri)
        with uri.open("rb") as stream:
            return cls._read_stream(cast(BinaryIO, stream), import_mode=import_mode)

    def _write_stream(self, stream: BinaryIO) -> None:
        """Write the pipeline to a file-like object.

        Parameters
        ----------
        stream : `BinaryIO`
            File-like object opened for binary writing.

        Notes
        -----
        `PipelineGraph` serialization is currently experimental and may be
        removed or significantly changed in the future, with no deprecation
        period.

        The file format is gzipped JSON, and is intended to be human-readable,
        but it should not be considered a stable public interface for outside
        code, which should always use `PipelineGraph` methods (or at least the
        `io.SerializedPipelineGraph` class) to read these files.
        """
        from .io import SerializedPipelineGraph

        with gzip.open(stream, mode="wb") as compressed_stream:
            compressed_stream.write(
                SerializedPipelineGraph.serialize(self).model_dump_json(exclude_defaults=True).encode("utf-8")
            )

    def _write_uri(self, uri: ResourcePathExpression) -> None:
        """Write the pipeline to a file given a URI.

        Parameters
        ----------
        uri : convertible to `lsst.resources.ResourcePath`
            URI to write to. May have a ``.json.gz`` extension or no extension
            (which will cause a ``.json.gz`` extension to be added).

        Notes
        -----
        `PipelineGraph` serialization is currently experimental and may be
        removed or significantly changed in the future, with no deprecation
        period.

        The file format is gzipped JSON, and is intended to be human-readable,
        but it should not be considered a stable public interface for outside
        code, which should always use `PipelineGraph` methods (or at least the
        `io.SerializedPipelineGraph` class) to read these files.
        """
        uri = ResourcePath(uri)
        extension = uri.getExtension()
        if not extension:
            uri = uri.updatedExtension(".json.gz")
        elif extension != ".json.gz":
            raise ValueError("Expanded pipeline files should always have a .json.gz extension.")
        with uri.open(mode="wb") as stream:
            self._write_stream(cast(BinaryIO, stream))

    def _import_and_configure(
        self, import_mode: TaskImportMode = TaskImportMode.REQUIRE_CONSISTENT_EDGES
    ) -> None:
        """Import the `PipelineTask` classes referenced by all task nodes and
        update those nodes accordingly.

        Parameters
        ----------
        import_mode : `TaskImportMode`, optional
            Whether to import tasks, and how to reconcile any differences
            between the imported task's connections and those that were
            persisted with the graph. Default is to check that they are the
            same. This method does nothing if this is
            `TaskImportMode.DO_NOT_IMPORT`.

        Raises
        ------
        EdgesChangedError
            Raised if ``import_mode`` is
            `TaskImportMode.REQUIRE_CONSISTENT_EDGES` and the edges of a task
            did change after import and reconfiguration.

        Notes
        -----
        This method shouldn't need to be called unless the graph was
        deserialized without importing and configuring immediately, which is
        not the default behavior (but it can greatly speed up
        deserialization). If all tasks have already been imported this does
        nothing.

        Importing and configuring a task can change its
        `~TaskNode.task_class_name` or `~TaskNode.get_config_str` output,
        usually because the software used to read a serialized graph is newer
        than the software used to write it (e.g. a new config option has been
        added, or the task was moved to a new module with a forwarding alias
        left behind). These changes are allowed by
        `TaskImportMode.REQUIRE_CONSISTENT_EDGES`.

        If importing and configuring a task causes its edges to change, any
        dataset type nodes linked to those edges will be reset to the
        unresolved state.
        """
        if import_mode is TaskImportMode.DO_NOT_IMPORT:
            return
        rebuild = (
            import_mode is TaskImportMode.REQUIRE_CONSISTENT_EDGES
            or import_mode is TaskImportMode.OVERRIDE_EDGES
        )
        updates: dict[str, TaskNode] = {}
        node_key: NodeKey
        for node_key, node_state in self._xgraph.nodes.items():
            if node_key.node_type is NodeType.TASK:
                task_node: TaskNode = node_state["instance"]
                new_task_node = task_node._imported_and_configured(rebuild)
                if new_task_node is not task_node:
                    updates[task_node.label] = new_task_node
        self._replace_task_nodes(
            updates,
            check_edges_unchanged=(import_mode is TaskImportMode.REQUIRE_CONSISTENT_EDGES),
            assume_edges_unchanged=(import_mode is TaskImportMode.ASSUME_CONSISTENT_EDGES),
            message_header=(
                "In task with label {task_label!r}, persisted edges (A) "
                "differ from imported and configured edges (B):"
            ),
        )

    ###########################################################################
    #
    # Advanced PipelineGraph Inspection Interface:
    #
    # - methods to iterate over all nodes and edges, utilizing NodeKeys;
    #
    # - methods to find overall inputs and group nodes by their dimensions,
    #   which are important operations for QuantumGraph generation.
    #
    ###########################################################################

    def iter_edges(self, init: bool = False) -> Iterator[Edge]:
        """Iterate over edges in the graph.

        Parameters
        ----------
        init : `bool`, optional
            If `True` (`False` is default) iterate over the edges between task
            initialization node and init input/output dataset types, instead
            of the runtime task nodes and regular input/output/prerequisite
            dataset types.

        Returns
        -------
        edges : `~collections.abc.Iterator` [ `Edge` ]
            A lazy iterator over `Edge` (`WriteEdge` or `ReadEdge`) instances.

        Notes
        -----
        This method always returns *either* init edges or runtime edges, never
        both. The full (internal) graph that contains both also includes a
        special edge that connects each task init node to its runtime node;
        that is also never returned by this method, since it is never a part
        of the init-only or runtime-only subgraphs.
        """
        edge: Edge
        for _, _, edge in self._xgraph.edges(data="instance"):
            if edge is not None and edge.is_init == init:
                yield edge

    def iter_nodes(
        self,
    ) -> Iterator[
        tuple[Literal[NodeType.TASK_INIT], str, TaskInitNode]
        | tuple[Literal[NodeType.TASK], str, TaskNode]
        | tuple[Literal[NodeType.DATASET_TYPE], str, DatasetTypeNode | None]
    ]:
        """Iterate over nodes in the graph.

        Returns
        -------
        nodes : `~collections.abc.Iterator` [ `tuple` ]
            A lazy iterator over all of the nodes in the graph. Each yielded
            element is a tuple of:

            - the node type enum value (`NodeType`);
            - the string name for the node (task label or parent dataset type
              name);
            - the node value (`TaskNode`, `TaskInitNode`, `DatasetTypeNode`,
              or `None` for dataset type nodes that have not been resolved).
        """
        key: NodeKey
        if self._sorted_keys is not None:
            for key in self._sorted_keys:
                yield key.node_type, key.name, self._xgraph.nodes[key]["instance"]  # type: ignore
        else:
            for key, node in self._xgraph.nodes(data="instance"):
                yield key.node_type, key.name, node  # type: ignore

    def iter_overall_inputs(self) -> Iterator[tuple[str, DatasetTypeNode | None]]:
        """Iterate over all of the dataset types that are consumed but not
        produced by the graph.

        Returns
        -------
        dataset_types : `~collections.abc.Iterator` [ `tuple` ]
            A lazy iterator over the overall-input dataset types (including
            overall init inputs and prerequisites). Each yielded element is a
            tuple of:

            - the parent dataset type name;
            - the resolved `DatasetTypeNode`, or `None` if the dataset type
              has not been resolved.
        """
        for generation in networkx.algorithms.dag.topological_generations(self._xgraph):
            key: NodeKey
            for key in generation:
                # While we expect all tasks to have at least one input and
                # hence never appear in the first topological generation, that
                # is not true of task init nodes.
                if key.node_type is NodeType.DATASET_TYPE:
                    yield key.name, self._xgraph.nodes[key]["instance"]
            return

    def group_by_dimensions(
        self, prerequisites: bool = False
    ) -> dict[DimensionGroup, tuple[dict[str, TaskNode], dict[str, DatasetTypeNode]]]:
        """Group this graph's tasks and dataset types by their dimensions.

        Parameters
        ----------
        prerequisites : `bool`, optional
            If `True`, include prerequisite dataset types as well as regular
            input and output datasets (including intermediates).

        Returns
        -------
        groups : `dict` [ `DimensionGroup`, `tuple` ]
            A dictionary of groups keyed by `DimensionGroup`, in which each
            value is a tuple of:

            - a `dict` of `TaskNode` instances, keyed by task label;
            - a `dict` of `DatasetTypeNode` instances, keyed by dataset type
              name;

            that have those dimensions.

        Notes
        -----
        Init inputs and outputs are always included, but always have empty
        dimensions and hence are all grouped together.
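
        Examples
        --------
        A minimal sketch; ``graph`` stands in for a fully-resolved
        `PipelineGraph`::

            groups = graph.group_by_dimensions()
            for dimensions, (tasks, dataset_types) in groups.items():
                print(dimensions, sorted(tasks), sorted(dataset_types))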
        """
        result: dict[DimensionGroup, tuple[dict[str, TaskNode], dict[str, DatasetTypeNode]]] = {}
        next_new_value: tuple[dict[str, TaskNode], dict[str, DatasetTypeNode]] = ({}, {})
        for task_label, task_node in self.tasks.items():
            if task_node.dimensions is None:
                raise UnresolvedGraphError(f"Task with label {task_label!r} has not been resolved.")
            if (group := result.setdefault(task_node.dimensions, next_new_value)) is next_new_value:
                next_new_value = ({}, {})  # make new dicts for next time
            group[0][task_node.label] = task_node
        for dataset_type_name, dataset_type_node in self.dataset_types.items():
            if dataset_type_node is None:
                raise UnresolvedGraphError(f"Dataset type {dataset_type_name!r} has not been resolved.")
            if not dataset_type_node.is_prerequisite or prerequisites:
                if (
                    group := result.setdefault(
                        dataset_type_node.dataset_type.dimensions.as_group(), next_new_value
                    )
                ) is next_new_value:
                    next_new_value = ({}, {})  # make new dicts for next time
                group[1][dataset_type_node.name] = dataset_type_node
        return result

    def split_independent(self) -> Iterable[PipelineGraph]:
        """Iterate over independent subgraphs that together comprise this
        pipeline graph.

        Returns
        -------
        subgraphs : `Iterable` [ `PipelineGraph` ]
            An iterable over component subgraphs that could be run
            independently (they have only overall inputs in common). May be a
            lazy iterator.

        Notes
        -----
        All resolved dataset type nodes will be preserved.

        If there is only one component, ``self`` may be returned as the only
        element in the iterable.

        If `has_been_sorted`, all subgraphs will be sorted as well.
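
        Examples
        --------
        A minimal sketch; ``graph`` stands in for any resolved
        `PipelineGraph`::

            for subgraph in graph.split_independent():
                print(sorted(subgraph.tasks))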
        """
        # Having an overall input in common isn't enough to make subgraphs
        # dependent on each other, so we want to look for connected component
        # subgraphs of the task-only projected graph.
        bipartite_xgraph = self._make_bipartite_xgraph_internal(init=False)
        task_keys = {
            key
            for key, bipartite in bipartite_xgraph.nodes(data="bipartite")
            if bipartite == NodeType.TASK.bipartite
        }
        task_xgraph = networkx.algorithms.bipartite.projected_graph(
            networkx.DiGraph(bipartite_xgraph), task_keys
        )
        # "Weakly" connected means connected when edge direction is ignored,
        # which is the only kind of "connected" a DAG can ever be.
        for component_task_keys in networkx.algorithms.weakly_connected_components(task_xgraph):
            if component_task_keys == task_keys:
                yield self
                return
            else:
                component_subgraph = PipelineGraph(universe=self._universe)
                component_subgraph.add_task_nodes(
                    [self._xgraph.nodes[key]["instance"] for key in component_task_keys], parent=self
                )
                if self.has_been_sorted:
                    component_subgraph.sort()
                yield component_subgraph

    ###########################################################################
    #
    # Class- and Package-Private Methods.
    #
    ###########################################################################

    def _iter_task_defs(self) -> Iterator[TaskDef]:
        """Iterate over this pipeline as a sequence of `TaskDef` instances.

        Notes
        -----
        This is a package-private method intended to aid in the transition to
        a codebase more fully integrated with the `PipelineGraph` class, in
        which both `TaskDef` and `PipelineDatasetTypes` are expected to go
        away, and much of the functionality on the `Pipeline` class will be
        moved to `PipelineGraph` as well.

        Raises
        ------
        TaskNotImportedError
            Raised if `TaskNode.is_imported` is `False` for any task.
        """
        from ..pipeline import TaskDef

        for node in self._tasks.values():
            yield TaskDef(
                config=node.config,
                taskClass=node.task_class,
                label=node.label,
                connections=node.get_connections(),
            )
1545 def _init_from_args(
1546 self,
1547 xgraph: networkx.MultiDiGraph | None,
1548 sorted_keys: Sequence[NodeKey] | None,
1549 task_subsets: dict[str, TaskSubset] | None,
1550 description: str,
1551 universe: DimensionUniverse | None,
1552 data_id: DataId | None,
1553 ) -> None:
1554 """Initialize the graph with possibly-nontrivial arguments.
1556 Parameters
1557 ----------
1558 xgraph : `networkx.MultiDiGraph` or `None`
1559 The backing networkx graph, or `None` to create an empty one.
1560 This graph has `NodeKey` instances for nodes and the same structure
1561 as the graph exported by `make_xgraph`, but its nodes and edges
1562 have a single ``instance`` attribute that holds a `TaskNode`,
1563 `TaskInitNode`, `DatasetTypeNode` (or `None`), `ReadEdge`, or
1564 `WriteEdge` instance.
1565 sorted_keys : `Sequence` [ `NodeKey` ] or `None`
1566 Topologically sorted sequence of node keys, or `None` if the graph
1567 is not sorted.
1568 task_subsets : `dict` [ `str`, `TaskSubset` ] or `None`
1569 Labeled subsets of tasks. Values must be constructed with
1570 ``xgraph`` as their parent graph.
1571 description : `str`
1572 String description for this pipeline.
1573 universe : `lsst.daf.butler.DimensionUniverse` or `None`
1574 Definitions of all dimensions.
1575 data_id : `lsst.daf.butler.DataCoordinate` or other data ID, or `None`
1576 Data ID that represents a constraint on all quanta generated from
1577 this pipeline.
1579 Notes
1580 -----
1581 Only empty `PipelineGraph` instances should be constructed directly by
1582 users, and that constraint fixes the signature of ``__init__`` itself;
1583 methods on `PipelineGraph` and its helper classes still need to create
1584 instances with nontrivial state. Those methods can call this method
1585 after calling ``__new__`` manually, skipping ``__init__``.
1586 """
1587 self._xgraph = xgraph if xgraph is not None else networkx.MultiDiGraph()
1588 self._sorted_keys: Sequence[NodeKey] | None = None
1589 self._task_subsets = task_subsets if task_subsets is not None else {}
1590 self._description = description
1591 self._tasks = TaskMappingView(self._xgraph)
1592 self._dataset_types = DatasetTypeMappingView(self._xgraph)
1593 self._raw_data_id: dict[str, Any]
1594 if isinstance(data_id, DataCoordinate):
1595 if universe is None:
1596 universe = data_id.universe
1597 else:
1598 assert universe is data_id.universe, "data_id.universe and given universe differ"
1599 self._raw_data_id = dict(data_id.required)
1600 elif data_id is None:
1601 self._raw_data_id = {}
1602 else:
1603 self._raw_data_id = dict(data_id)
1604 self._universe = universe
1605 if sorted_keys is not None:
1606 self._reorder(sorted_keys)
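# Sketch of the construction pattern described in the Notes above: internal
# code builds a non-empty graph by bypassing ``__init__`` (all arguments
# here are placeholders):
#
#     graph = PipelineGraph.__new__(PipelineGraph)
#     graph._init_from_args(
#         xgraph, sorted_keys, task_subsets, description, universe, data_id
#     )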
1608 def _make_bipartite_xgraph_internal(self, init: bool) -> networkx.MultiDiGraph:
1609 """Make a bipartite init-only or runtime-only internal subgraph.
1611 See `make_bipartite_xgraph` for parameters and return values.
1613 Notes
1614 -----
1615 This method returns a view of the `PipelineGraph` object's internal
1616 backing graph, and hence should only be called in methods that copy the
1617 result either explicitly or by running a copying algorithm before
1618 returning it to the user.
1619 """
1620 return self._xgraph.edge_subgraph([edge.key for edge in self.iter_edges(init)])
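# ``edge_subgraph`` returns a live networkx view backed by ``self._xgraph``;
# one way for a caller to copy it before handing it to users (a sketch,
# relying on networkx's copy-on-construction behavior):
#
#     copied = networkx.MultiDiGraph(
#         self._make_bipartite_xgraph_internal(init=False)
#     )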
1622 def _transform_xgraph_state(self, xgraph: _G, skip_edges: bool) -> _G:
1623 """Transform networkx graph attributes in-place from the internal
1624 "instance" attributes to the documented exported attributes.
1626 Parameters
1627 ----------
1628 xgraph : `networkx.DiGraph` or `networkx.MultiDiGraph`
1629 Graph whose state should be transformed.
1630 skip_edges : `bool`
1631 If `True`, do not transform edge state.
1633 Returns
1634 -------
1635 xgraph : `networkx.DiGraph` or `networkx.MultiDiGraph`
1636 The same object passed in, after modification.
1638 Notes
1639 -----
1640 This should be called after making a copy of the internal graph but
1641 before any projection down to just task or dataset type nodes, since
1642 it assumes stateful edges.
1643 """
1644 state: dict[str, Any]
1645 for state in xgraph.nodes.values():
1646 node_value: TaskInitNode | TaskNode | DatasetTypeNode | None = state.pop("instance")
1647 if node_value is not None:
1648 state.update(node_value._to_xgraph_state())
1649 else:
1650 # This is a dataset type node that is not resolved.
1651 state["bipartite"] = NodeType.DATASET_TYPE.bipartite
1652 if not skip_edges:
1653 for _, _, state in xgraph.edges(data=True):
1654 edge: Edge | None = state.pop("instance", None)
1655 if edge is not None:
1656 state.update(edge._to_xgraph_state())
1657 return xgraph
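# After this transformation, consumers of the exported graph see documented
# attributes instead of the private ``instance`` object; hypothetical usage
# of a graph exported via `make_xgraph`:
#
#     xg = graph.make_xgraph()
#     for key, state in xg.nodes.items():
#         print(key, state.get("bipartite"))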
1659 def _replace_task_nodes(
1660 self,
1661 updates: Mapping[str, TaskNode],
1662 check_edges_unchanged: bool,
1663 assume_edges_unchanged: bool,
1664 message_header: str,
1665 ) -> None:
1666 """Replace task nodes and update edges and dataset type nodes
1667 accordingly.
1669 Parameters
1670 ----------
1671 updates : `Mapping` [ `str`, `TaskNode` ]
1672 New task nodes with task label keys. All keys must be task labels
1673 that are already present in the graph.
1674 check_edges_unchanged : `bool`, optional
1675 If `True`, require the edges (connections) of the modified tasks to
1676 remain unchanged after importing and configuring each task, and
1677 verify that this is the case.
1678 assume_edges_unchanged : `bool`, optional
1679 If `True`, the caller declares that the edges (connections) of the
1680 modified tasks will remain unchanged after importing and configuring
1681 each task, and that it is unnecessary to check this.
1682 message_header : `str`
1683 Template for `str.format` with a single ``task_label`` placeholder
1684 to use as the first line in `EdgesChangedError` messages that show
1685 the differences between new task edges and old task edges. Should
1686 include the fact that the rest of the message will refer to the old
1687 task as "A" and the new task as "B", and end with a colon.
1689 Raises
1690 ------
1691 ValueError
1692 Raised if ``assume_edges_unchanged`` and ``check_edges_unchanged``
1693 are both `True`, or if a full config is provided for a task after
1694 another full config or an override has already been provided.
1695 EdgesChangedError
1696 Raised if ``check_edges_unchanged=True`` and the edges of a task do
1697 change.
1698 """
1699 deep: dict[str, TaskNode] = {}
1700 shallow: dict[str, TaskNode] = {}
1701 if assume_edges_unchanged:
1702 if check_edges_unchanged:
1703 raise ValueError("Cannot simultaneously assume and check that edges have not changed.")
1704 shallow.update(updates)
1705 else:
1706 for task_label, new_task_node in updates.items():
1707 old_task_node = self.tasks[task_label]
1708 messages = old_task_node.diff_edges(new_task_node)
1709 if messages:
1710 if check_edges_unchanged:
1711 messages.insert(0, message_header.format(task_label=task_label))
1712 raise EdgesChangedError("\n".join(messages))
1713 else:
1714 deep[task_label] = new_task_node
1715 else:
1716 shallow[task_label] = new_task_node
1717 try:
1718 if deep:
1719 removed = self.remove_tasks(deep.keys(), drop_from_subsets=True)
1720 self.add_task_nodes(deep.values())
1721 for replaced_task_node, referencing_subsets in removed:
1722 for subset_label in referencing_subsets:
1723 self._task_subsets[subset_label].add(replaced_task_node.label)
1724 for task_node in shallow.values():
1725 self._xgraph.nodes[task_node.key]["instance"] = task_node
1726 self._xgraph.nodes[task_node.init.key]["instance"] = task_node.init
1727 except PipelineGraphExceptionSafetyError: # pragma: no cover
1728 raise
1729 except Exception as err: # pragma: no cover
1730 # There's no known way to get here, but we want to make it clear
1731 # it's a big problem if we do.
1732 raise PipelineGraphExceptionSafetyError(
1733 "Error while replacing tasks has left the graph in an inconsistent state."
1734 ) from err
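# Hypothetical call illustrating the flags documented above: replace one
# task's node while insisting that its connections did not change ("isr"
# and ``new_isr_node`` are invented for the sketch):
#
#     graph._replace_task_nodes(
#         {"isr": new_isr_node},
#         check_edges_unchanged=True,
#         assume_edges_unchanged=False,
#         message_header="Edges of task {task_label!r} changed (A=old, B=new):",
#     )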
1736 def _append_graph_data_from_edge(
1737 self,
1738 node_data: list[tuple[NodeKey, dict[str, Any]]],
1739 edge_data: list[tuple[NodeKey, NodeKey, str, dict[str, Any]]],
1740 edge: Edge,
1741 parent: PipelineGraph | None,
1742 ) -> None:
1743 """Append networkx state dictionaries for an edge and the corresponding
1744 dataset type node.
1746 Parameters
1747 ----------
1748 node_data : `list`
1749 List of node keys and state dictionaries. A node is appended if
1750 one does not already exist for this dataset type.
1751 edge_data : `list`
1752 List of node key pairs, connection names, and state dictionaries
1753 for edges.
1754 edge : `Edge`
1755 New edge being processed.
1756 parent : `PipelineGraph` or `None`
1757 Another pipeline graph whose dataset type nodes should be used
1758 when present.
1759 """
1760 new_dataset_type_node = None
1761 if parent is not None:
1762 new_dataset_type_node = parent._xgraph.nodes[edge.dataset_type_key].get("instance")
1763 if (existing_dataset_type_state := self._xgraph.nodes.get(edge.dataset_type_key)) is not None:
1764 existing_dataset_type_state["instance"] = new_dataset_type_node
1765 else:
1766 node_data.append(
1767 (
1768 edge.dataset_type_key,
1769 {
1770 "instance": new_dataset_type_node,
1771 "bipartite": NodeType.DATASET_TYPE.bipartite,
1772 },
1773 )
1774 )
1775 edge_data.append(
1776 edge.nodes
1777 + (
1778 edge.connection_name,
1779 {"instance": edge},
1780 )
1781 )
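# The lists populated here are shaped for networkx bulk insertion by the
# caller, roughly:
#
#     self._xgraph.add_nodes_from(node_data)
#     self._xgraph.add_edges_from(edge_data)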
1783 def _reorder(self, sorted_keys: Sequence[NodeKey]) -> None:
1784 """Set the order of all views of this graph from the given sorted
1785 sequence of task labels and dataset type names.
1786 """
1787 self._sorted_keys = sorted_keys
1788 self._tasks._reorder(sorted_keys)
1789 self._dataset_types._reorder(sorted_keys)
1791 def _reset(self) -> None:
1792 """Reset the all views of this graph following a modification that
1793 might invalidate them.
1794 """
1795 self._sorted_keys = None
1796 self._tasks._reset()
1797 self._dataset_types._reset()
1799 _xgraph: networkx.MultiDiGraph
1800 _sorted_keys: Sequence[NodeKey] | None
1801 _task_subsets: dict[str, TaskSubset]
1802 _description: str
1803 _tasks: TaskMappingView
1804 _dataset_types: DatasetTypeMappingView
1805 _raw_data_id: dict[str, Any]
1806 _universe: DimensionUniverse | None