Coverage for python/lsst/pipe/base/pipeline_graph/_pipeline_graph.py: 19%
407 statements
coverage.py v7.4.4, created at 2024-04-20 02:43 -0700
# This file is part of pipe_base.
#
# Developed for the LSST Data Management System.
# This product includes software developed by the LSST Project
# (http://www.lsst.org).
# See the COPYRIGHT file at the top-level directory of this distribution
# for details of code ownership.
#
# This software is dual licensed under the GNU General Public License and also
# under a 3-clause BSD license. Recipients may choose which of these licenses
# to use; please see the files gpl-3.0.txt and/or bsd_license.txt,
# respectively. If you choose the GPL option then the following text applies
# (but note that there is still no warranty even if you opt for BSD instead):
#
# This program is free software: you can redistribute it and/or modify
# it under the terms of the GNU General Public License as published by
# the Free Software Foundation, either version 3 of the License, or
# (at your option) any later version.
#
# This program is distributed in the hope that it will be useful,
# but WITHOUT ANY WARRANTY; without even the implied warranty of
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
# GNU General Public License for more details.
#
# You should have received a copy of the GNU General Public License
# along with this program. If not, see <http://www.gnu.org/licenses/>.
from __future__ import annotations

__all__ = ("PipelineGraph",)

import gzip
import itertools
import json
from collections.abc import Callable, Iterable, Iterator, Mapping, Sequence, Set
from typing import TYPE_CHECKING, Any, BinaryIO, Literal, TypeVar, cast

import networkx
import networkx.algorithms.bipartite
import networkx.algorithms.dag
from lsst.daf.butler import DataCoordinate, DataId, DatasetType, DimensionGroup, DimensionUniverse, Registry
from lsst.daf.butler.registry import MissingDatasetTypeError
from lsst.resources import ResourcePath, ResourcePathExpression

from ._dataset_types import DatasetTypeNode
from ._edges import Edge, ReadEdge, WriteEdge
from ._exceptions import (
    DuplicateOutputError,
    EdgesChangedError,
    PipelineDataCycleError,
    PipelineGraphError,
    PipelineGraphExceptionSafetyError,
    UnresolvedGraphError,
)
from ._mapping_views import DatasetTypeMappingView, TaskMappingView
from ._nodes import NodeKey, NodeType
from ._task_subsets import TaskSubset
from ._tasks import TaskImportMode, TaskInitNode, TaskNode, _TaskNodeImportedData

if TYPE_CHECKING:
    from ..config import PipelineTaskConfig
    from ..connections import PipelineTaskConnections
    from ..pipeline import TaskDef
    from ..pipelineTask import PipelineTask

_G = TypeVar("_G", bound=networkx.DiGraph | networkx.MultiDiGraph)


class PipelineGraph:
    """A graph representation of a fully configured pipeline.

    `PipelineGraph` instances are typically constructed by calling
    `.Pipeline.to_graph`, but in rare cases constructing and then populating
    an empty one may be preferable.

    Parameters
    ----------
    description : `str`, optional
        String description for this pipeline.
    universe : `lsst.daf.butler.DimensionUniverse`, optional
        Definitions for all butler dimensions. If not provided, some
        attributes will not be available until `resolve` is called.
    data_id : `lsst.daf.butler.DataCoordinate` or other data ID, optional
        Data ID that represents a constraint on all quanta generated by this
        pipeline. This typically just holds the instrument constraint included
        in the pipeline definition, if there was one.
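
    Examples
    --------
    A minimal sketch of typical usage; it is illustrative rather than exact,
    and assumes a configured `.Pipeline` instance named ``pipeline`` with a
    task labeled ``"isr"``::

        graph = pipeline.to_graph()
        graph.sort()
        task_node = graph.tasks["isr"]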
87 """
89 ###########################################################################
90 #
91 # Simple Pipeline Graph Inspection Interface:
92 #
93 # - for inspecting graph structure, not modifying it (except to sort and]
94 # resolve);
95 #
96 # - no NodeKey objects, just string dataset type name and task label keys;
97 #
98 # - graph structure is represented as a pair of mappings, with methods to
99 # find neighbors and edges of nodes.
100 #
101 ###########################################################################
103 def __init__(
104 self,
105 *,
106 description: str = "",
107 universe: DimensionUniverse | None = None,
108 data_id: DataId | None = None,
109 ) -> None:
110 self._init_from_args(
111 xgraph=None,
112 sorted_keys=None,
113 task_subsets=None,
114 description=description,
115 universe=universe,
116 data_id=data_id,
117 )

    def __repr__(self) -> str:
        return f"{type(self).__name__}({self.description!r}, tasks={self.tasks!s})"

    @property
    def description(self) -> str:
        """String description for this pipeline."""
        return self._description

    @description.setter
    def description(self, value: str) -> None:
        # Docstring in getter.
        self._description = value

    @property
    def universe(self) -> DimensionUniverse | None:
        """Definitions for all butler dimensions."""
        return self._universe

    @property
    def data_id(self) -> DataCoordinate:
        """Data ID that represents a constraint on all quanta generated from
        this pipeline.

        This may not be available if `universe` is `None`.
        """
        return DataCoordinate.standardize(self._raw_data_id, universe=self.universe)

    @property
    def tasks(self) -> TaskMappingView:
        """A mapping view of the tasks in the graph.

        This mapping has `str` task label keys and `TaskNode` values. Iteration
        is topologically and deterministically ordered if and only if `sort`
        has been called since the last modification to the graph.
        """
        return self._tasks

    @property
    def dataset_types(self) -> DatasetTypeMappingView:
        """A mapping view of the dataset types in the graph.

        This mapping has `str` parent dataset type name keys, but only provides
        access to its `DatasetTypeNode` values if `resolve` has been called
        since the last modification involving a task that uses a dataset type.
        See `DatasetTypeMappingView` for details.
        """
        return self._dataset_types

    @property
    def task_subsets(self) -> Mapping[str, TaskSubset]:
        """A mapping of all labeled subsets of tasks.

        Keys are subset labels, values are sets of task labels. See
        `TaskSubset` for more information.

        Use `add_task_subset` to add a new subset. The subsets themselves may
        be modified in-place.
        """
        return self._task_subsets

    @property
    def is_fully_resolved(self) -> bool:
        """Whether all of this graph's nodes are resolved."""
        return self._universe is not None and all(
            self.dataset_types.is_resolved(k) for k in self.dataset_types
        )

    @property
    def is_sorted(self) -> bool:
        """Whether this graph's tasks and dataset types are topologically
        sorted with the exact same deterministic tiebreakers that `sort` would
        apply.

        This may perform (and then discard) a full sort if `has_been_sorted` is
        `False`. If the goal is to obtain a sorted graph, it is better to just
        call `sort` without guarding that with an ``if not graph.is_sorted``
        check.
        """
        if self._sorted_keys is not None:
            return True
        return all(
            sorted == unsorted
            for sorted, unsorted in zip(
                networkx.lexicographical_topological_sort(self._xgraph), self._xgraph, strict=True
            )
        )

    @property
    def has_been_sorted(self) -> bool:
        """Whether this graph's tasks and dataset types have been
        topologically sorted (with unspecified but deterministic tiebreakers)
        since the last modification to the graph.

        This may return `False` if the graph *happens* to be sorted but `sort`
        was never called, but it is potentially much faster than `is_sorted`,
        which may attempt (and then discard) a full sort if `has_been_sorted`
        is `False`.
        """
        return self._sorted_keys is not None

    def sort(self) -> None:
        """Sort this graph's nodes topologically with deterministic (but
        unspecified) tiebreakers.

        This does nothing if the graph is already known to be sorted.
        """
        if self._sorted_keys is None:
            try:
                sorted_keys: Sequence[NodeKey] = list(networkx.lexicographical_topological_sort(self._xgraph))
            except networkx.NetworkXUnfeasible as err:  # pragma: no cover
                # Shouldn't be possible to get here, because we check for
                # cycles when adding tasks, but we guard against it anyway.
                cycle = networkx.find_cycle(self._xgraph)
                raise PipelineDataCycleError(
                    f"Cycle detected while attempting to sort graph: {cycle}."
                ) from err
            self._reorder(sorted_keys)

    def copy(self) -> PipelineGraph:
        """Return a copy of this graph that copies all mutable state."""
        xgraph = self._xgraph.copy()
        result = PipelineGraph.__new__(PipelineGraph)
        result._init_from_args(
            xgraph,
            self._sorted_keys,
            task_subsets={
                k: TaskSubset(xgraph, v.label, set(v._members), v.description)
                for k, v in self._task_subsets.items()
            },
            description=self._description,
            universe=self.universe,
            data_id=self._raw_data_id,
        )
        return result

    def __copy__(self) -> PipelineGraph:
        # Fully shallow copies are dangerous; we don't want shared mutable
        # state to lead to broken class invariants.
        return self.copy()

    def __deepcopy__(self, memo: dict) -> PipelineGraph:
        # Genuine deep copies are unnecessary, since we should only ever care
        # that mutable state is copied.
        return self.copy()

    def diff_tasks(self, other: PipelineGraph) -> list[str]:
        """Compare two pipeline graphs.

        This only compares graph structure and task classes (including their
        edges). It does *not* compare full configuration (which is subject to
        spurious differences due to import-cache state), dataset type
        resolutions, or sort state.

        Parameters
        ----------
        other : `PipelineGraph`
            Graph to compare to.

        Returns
        -------
        differences : `list` [ `str` ]
            List of string messages describing differences between the
            pipelines. If empty, the graphs have the same tasks and
            connections.
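
        Examples
        --------
        A short sketch of reporting the differences between two graphs
        (``graph_a`` and ``graph_b`` are assumed to be `PipelineGraph`
        instances)::

            for message in graph_a.diff_tasks(graph_b):
                print(message)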
283 """
284 messages: list[str] = []
285 common_labels: Set[str]
286 if self.tasks.keys() != other.tasks.keys():
287 common_labels = self.tasks.keys() & other.tasks.keys()
288 messages.append(
289 f"Pipelines have different tasks: A & ~B = {list(self.tasks.keys() - common_labels)}, "
290 f"B & ~A = {list(other.tasks.keys() - common_labels)}."
291 )
292 else:
293 common_labels = self.tasks.keys()
294 for label in common_labels:
295 a = self.tasks[label]
296 b = other.tasks[label]
297 if a.task_class != b.task_class:
298 messages.append(
299 f"Task {label!r} has class {a.task_class_name} in A, " f"but {b.task_class_name} in B."
300 )
301 messages.extend(a.diff_edges(b))
302 return messages

    def producing_edge_of(self, dataset_type_name: str) -> WriteEdge | None:
        """Return the `WriteEdge` that links the producing task to the named
        dataset type.

        Parameters
        ----------
        dataset_type_name : `str`
            Dataset type name. Must not be a component.

        Returns
        -------
        edge : `WriteEdge` or `None`
            Producing edge or `None` if there isn't one in this graph.

        Raises
        ------
        DuplicateOutputError
            Raised if there are multiple tasks defined to produce this dataset
            type. This is only possible if the graph's dataset types are not
            resolved.

        Notes
        -----
        On resolved graphs, it may be slightly more efficient to use::

            graph.dataset_types[dataset_type_name].producing_edge

        but this method works on graphs with unresolved dataset types as well.
        """
        producer: str | None = None
        producing_edge: WriteEdge | None = None
        for _, _, producing_edge in self._xgraph.in_edges(
            NodeKey(NodeType.DATASET_TYPE, dataset_type_name), data="instance"
        ):
            assert producing_edge is not None, "Should only be None if we never loop."
            if producer is not None:
                raise DuplicateOutputError(
                    f"Dataset type {dataset_type_name!r} is produced by both {producing_edge.task_label!r} "
                    f"and {producer!r}."
                )
            # Record this edge's task so a second iteration triggers the
            # duplicate-output check above.
            producer = producing_edge.task_label
        return producing_edge

    def consuming_edges_of(self, dataset_type_name: str) -> list[ReadEdge]:
        """Return the `ReadEdge` objects that link the named dataset type to
        the tasks that consume it.

        Parameters
        ----------
        dataset_type_name : `str`
            Dataset type name. Must not be a component.

        Returns
        -------
        edges : `list` [ `ReadEdge` ]
            Edges that connect this dataset type to the tasks that consume it.

        Notes
        -----
        On resolved graphs, it may be slightly more efficient to use::

            graph.dataset_types[dataset_type_name].consuming_edges

        but this method works on graphs with unresolved dataset types as well.
        """
        return [
            edge
            for _, _, edge in self._xgraph.out_edges(
                NodeKey(NodeType.DATASET_TYPE, dataset_type_name), data="instance"
            )
        ]

    def producer_of(self, dataset_type_name: str) -> TaskNode | TaskInitNode | None:
        """Return the `TaskNode` or `TaskInitNode` that writes the given
        dataset type.

        Parameters
        ----------
        dataset_type_name : `str`
            Dataset type name. Must not be a component.

        Returns
        -------
        node : `TaskNode`, `TaskInitNode`, or `None`
            Producing node or `None` if there isn't one in this graph.

        Raises
        ------
        DuplicateOutputError
            Raised if there are multiple tasks defined to produce this dataset
            type. This is only possible if the graph's dataset types are not
            resolved.
        """
        if (producing_edge := self.producing_edge_of(dataset_type_name)) is not None:
            return self._xgraph.nodes[producing_edge.task_key]["instance"]
        return None

    def consumers_of(self, dataset_type_name: str) -> list[TaskNode | TaskInitNode]:
        """Return the `TaskNode` and/or `TaskInitNode` objects that read
        the given dataset type.

        Parameters
        ----------
        dataset_type_name : `str`
            Dataset type name. Must not be a component.

        Returns
        -------
        nodes : `list` [ `TaskNode` or `TaskInitNode` ]
            Nodes for the tasks that consume this dataset type.

        Notes
        -----
        On resolved graphs, it may be slightly more efficient to use::

            graph.dataset_types[dataset_type_name].consuming_edges

        but this method works on graphs with unresolved dataset types as well.
        """
        return [
            self._xgraph.nodes[consuming_edge.task_key]["instance"]
            for consuming_edge in self.consuming_edges_of(dataset_type_name)
        ]

    def inputs_of(self, task_label: str, init: bool = False) -> dict[str, DatasetTypeNode | None]:
        """Return the dataset types that are inputs to a task.

        Parameters
        ----------
        task_label : `str`
            Label for the task in the pipeline.
        init : `bool`, optional
            If `True`, return init-input dataset types instead of runtime
            (including prerequisite) inputs.

        Returns
        -------
        inputs : `dict` [ `str`, `DatasetTypeNode` or `None` ]
            Dictionary with parent dataset type name keys and either
            `DatasetTypeNode` values (if the dataset type has been resolved)
            or `None` values.

        Notes
        -----
        To get the input edges of a task or task init node (which provide
        information about storage class overrides and components) use::

            graph.tasks[task_label].iter_all_inputs()

        or

            graph.tasks[task_label].init.iter_all_inputs()

        or the various mapping attributes of the `TaskNode` and `TaskInitNode`
        class.
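
        Examples
        --------
        A small sketch; the task label ``"isr"`` is illustrative only::

            for name, node in graph.inputs_of("isr").items():
                print(name, "resolved" if node is not None else "unresolved")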
458 """
459 node: TaskNode | TaskInitNode = self.tasks[task_label] if not init else self.tasks[task_label].init
460 return {
461 edge.parent_dataset_type_name: self._xgraph.nodes[edge.dataset_type_key]["instance"]
462 for edge in node.iter_all_inputs()
463 }

    def outputs_of(
        self, task_label: str, init: bool = False, include_automatic_connections: bool = True
    ) -> dict[str, DatasetTypeNode | None]:
        """Return the dataset types that are outputs of a task.

        Parameters
        ----------
        task_label : `str`
            Label for the task in the pipeline.
        init : `bool`, optional
            If `True`, return init-output dataset types instead of runtime
            outputs.
        include_automatic_connections : `bool`, optional
            Whether to include automatic connections such as configs, metadata,
            and logs.

        Returns
        -------
        outputs : `dict` [ `str`, `DatasetTypeNode` or `None` ]
            Dictionary with parent dataset type name keys and either
            `DatasetTypeNode` values (if the dataset type has been resolved)
            or `None` values.

        Notes
        -----
        To get the output edges of a task or task init node (which provide
        information about storage class overrides and components) use::

            graph.tasks[task_label].iter_all_outputs()

        or

            graph.tasks[task_label].init.iter_all_outputs()

        or the various mapping attributes of the `TaskNode` and `TaskInitNode`
        class.
        """
        node: TaskNode | TaskInitNode = self.tasks[task_label] if not init else self.tasks[task_label].init
        iterable = node.iter_all_outputs() if include_automatic_connections else node.outputs.values()
        return {
            edge.parent_dataset_type_name: self._xgraph.nodes[edge.dataset_type_key]["instance"]
            for edge in iterable
        }

    def resolve(
        self,
        registry: Registry | None = None,
        dimensions: DimensionUniverse | None = None,
        dataset_types: Mapping[str, DatasetType] | None = None,
    ) -> None:
        """Resolve all dimensions and dataset types and check them for
        consistency.

        Resolving a graph also causes it to be sorted.

        Parameters
        ----------
        registry : `lsst.daf.butler.Registry`, optional
            Client for the data repository to resolve against. If not
            provided, both ``dimensions`` and ``dataset_types`` must be.
        dimensions : `lsst.daf.butler.DimensionUniverse`, optional
            Definitions for all dimensions.
        dataset_types : `~collections.abc.Mapping` [ `str`, \
                `~lsst.daf.butler.DatasetType` ], optional
            Mapping of dataset types to consider registered.

        Notes
        -----
        The `universe` attribute is set to ``dimensions`` and used to set all
        `TaskNode.dimensions` attributes. Dataset type nodes are resolved by
        first looking for a registry definition, then using the producing
        task's definition, then looking for consistency between all consuming
        task definitions.

        Raises
        ------
        ConnectionTypeConsistencyError
            Raised if a prerequisite input for one task appears as a different
            kind of connection in any other task.
        DuplicateOutputError
            Raised if multiple tasks have the same dataset type as an output.
        IncompatibleDatasetTypeError
            Raised if different tasks have different definitions of a dataset
            type. Different but compatible storage classes are permitted.
        MissingDatasetTypeError
            Raised if a dataset type definition is required to exist in the
            data repository but none was found. This should only occur for
            dataset types that are not produced by a task in the pipeline and
            are consumed with different storage classes or as components by
            tasks in the pipeline.
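
        Examples
        --------
        A minimal sketch, assuming a `~lsst.daf.butler.Butler` client named
        ``butler``::

            graph.resolve(butler.registry)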
558 """
559 get_registered: Callable[[str], DatasetType | None]
560 if registry is None:
561 if dimensions is None or dataset_types is None:
562 raise PipelineGraphError(
563 "Either 'registry' or both 'dimensions' and 'dataset_types' "
564 "must be passed to PipelineGraph.resolve."
565 )
567 else:
568 if dimensions is None:
569 dimensions = registry.dimensions
571 def get_registered(name: str) -> DatasetType | None:
572 try:
573 return registry.getDatasetType(name)
574 except MissingDatasetTypeError:
575 return None
577 if dataset_types is not None:
578 # Ruff seems confused about whether this is used below; it is!
579 get_registered = dataset_types.get
580 node_key: NodeKey
581 updates: dict[NodeKey, TaskNode | DatasetTypeNode] = {}
582 for node_key, node_state in self._xgraph.nodes.items():
583 match node_key.node_type:
584 case NodeType.TASK:
585 task_node: TaskNode = node_state["instance"]
586 new_task_node = task_node._resolved(dimensions)
587 if new_task_node is not task_node:
588 updates[node_key] = new_task_node
589 case NodeType.DATASET_TYPE:
590 dataset_type_node: DatasetTypeNode | None = node_state["instance"]
591 new_dataset_type_node = DatasetTypeNode._from_edges(
592 node_key, self._xgraph, get_registered, dimensions, previous=dataset_type_node
593 )
594 # Usage of `is`` here is intentional; `_from_edges` returns
595 # `previous=dataset_type_node` if it can determine that it
596 # doesn't need to change.
597 if new_dataset_type_node is not dataset_type_node:
598 updates[node_key] = new_dataset_type_node
599 try:
600 for node_key, node_value in updates.items():
601 self._xgraph.nodes[node_key]["instance"] = node_value
602 except Exception as err: # pragma: no cover
603 # There's no known way to get here, but we want to make it
604 # clear it's a big problem if we do.
605 raise PipelineGraphExceptionSafetyError(
606 "Error during dataset type resolution has left the graph in an inconsistent state."
607 ) from err
608 self.sort()
609 self._universe = dimensions

    ###########################################################################
    #
    # Graph Modification Interface:
    #
    # - methods to add, remove, and replace tasks;
    #
    # - methods to add and remove task subsets.
    #
    # These are all things that are usually done in a Pipeline before making a
    # graph at all, but there may be cases where we want to modify the graph
    # instead. (These are also the methods used to make a graph from a
    # Pipeline, or make a graph from another graph.)
    #
    ###########################################################################

    def add_task(
        self,
        label: str | None,
        task_class: type[PipelineTask],
        config: PipelineTaskConfig | None = None,
        connections: PipelineTaskConnections | None = None,
    ) -> TaskNode:
        """Add a new task to the graph.

        Parameters
        ----------
        label : `str` or `None`
            Label for the task in the pipeline. If `None`, `Task._DefaultName`
            is used.
        task_class : `type` [ `PipelineTask` ]
            Class object for the task.
        config : `PipelineTaskConfig`, optional
            Configuration for the task. If not provided, a default-constructed
            instance of ``task_class.ConfigClass`` is used.
        connections : `PipelineTaskConnections`, optional
            Object that describes the dataset types used by the task. If not
            provided, one will be constructed from the given configuration. If
            provided, it is assumed that ``config`` has already been validated
            and frozen.

        Returns
        -------
        node : `TaskNode`
            The new task node added to the graph.

        Raises
        ------
        ValueError
            Raised if configuration validation failed when constructing
            ``connections``.
        PipelineDataCycleError
            Raised if the graph is cyclic after this addition.
        RuntimeError
            Raised if an unexpected exception (which will be chained) occurred
            at a stage that may have left the graph in an inconsistent state.
            Other exceptions should leave the graph unchanged.

        Notes
        -----
        Checks for dataset type consistency and multiple producers do not occur
        until `resolve` is called, since the resolution depends on both the
        state of the data repository and all contributing tasks.

        Adding new tasks removes any existing resolutions of all dataset types
        it references and marks the graph as unsorted. It is most efficient
        to add all tasks up front and only then resolve and/or sort the graph.
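
        Examples
        --------
        A hedged sketch; ``MyTask`` stands in for any `PipelineTask` subclass
        and is not part of this package::

            config = MyTask.ConfigClass()
            node = graph.add_task("myTask", MyTask, config=config)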
677 """
678 if label is None:
679 label = task_class._DefaultName
680 if config is None:
681 config = task_class.ConfigClass()
682 task_node = TaskNode._from_imported_data(
683 key=NodeKey(NodeType.TASK, label),
684 init_key=NodeKey(NodeType.TASK_INIT, label),
685 data=_TaskNodeImportedData.configure(label, task_class, config, connections),
686 universe=self.universe,
687 )
688 self.add_task_nodes([task_node])
689 return task_node

    def add_task_nodes(self, nodes: Iterable[TaskNode], parent: PipelineGraph | None = None) -> None:
        """Add one or more existing task nodes to the graph.

        Parameters
        ----------
        nodes : `~collections.abc.Iterable` [ `TaskNode` ]
            Iterable of task nodes to add. If any tasks have resolved
            dimensions, they must have the same dimension universe as the rest
            of the graph.
        parent : `PipelineGraph`, optional
            If provided, another `PipelineGraph` from which these nodes were
            obtained. Any dataset type nodes already present in ``parent``
            that are referenced by the given tasks will be used in this graph
            if they are not already present, preserving any dataset type
            resolutions present in the parent graph. Adding nodes from a
            parent graph after the graph has its own nodes (e.g. from
            `add_task`) or nodes from a third graph may result in invalid
            dataset type resolutions. It is safest to only use this argument
            when populating an empty graph for the first time.

        Raises
        ------
        PipelineDataCycleError
            Raised if the graph is cyclic after this addition.

        Notes
        -----
        Checks for dataset type consistency and multiple producers do not occur
        until `resolve` is called, since the resolution depends on both the
        state of the data repository and all contributing tasks.

        Adding new tasks removes any existing resolutions of all dataset types
        it references (unless ``parent is not None``) and marks the graph as
        unsorted. It is most efficient to add all tasks up front and only then
        resolve and/or sort the graph.
        """
        node_data: list[tuple[NodeKey, dict[str, Any]]] = []
        edge_data: list[tuple[NodeKey, NodeKey, str, dict[str, Any]]] = []
        for task_node in nodes:
            task_node = task_node._resolved(self._universe)
            node_data.append(
                (task_node.key, {"instance": task_node, "bipartite": task_node.key.node_type.bipartite})
            )
            node_data.append(
                (
                    task_node.init.key,
                    {"instance": task_node.init, "bipartite": task_node.init.key.node_type.bipartite},
                )
            )
            # Convert the edge objects attached to the task node to networkx.
            for read_edge in task_node.init.iter_all_inputs():
                self._append_graph_data_from_edge(node_data, edge_data, read_edge, parent=parent)
            for write_edge in task_node.init.iter_all_outputs():
                self._append_graph_data_from_edge(node_data, edge_data, write_edge, parent=parent)
            for read_edge in task_node.iter_all_inputs():
                self._append_graph_data_from_edge(node_data, edge_data, read_edge, parent=parent)
            for write_edge in task_node.iter_all_outputs():
                self._append_graph_data_from_edge(node_data, edge_data, write_edge, parent=parent)
            # Add a special edge (with no Edge instance) that connects the
            # TaskInitNode to the runtime TaskNode.
            edge_data.append((task_node.init.key, task_node.key, Edge.INIT_TO_TASK_NAME, {"instance": None}))
        if not node_data and not edge_data:
            return
        # Checks and preparation complete; time to start the actual
        # modification, during which it's hard to provide strong exception
        # safety. Start by resetting the sort ordering, if there is one.
        self._reset()
        try:
            self._xgraph.add_nodes_from(node_data)
            self._xgraph.add_edges_from(edge_data)
            if not networkx.algorithms.dag.is_directed_acyclic_graph(self._xgraph):
                cycle = networkx.find_cycle(self._xgraph)
                raise PipelineDataCycleError(f"Cycle detected while adding tasks: {cycle}.")
        except Exception:
            # First try to roll back our changes.
            try:
                self._xgraph.remove_edges_from(edge_data)
                self._xgraph.remove_nodes_from(key for key, _ in node_data)
            except Exception as err:  # pragma: no cover
                # There's no known way to get here, but we want to make it
                # clear it's a big problem if we do.
                raise PipelineGraphExceptionSafetyError(
                    "Error while attempting to revert PipelineGraph modification has left the graph in "
                    "an inconsistent state."
                ) from err
            # Successfully rolled back; raise the original exception.
            raise

    def reconfigure_tasks(
        self,
        *args: tuple[str, PipelineTaskConfig],
        check_edges_unchanged: bool = False,
        assume_edges_unchanged: bool = False,
        **kwargs: PipelineTaskConfig,
    ) -> None:
        """Update the configuration for one or more tasks.

        Parameters
        ----------
        *args : `tuple` [ `str`, `.PipelineTaskConfig` ]
            Positional arguments are each a 2-tuple of task label and new
            config object. Note that the same arguments may also be passed as
            ``**kwargs``, which is usually more readable, but task labels in
            ``*args`` are not required to be valid Python identifiers.
        check_edges_unchanged : `bool`, optional
            If `True`, require the edges (connections) of the modified tasks to
            remain unchanged after the configuration updates, and verify that
            this is the case.
        assume_edges_unchanged : `bool`, optional
            If `True`, the caller declares that the edges (connections) of the
            modified tasks will remain unchanged after the configuration
            updates, and that it is unnecessary to check this.
        **kwargs : `.PipelineTaskConfig`
            New config objects or overrides to apply to copies of the current
            config objects, with task labels as the keywords.

        Raises
        ------
        ValueError
            Raised if ``assume_edges_unchanged`` and ``check_edges_unchanged``
            are both `True`, or if the same task appears twice.
        EdgesChangedError
            Raised if ``check_edges_unchanged=True`` and the edges of a task do
            change.

        Notes
        -----
        If reconfiguring a task causes its edges to change, any dataset type
        nodes connected to that task (not just those whose edges have changed!)
        will be unresolved.
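
        Examples
        --------
        A sketch, assuming the task label ``"isr"`` and a compatible config
        object ``new_isr_config``::

            graph.reconfigure_tasks(isr=new_isr_config)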
821 """
822 new_configs: dict[str, PipelineTaskConfig] = {}
823 for task_label, config_update in itertools.chain(args, kwargs.items()):
824 if new_configs.setdefault(task_label, config_update) is not config_update:
825 raise ValueError(f"Config for {task_label!r} provided more than once.")
826 updates = {
827 task_label: self.tasks[task_label]._reconfigured(config, rebuild=not assume_edges_unchanged)
828 for task_label, config in new_configs.items()
829 }
830 self._replace_task_nodes(
831 updates,
832 check_edges_unchanged=check_edges_unchanged,
833 assume_edges_unchanged=assume_edges_unchanged,
834 message_header=(
835 "Unexpected change in edges for task {task_label!r} from original config (A) to "
836 "new configs (B):"
837 ),
838 )

    def remove_tasks(
        self, labels: Iterable[str], drop_from_subsets: bool = True
    ) -> list[tuple[TaskNode, set[str]]]:
        """Remove one or more tasks from the graph.

        Parameters
        ----------
        labels : `~collections.abc.Iterable` [ `str` ]
            Iterable of the labels of the tasks to remove.
        drop_from_subsets : `bool`, optional
            If `True`, drop each removed task from any subset in which it
            currently appears. If `False`, raise `PipelineGraphError` if any
            such subsets exist.

        Returns
        -------
        nodes_and_subsets : `list` [ `tuple` [ `TaskNode`, `set` [ `str` ] ] ]
            List of nodes removed and the labels of task subsets that
            referenced them.

        Raises
        ------
        PipelineGraphError
            Raised if ``drop_from_subsets`` is `False` and the task is still
            part of one or more subsets.

        Notes
        -----
        Removing a task will cause dataset nodes with no other referencing
        tasks to be removed. Any other dataset type nodes referenced by a
        removed task will be reset to an "unresolved" state.
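
        Examples
        --------
        A sketch; the task label ``"isr"`` is illustrative only::

            for task_node, subset_labels in graph.remove_tasks(["isr"]):
                print(f"removed {task_node.label} from {subset_labels}")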
871 """
872 task_nodes_and_subsets = []
873 dataset_types: set[NodeKey] = set()
874 nodes_to_remove = set()
875 for label in labels:
876 task_node: TaskNode = self._xgraph.nodes[NodeKey(NodeType.TASK, label)]["instance"]
877 # Find task subsets that reference this task.
878 referencing_subsets = {
879 subset_label
880 for subset_label, task_subset in self.task_subsets.items()
881 if label in task_subset
882 }
883 if not drop_from_subsets and referencing_subsets:
884 raise PipelineGraphError(
885 f"Task {label!r} is still referenced by subset(s) {referencing_subsets}."
886 )
887 task_nodes_and_subsets.append((task_node, referencing_subsets))
888 # Find dataset types referenced by this task.
889 dataset_types.update(self._xgraph.predecessors(task_node.key))
890 dataset_types.update(self._xgraph.successors(task_node.key))
891 dataset_types.update(self._xgraph.predecessors(task_node.init.key))
892 dataset_types.update(self._xgraph.successors(task_node.init.key))
893 # Since there's an edge between the task and its init node, we'll
894 # have added those two nodes here, too, and we don't want that.
895 dataset_types.remove(task_node.init.key)
896 dataset_types.remove(task_node.key)
897 # Mark the task node and its init node for removal from the graph.
898 nodes_to_remove.add(task_node.key)
899 nodes_to_remove.add(task_node.init.key)
900 # Process the referenced datasets to see which ones are orphaned and
901 # need to be removed vs. just unresolved.
902 nodes_to_unresolve = []
903 for dataset_type_key in dataset_types:
904 related_tasks = set()
905 related_tasks.update(self._xgraph.predecessors(dataset_type_key))
906 related_tasks.update(self._xgraph.successors(dataset_type_key))
907 related_tasks.difference_update(nodes_to_remove)
908 if not related_tasks:
909 nodes_to_remove.add(dataset_type_key)
910 else:
911 nodes_to_unresolve.append(dataset_type_key)
912 # Checks and preparation complete; time to start the actual
913 # modification, during which it's hard to provide strong exception
914 # safety. Start by resetting the sort ordering.
915 self._reset()
916 try:
917 for dataset_type_key in nodes_to_unresolve:
918 self._xgraph.nodes[dataset_type_key]["instance"] = None
919 for task_node, referencing_subsets in task_nodes_and_subsets:
920 for subset_label in referencing_subsets:
921 self._task_subsets[subset_label].remove(task_node.label)
922 self._xgraph.remove_nodes_from(nodes_to_remove)
923 except Exception as err: # pragma: no cover
924 # There's no known way to get here, but we want to make it
925 # clear it's a big problem if we do.
926 raise PipelineGraphExceptionSafetyError(
927 "Error during task removal has left the graph in an inconsistent state."
928 ) from err
929 return task_nodes_and_subsets

    def add_task_subset(self, subset_label: str, task_labels: Iterable[str], description: str = "") -> None:
        """Add a label for a set of tasks that are already in the pipeline.

        Parameters
        ----------
        subset_label : `str`
            Label for this set of tasks.
        task_labels : `~collections.abc.Iterable` [ `str` ]
            Labels of the tasks to include in the set. All must already be
            included in the graph.
        description : `str`, optional
            String description to associate with this label.
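
        Examples
        --------
        A sketch; the labels are illustrative only::

            graph.add_task_subset("step1", ["isr", "calibrate"], "First step.")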
943 """
944 subset = TaskSubset(self._xgraph, subset_label, set(task_labels), description)
945 self._task_subsets[subset_label] = subset
947 def remove_task_subset(self, subset_label: str) -> None:
948 """Remove a labeled set of tasks.
950 Parameters
951 ----------
952 subset_label : `str`
953 Label for this set of tasks.
954 """
955 del self._task_subsets[subset_label]

    ###########################################################################
    #
    # NetworkX Export Interface:
    #
    # - methods to export the PipelineGraph's content (or various subsets
    #   thereof) as NetworkX objects.
    #
    # These are particularly useful when writing tools to visualize the graph,
    # while providing options for which aspects of the graph (tasks, dataset
    # types, or both) to include, since all exported graphs have similar
    # attributes regardless of their structure.
    #
    ###########################################################################

    def make_xgraph(self) -> networkx.MultiDiGraph:
        """Export a networkx representation of the full pipeline graph,
        including both init and runtime edges.

        Returns
        -------
        xgraph : `networkx.MultiDiGraph`
            Directed acyclic graph with parallel edges.

        Notes
        -----
        The returned graph uses `NodeKey` instances for nodes. Parallel edges
        represent the same dataset type appearing in multiple connections for
        the same task, and are hence rare. The connection name is used as the
        edge key to disambiguate those parallel edges.

        Almost all edges connect dataset type nodes to task or task init nodes
        or vice versa, but there is also a special edge that connects each task
        init node to its runtime node. The existence of these edges makes the
        graph not quite bipartite, though its init-only and runtime-only
        subgraphs are bipartite.

        See `TaskNode`, `TaskInitNode`, `DatasetTypeNode`, `ReadEdge`, and
        `WriteEdge` for the descriptive node and edge attributes added.
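
        Examples
        --------
        A sketch of exporting the graph and walking it with networkx::

            xgraph = graph.make_xgraph()
            for key in networkx.topological_sort(xgraph):
                print(key.node_type, key.name)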
995 """
996 return self._transform_xgraph_state(self._xgraph.copy(), skip_edges=False)

    def make_bipartite_xgraph(self, init: bool = False) -> networkx.MultiDiGraph:
        """Return a bipartite networkx representation of just the runtime or
        init-time pipeline graph.

        Parameters
        ----------
        init : `bool`, optional
            If `True` (`False` is default) return the graph of task
            initialization nodes and init input/output dataset types, instead
            of the graph of runtime task nodes and regular
            input/output/prerequisite dataset types.

        Returns
        -------
        xgraph : `networkx.MultiDiGraph`
            Directed acyclic graph with parallel edges.

        Notes
        -----
        The returned graph uses `NodeKey` instances for nodes. Parallel edges
        represent the same dataset type appearing in multiple connections for
        the same task, and are hence rare. The connection name is used as the
        edge key to disambiguate those parallel edges.

        This graph is bipartite because each dataset type node only has edges
        that connect it to a task [init] node, and vice versa.

        See `TaskNode`, `TaskInitNode`, `DatasetTypeNode`, `ReadEdge`, and
        `WriteEdge` for the descriptive node and edge attributes added.
        """
        return self._transform_xgraph_state(
            self._make_bipartite_xgraph_internal(init).copy(), skip_edges=False
        )

    def make_task_xgraph(self, init: bool = False) -> networkx.DiGraph:
        """Return a networkx representation of just the tasks in the pipeline.

        Parameters
        ----------
        init : `bool`, optional
            If `True` (`False` is default) return the graph of task
            initialization nodes, instead of the graph of runtime task nodes.

        Returns
        -------
        xgraph : `networkx.DiGraph`
            Directed acyclic graph with no parallel edges.

        Notes
        -----
        The returned graph uses `NodeKey` instances for nodes. The dataset
        types that link these tasks are not represented at all; edges have no
        attributes, and there are no parallel edges.

        See `TaskNode` and `TaskInitNode` for the descriptive node attributes
        added.
        """
        bipartite_xgraph = self._make_bipartite_xgraph_internal(init)
        task_keys = [
            key
            for key, bipartite in bipartite_xgraph.nodes(data="bipartite")
            if bipartite == NodeType.TASK.bipartite
        ]
        return self._transform_xgraph_state(
            networkx.algorithms.bipartite.projected_graph(networkx.DiGraph(bipartite_xgraph), task_keys),
            skip_edges=True,
        )

    def make_dataset_type_xgraph(self, init: bool = False) -> networkx.DiGraph:
        """Return a networkx representation of just the dataset types in the
        pipeline.

        Parameters
        ----------
        init : `bool`, optional
            If `True` (`False` is default) return the graph of init input and
            output dataset types, instead of the graph of runtime (input,
            output, prerequisite input) dataset types.

        Returns
        -------
        xgraph : `networkx.DiGraph`
            Directed acyclic graph with no parallel edges.

        Notes
        -----
        The returned graph uses `NodeKey` instances for nodes. The tasks that
        link these dataset types are not represented at all; edges have no
        attributes, and there are no parallel edges.

        See `DatasetTypeNode` for the descriptive node attributes added.
        """
        bipartite_xgraph = self._make_bipartite_xgraph_internal(init)
        dataset_type_keys = [
            key
            for key, bipartite in bipartite_xgraph.nodes(data="bipartite")
            if bipartite == NodeType.DATASET_TYPE.bipartite
        ]
        return self._transform_xgraph_state(
            networkx.algorithms.bipartite.projected_graph(
                networkx.DiGraph(bipartite_xgraph), dataset_type_keys
            ),
            skip_edges=True,
        )

    ###########################################################################
    #
    # Serialization Interface.
    #
    # Serialization of PipelineGraphs is currently experimental and may not be
    # retained in the future. All serialization methods are
    # underscore-prefixed to ensure nobody mistakes them for a stable interface
    # (let alone a stable file format).
    #
    ###########################################################################

    @classmethod
    def _read_stream(
        cls, stream: BinaryIO, import_mode: TaskImportMode = TaskImportMode.REQUIRE_CONSISTENT_EDGES
    ) -> PipelineGraph:
        """Read a serialized `PipelineGraph` from a file-like object.

        Parameters
        ----------
        stream : `BinaryIO`
            File-like object opened for binary reading, containing
            gzip-compressed JSON.
        import_mode : `TaskImportMode`, optional
            Whether to import tasks, and how to reconcile any differences
            between the imported task's connections and those that were
            persisted with the graph. Default is to check that they are the
            same.

        Returns
        -------
        graph : `PipelineGraph`
            Deserialized pipeline graph.

        Raises
        ------
        PipelineGraphReadError
            Raised if the serialized `PipelineGraph` is not self-consistent.
        EdgesChangedError
            Raised if ``import_mode`` is
            `TaskImportMode.REQUIRE_CONSISTENT_EDGES` and the edges of a task
            did change after import and reconfiguration.

        Notes
        -----
        `PipelineGraph` serialization is currently experimental and may be
        removed or significantly changed in the future, with no deprecation
        period.
        """
        from .io import SerializedPipelineGraph

        with gzip.open(stream, "rb") as uncompressed_stream:
            data = json.load(uncompressed_stream)
        serialized_graph = SerializedPipelineGraph.model_validate(data)
        return serialized_graph.deserialize(import_mode)

    @classmethod
    def _read_uri(
        cls,
        uri: ResourcePathExpression,
        import_mode: TaskImportMode = TaskImportMode.REQUIRE_CONSISTENT_EDGES,
    ) -> PipelineGraph:
        """Read a serialized `PipelineGraph` from a file at a URI.

        Parameters
        ----------
        uri : convertible to `lsst.resources.ResourcePath`
            URI to a gzip-compressed JSON file containing a serialized pipeline
            graph.
        import_mode : `TaskImportMode`, optional
            Whether to import tasks, and how to reconcile any differences
            between the imported task's connections and those that were
            persisted with the graph. Default is to check that they are the
            same.

        Returns
        -------
        graph : `PipelineGraph`
            Deserialized pipeline graph.

        Raises
        ------
        PipelineGraphReadError
            Raised if the serialized `PipelineGraph` is not self-consistent.
        EdgesChangedError
            Raised if ``import_mode`` is
            `TaskImportMode.REQUIRE_CONSISTENT_EDGES` and the edges of a task
            did change after import and reconfiguration.

        Notes
        -----
        `PipelineGraph` serialization is currently experimental and may be
        removed or significantly changed in the future, with no deprecation
        period.
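
        Examples
        --------
        A sketch of a local round trip through these private methods (the
        path is illustrative only)::

            graph._write_uri("/tmp/pipeline_graph.json.gz")
            roundtripped = PipelineGraph._read_uri("/tmp/pipeline_graph.json.gz")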
1196 """
1197 uri = ResourcePath(uri)
1198 with uri.open("rb") as stream:
1199 return cls._read_stream(cast(BinaryIO, stream), import_mode=import_mode)

    def _write_stream(self, stream: BinaryIO) -> None:
        """Write the pipeline to a file-like object.

        Parameters
        ----------
        stream : `BinaryIO`
            File-like object opened for binary writing.

        Notes
        -----
        `PipelineGraph` serialization is currently experimental and may be
        removed or significantly changed in the future, with no deprecation
        period.

        The file format is gzipped JSON, and is intended to be human-readable,
        but it should not be considered a stable public interface for outside
        code, which should always use `PipelineGraph` methods (or at least the
        `io.SerializedPipelineGraph` class) to read these files.
        """
        from .io import SerializedPipelineGraph

        with gzip.open(stream, mode="wb") as compressed_stream:
            compressed_stream.write(
                SerializedPipelineGraph.serialize(self).model_dump_json(exclude_defaults=True).encode("utf-8")
            )

    def _write_uri(self, uri: ResourcePathExpression) -> None:
        """Write the pipeline to a file given a URI.

        Parameters
        ----------
        uri : convertible to `lsst.resources.ResourcePath`
            URI to write to. May have a ``.json.gz`` extension or no extension
            (in which case a ``.json.gz`` extension will be added).

        Notes
        -----
        `PipelineGraph` serialization is currently experimental and may be
        removed or significantly changed in the future, with no deprecation
        period.

        The file format is gzipped JSON, and is intended to be human-readable,
        but it should not be considered a stable public interface for outside
        code, which should always use `PipelineGraph` methods (or at least the
        `io.SerializedPipelineGraph` class) to read these files.
        """
        uri = ResourcePath(uri)
        extension = uri.getExtension()
        if not extension:
            uri = uri.updatedExtension(".json.gz")
        elif extension != ".json.gz":
            raise ValueError("Expanded pipeline files should always have a .json.gz extension.")
        with uri.open(mode="wb") as stream:
            self._write_stream(cast(BinaryIO, stream))

    def _import_and_configure(
        self, import_mode: TaskImportMode = TaskImportMode.REQUIRE_CONSISTENT_EDGES
    ) -> None:
        """Import the `PipelineTask` classes referenced by all task nodes and
        update those nodes accordingly.

        Parameters
        ----------
        import_mode : `TaskImportMode`, optional
            Whether to import tasks, and how to reconcile any differences
            between the imported task's connections and those that were
            persisted with the graph. Default is to check that they are the
            same. This method does nothing if this is
            `TaskImportMode.DO_NOT_IMPORT`.

        Raises
        ------
        EdgesChangedError
            Raised if ``import_mode`` is
            `TaskImportMode.REQUIRE_CONSISTENT_EDGES` and the edges of a task
            did change after import and reconfiguration.

        Notes
        -----
        This method shouldn't need to be called unless the graph was
        deserialized without importing and configuring immediately, which is
        not the default behavior (but it can greatly speed up deserialization).
        If all tasks have already been imported this does nothing.

        Importing and configuring a task can change its
        `~TaskNode.task_class_name` or `~TaskNode.get_config_str` output,
        usually because the software used to read a serialized graph is newer
        than the software used to write it (e.g. a new config option has been
        added, or the task was moved to a new module with a forwarding alias
        left behind). These changes are allowed by
        `TaskImportMode.REQUIRE_CONSISTENT_EDGES`.

        If importing and configuring a task causes its edges to change, any
        dataset type nodes linked to those edges will be reset to the
        unresolved state.
        """
        if import_mode is TaskImportMode.DO_NOT_IMPORT:
            return
        rebuild = (
            import_mode is TaskImportMode.REQUIRE_CONSISTENT_EDGES
            or import_mode is TaskImportMode.OVERRIDE_EDGES
        )
        updates: dict[str, TaskNode] = {}
        node_key: NodeKey
        for node_key, node_state in self._xgraph.nodes.items():
            if node_key.node_type is NodeType.TASK:
                task_node: TaskNode = node_state["instance"]
                new_task_node = task_node._imported_and_configured(rebuild)
                if new_task_node is not task_node:
                    updates[task_node.label] = new_task_node
        self._replace_task_nodes(
            updates,
            check_edges_unchanged=(import_mode is TaskImportMode.REQUIRE_CONSISTENT_EDGES),
            assume_edges_unchanged=(import_mode is TaskImportMode.ASSUME_CONSISTENT_EDGES),
            message_header=(
                "In task with label {task_label!r}, persisted edges (A) "
                "differ from imported and configured edges (B):"
            ),
        )

    ###########################################################################
    #
    # Advanced PipelineGraph Inspection Interface:
    #
    # - methods to iterate over all nodes and edges, utilizing NodeKeys;
    #
    # - methods to find overall inputs and group nodes by their dimensions,
    #   which are important operations for QuantumGraph generation.
    #
    ###########################################################################

    def iter_edges(self, init: bool = False) -> Iterator[Edge]:
        """Iterate over edges in the graph.

        Parameters
        ----------
        init : `bool`, optional
            If `True` (`False` is default) iterate over the edges between task
            initialization nodes and init input/output dataset types, instead
            of the runtime task nodes and regular input/output/prerequisite
            dataset types.

        Returns
        -------
        edges : `~collections.abc.Iterator` [ `Edge` ]
            A lazy iterator over `Edge` (`WriteEdge` or `ReadEdge`) instances.

        Notes
        -----
        This method always returns *either* init edges or runtime edges, never
        both. The full (internal) graph that contains both also includes a
        special edge that connects each task init node to its runtime node;
        that is also never returned by this method, since it is never a part of
        the init-only or runtime-only subgraphs.
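
        Examples
        --------
        A sketch that prints every runtime edge::

            for edge in graph.iter_edges():
                print(edge.task_label, edge.parent_dataset_type_name)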
1355 """
1356 edge: Edge
1357 for _, _, edge in self._xgraph.edges(data="instance"):
1358 if edge is not None and edge.is_init == init:
1359 yield edge

    def iter_nodes(
        self,
    ) -> Iterator[
        tuple[Literal[NodeType.TASK_INIT], str, TaskInitNode]
        | tuple[Literal[NodeType.TASK], str, TaskNode]
        | tuple[Literal[NodeType.DATASET_TYPE], str, DatasetTypeNode | None]
    ]:
        """Iterate over nodes in the graph.

        Returns
        -------
        nodes : `~collections.abc.Iterator` [ `tuple` ]
            A lazy iterator over all of the nodes in the graph. Each yielded
            element is a tuple of:

            - the node type enum value (`NodeType`);
            - the string name for the node (task label or parent dataset type
              name);
            - the node value (`TaskNode`, `TaskInitNode`, `DatasetTypeNode`,
              or `None` for dataset type nodes that have not been resolved).
        """
        key: NodeKey
        if self._sorted_keys is not None:
            for key in self._sorted_keys:
                yield key.node_type, key.name, self._xgraph.nodes[key]["instance"]  # type: ignore
        else:
            for key, node in self._xgraph.nodes(data="instance"):
                yield key.node_type, key.name, node  # type: ignore

    def iter_overall_inputs(self) -> Iterator[tuple[str, DatasetTypeNode | None]]:
        """Iterate over all of the dataset types that are consumed but not
        produced by the graph.

        Returns
        -------
        dataset_types : `~collections.abc.Iterator` [ `tuple` ]
            A lazy iterator over the overall-input dataset types (including
            overall init inputs and prerequisites). Each yielded element is a
            tuple of:

            - the parent dataset type name;
            - the resolved `DatasetTypeNode`, or `None` if the dataset type
              has not been resolved.
        """
        for generation in networkx.algorithms.dag.topological_generations(self._xgraph):
            key: NodeKey
            for key in generation:
                # While we expect all tasks to have at least one input and
                # hence never appear in the first topological generation, that
                # is not true of task init nodes.
                if key.node_type is NodeType.DATASET_TYPE:
                    yield key.name, self._xgraph.nodes[key]["instance"]
            return

    def group_by_dimensions(
        self, prerequisites: bool = False
    ) -> dict[DimensionGroup, tuple[dict[str, TaskNode], dict[str, DatasetTypeNode]]]:
        """Group this graph's tasks and dataset types by their dimensions.

        Parameters
        ----------
        prerequisites : `bool`, optional
            If `True`, include prerequisite dataset types as well as regular
            input and output datasets (including intermediates).

        Returns
        -------
        groups : `dict` [ `DimensionGroup`, `tuple` ]
            A dictionary of groups keyed by `DimensionGroup`, in which each
            value is a tuple of:

            - a `dict` of `TaskNode` instances, keyed by task label;
            - a `dict` of `DatasetTypeNode` instances, keyed by dataset type
              name;

            that have those dimensions.

        Notes
        -----
        Init inputs and outputs are always included, but always have empty
        dimensions and are hence all grouped together.
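
        Examples
        --------
        A sketch of iterating over the groups of a resolved graph::

            for dimensions, (tasks, dataset_types) in graph.group_by_dimensions().items():
                print(dimensions, sorted(tasks), sorted(dataset_types))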
1442 """
1443 result: dict[DimensionGroup, tuple[dict[str, TaskNode], dict[str, DatasetTypeNode]]] = {}
1444 next_new_value: tuple[dict[str, TaskNode], dict[str, DatasetTypeNode]] = ({}, {})
1445 for task_label, task_node in self.tasks.items():
1446 if task_node.dimensions is None:
1447 raise UnresolvedGraphError(f"Task with label {task_label!r} has not been resolved.")
1448 if (group := result.setdefault(task_node.dimensions, next_new_value)) is next_new_value:
1449 next_new_value = ({}, {}) # make new lists for next time
1450 group[0][task_node.label] = task_node
1451 for dataset_type_name, dataset_type_node in self.dataset_types.items():
1452 if dataset_type_node is None:
1453 raise UnresolvedGraphError(f"Dataset type {dataset_type_name!r} has not been resolved.")
1454 if not dataset_type_node.is_prerequisite or prerequisites:
1455 if (
1456 group := result.setdefault(
1457 dataset_type_node.dataset_type.dimensions.as_group(), next_new_value
1458 )
1459 ) is next_new_value:
1460 next_new_value = ({}, {}) # make new lists for next time
1461 group[1][dataset_type_node.name] = dataset_type_node
1462 return result

    def split_independent(self) -> Iterable[PipelineGraph]:
        """Iterate over independent subgraphs that together comprise this
        pipeline graph.

        Returns
        -------
        subgraphs : `Iterable` [ `PipelineGraph` ]
            An iterable over component subgraphs that could be run
            independently (they have only overall inputs in common). May be a
            lazy iterator.

        Notes
        -----
        All resolved dataset type nodes will be preserved.

        If there is only one component, ``self`` may be returned as the only
        element in the iterable.

        If `has_been_sorted`, all subgraphs will be sorted as well.
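
        Examples
        --------
        A sketch that reports the tasks in each independent subgraph::

            for subgraph in graph.split_independent():
                print(sorted(subgraph.tasks))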
1483 """
1484 # Having an overall input in common isn't enough to make subgraphs
1485 # dependent on each other, so we want to look for connected component
1486 # subgraphs of the task-only projected graph.
1487 bipartite_xgraph = self._make_bipartite_xgraph_internal(init=False)
1488 task_keys = {
1489 key
1490 for key, bipartite in bipartite_xgraph.nodes(data="bipartite")
1491 if bipartite == NodeType.TASK.bipartite
1492 }
1493 task_xgraph = networkx.algorithms.bipartite.projected_graph(
1494 networkx.DiGraph(bipartite_xgraph), task_keys
1495 )
1496 # "Weakly" connected means connected in only one direction, which is
1497 # the only kind of "connected" a DAG can ever be.
1498 for component_task_keys in networkx.algorithms.weakly_connected_components(task_xgraph):
1499 if component_task_keys == task_keys:
1500 yield self
1501 return
1502 else:
1503 component_subgraph = PipelineGraph(universe=self._universe)
1504 component_subgraph.add_task_nodes(
1505 [self._xgraph.nodes[key]["instance"] for key in component_task_keys], parent=self
1506 )
1507 if self.has_been_sorted:
1508 component_subgraph.sort()
1509 yield component_subgraph

    ###########################################################################
    #
    # Class- and Package-Private Methods.
    #
    ###########################################################################

    def _iter_task_defs(self) -> Iterator[TaskDef]:
        """Iterate over this pipeline as a sequence of `TaskDef` instances.

        Notes
        -----
        This is a package-private method intended to aid in the transition to a
        codebase more fully integrated with the `PipelineGraph` class, in which
        both `TaskDef` and `PipelineDatasetTypes` are expected to go away, and
        much of the functionality on the `Pipeline` class will be moved to
        `PipelineGraph` as well.

        Raises
        ------
        TaskNotImportedError
            Raised if `TaskNode.is_imported` is `False` for any task.
        """
        from ..pipeline import TaskDef

        for node in self._tasks.values():
            yield TaskDef(
                config=node.config,
                taskClass=node.task_class,
                label=node.label,
                connections=node.get_connections(),
            )
1543 def _init_from_args(
1544 self,
1545 xgraph: networkx.MultiDiGraph | None,
1546 sorted_keys: Sequence[NodeKey] | None,
1547 task_subsets: dict[str, TaskSubset] | None,
1548 description: str,
1549 universe: DimensionUniverse | None,
1550 data_id: DataId | None,
1551 ) -> None:
1552 """Initialize the graph with possibly-nontrivial arguments.
1554 Parameters
1555 ----------
1556 xgraph : `networkx.MultiDiGraph` or `None`
1557 The backing networkx graph, or `None` to create an empty one.
1558 This graph has `NodeKey` instances for nodes and the same structure
1559 as the graph exported by `make_xgraph`, but its nodes and edges
1560 have a single ``instance`` attribute that holds a `TaskNode`,
1561 `TaskInitNode`, `DatasetTypeNode` (or `None`), `ReadEdge`, or
1562 `WriteEdge` instance.
1563 sorted_keys : `Sequence` [ `NodeKey` ] or `None`
1564 Topologically sorted sequence of node keys, or `None` if the graph
1565 is not sorted.
1566 task_subsets : `dict` [ `str`, `TaskSubset` ] or `None`
1567 Labeled subsets of tasks. Values must be constructed with
1568 ``xgraph`` as their parent graph.
1569 description : `str`
1570 String description for this pipeline.
1571 universe : `lsst.daf.butler.DimensionUniverse` or `None`
1572 Definitions of all dimensions.
1573 data_id : `lsst.daf.butler.DataCoordinate` or other data ID mapping, or `None`
1574 Data ID that represents a constraint on all quanta generated from
1575 this pipeline.
1577 Notes
1578 -----
1579 Only empty `PipelineGraph` instances should be constructed directly by
1580 users, which is why ``__init__`` itself accepts only trivial arguments;
1581 methods on `PipelineGraph` and its helper classes still need to create
1582 instances with nontrivial state. Those methods can call this method
1583 after calling ``__new__`` manually, skipping ``__init__``.
1584 """
1585 self._xgraph = xgraph if xgraph is not None else networkx.MultiDiGraph()
1586 self._sorted_keys: Sequence[NodeKey] | None = None
1587 self._task_subsets = task_subsets if task_subsets is not None else {}
1588 self._description = description
1589 self._tasks = TaskMappingView(self._xgraph)
1590 self._dataset_types = DatasetTypeMappingView(self._xgraph)
1591 self._raw_data_id: dict[str, Any]
1592 if isinstance(data_id, DataCoordinate):
1593 if universe is None:
1594 universe = data_id.universe
1595 else:
1596 assert universe is data_id.universe, "data_id.universe and given universe differ"
1597 self._raw_data_id = dict(data_id.required)
1598 elif data_id is None:
1599 self._raw_data_id = {}
1600 else:
1601 self._raw_data_id = dict(data_id)
1602 self._universe = universe
1603 if sorted_keys is not None:
1604 self._reorder(sorted_keys)
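# A hedged sketch of the construction pattern the Notes above describe,
# assuming a prebuilt ``my_xgraph`` and ``my_universe`` (hypothetical
# names; this is not a public API):
#
#     graph = PipelineGraph.__new__(PipelineGraph)  # skip __init__
#     graph._init_from_args(
#         xgraph=my_xgraph,
#         sorted_keys=None,
#         task_subsets=None,
#         description="rebuilt from existing state",
#         universe=my_universe,
#         data_id=None,
#     )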
1606 def _make_bipartite_xgraph_internal(self, init: bool) -> networkx.MultiDiGraph:
1607 """Make a bipartite init-only or runtime-only internal subgraph.
1609 See `make_bipartite_xgraph` for parameters and return values.
1611 Notes
1612 -----
1613 This method returns a view of the `PipelineGraph` object's internal
1614 backing graph, and hence should only be called in methods that copy the
1615 result either explicitly or by running a copying algorithm before
1616 returning it to the user.
1617 """
1618 return self._xgraph.edge_subgraph([edge.key for edge in self.iter_edges(init)])
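# ``edge_subgraph`` returns a live, read-only *view* backed by the parent
# graph, which is why callers must copy before exporting. A toy
# illustration (not part of this module):
#
#     import networkx
#
#     g = networkx.MultiDiGraph()
#     g.add_edge("a", "b", key="x")
#     view = g.edge_subgraph([("a", "b", "x")])  # still tied to ``g``
#     detached = view.copy()  # safe to hand to callers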
1620 def _transform_xgraph_state(self, xgraph: _G, skip_edges: bool) -> _G:
1621 """Transform networkx graph attributes in-place from the internal
1622 "instance" attributes to the documented exported attributes.
1624 Parameters
1625 ----------
1626 xgraph : `networkx.DiGraph` or `networkx.MultiDiGraph`
1627 Graph whose state should be transformed.
1628 skip_edges : `bool`
1629 If `True`, do not transform edge state.
1631 Returns
1632 -------
1633 xgraph : `networkx.DiGraph` or `networkx.MultiDiGraph`
1634 The same object passed in, after modification.
1636 Notes
1637 -----
1638 This should be called after making a copy of the internal graph but
1639 before any projection down to just task or dataset type nodes, since
1640 it assumes stateful edges.
1641 """
1642 state: dict[str, Any]
1643 for state in xgraph.nodes.values():
1644 node_value: TaskInitNode | TaskNode | DatasetTypeNode | None = state.pop("instance")
1645 if node_value is not None:
1646 state.update(node_value._to_xgraph_state())
1647 else:
1648 # This is a dataset type node that is not resolved.
1649 state["bipartite"] = NodeType.DATASET_TYPE.bipartite
1650 if not skip_edges:
1651 for _, _, state in xgraph.edges(data=True):
1652 edge: Edge | None = state.pop("instance", None)
1653 if edge is not None:
1654 state.update(edge._to_xgraph_state())
1655 return xgraph
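# The loops above mutate each attribute ``dict`` in place: ``pop`` removes
# the private "instance" entry and ``update`` splices in the exported
# attributes. A toy sketch of the same in-place transformation:
#
#     import networkx
#
#     g = networkx.DiGraph()
#     g.add_node("n", instance={"exported": 1})
#     for state in g.nodes.values():
#         state.update(state.pop("instance"))
#     assert g.nodes["n"] == {"exported": 1}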
1657 def _replace_task_nodes(
1658 self,
1659 updates: Mapping[str, TaskNode],
1660 check_edges_unchanged: bool,
1661 assume_edges_unchanged: bool,
1662 message_header: str,
1663 ) -> None:
1664 """Replace task nodes and update edges and dataset type nodes
1665 accordingly.
1667 Parameters
1668 ----------
1669 updates : `Mapping` [ `str`, `TaskNode` ]
1670 New task nodes with task label keys. All keys must be task labels
1671 that are already present in the graph.
1672 check_edges_unchanged : `bool`, optional
1673 If `True`, require the edges (connections) of the modified tasks to
1674 remain unchanged after importing and configuring each task, and
1675 verify that this is the case.
1676 assume_edges_unchanged : `bool`, optional
1677 If `True`, the caller declares that the edges (connections) of the
1678 modified tasks will remain unchanged after importing and configuring
1679 each task, and that it is unnecessary to check this.
1680 message_header : `str`
1681 Template for `str.format` with a single ``task_label`` placeholder
1682 to use as the first line in `EdgesChangedError` messages that show
1683 the differences between new task edges and old task edges. Should
1684 include the fact that the rest of the message will refer to the old
1685 task as "A" and the new task as "B", and end with a colon.
1687 Raises
1688 ------
1689 ValueError
1690 Raised if ``assume_edges_unchanged`` and ``check_edges_unchanged``
1691 are both `True`, or if a full config is provided for a task after
1692 another full config or an override has already been provided.
1693 EdgesChangedError
1694 Raised if ``check_edges_unchanged=True`` and the edges of a task do
1695 change.
1696 """
1697 deep: dict[str, TaskNode] = {}
1698 shallow: dict[str, TaskNode] = {}
1699 if assume_edges_unchanged:
1700 if check_edges_unchanged:
1701 raise ValueError("Cannot simultaneously assume and check that edges have not changed.")
1702 shallow.update(updates)
1703 else:
1704 for task_label, new_task_node in updates.items():
1705 old_task_node = self.tasks[task_label]
1706 messages = old_task_node.diff_edges(new_task_node)
1707 if messages:
1708 if check_edges_unchanged:
1709 messages.insert(0, message_header.format(task_label=task_label))
1710 raise EdgesChangedError("\n".join(messages))
1711 else:
1712 deep[task_label] = new_task_node
1713 else:
1714 shallow[task_label] = new_task_node
1715 try:
1716 if deep:
1717 removed = self.remove_tasks(deep.keys(), drop_from_subsets=True)
1718 self.add_task_nodes(deep.values())
1719 for replaced_task_node, referencing_subsets in removed:
1720 for subset_label in referencing_subsets:
1721 self._task_subsets[subset_label].add(replaced_task_node.label)
1722 for task_node in shallow.values():
1723 self._xgraph.nodes[task_node.key]["instance"] = task_node
1724 self._xgraph.nodes[task_node.init.key]["instance"] = task_node.init
1725 except PipelineGraphExceptionSafetyError: # pragma: no cover
1726 raise
1727 except Exception as err: # pragma: no cover
1728 # There's no known way to get here, but we want to make it clear
1729 # it's a big problem if we do.
1730 raise PipelineGraphExceptionSafetyError(
1731 "Error while replacing tasks has left the graph in an inconsistent state."
1732 ) from err
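# The except clauses above follow an exception-safety convention: known-safe
# errors propagate as-is, while anything unexpected raised mid-mutation is
# wrapped so callers know the graph may now be inconsistent. A sketch of the
# pattern with hypothetical names:
#
#     try:
#         apply_multi_step_mutation()  # hypothetical
#     except KnownSafeError:  # hypothetical; state is guaranteed intact
#         raise
#     except Exception as err:
#         raise InconsistentStateError("graph may be corrupted") from err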
1734 def _append_graph_data_from_edge(
1735 self,
1736 node_data: list[tuple[NodeKey, dict[str, Any]]],
1737 edge_data: list[tuple[NodeKey, NodeKey, str, dict[str, Any]]],
1738 edge: Edge,
1739 parent: PipelineGraph | None,
1740 ) -> None:
1741 """Append networkx state dictionaries for an edge and the corresponding
1742 dataset type node.
1744 Parameters
1745 ----------
1746 node_data : `list`
1747 List of node keys and state dictionaries. A node is appended if
1748 one does not already exist for this dataset type.
1749 edge_data : `list`
1750 List of node key pairs, connection names, and state dictionaries
1751 for edges.
1752 edge : `Edge`
1753 New edge being processed.
1754 parent : `PipelineGraph` or `None`
1755 Another pipeline graph whose dataset type nodes should be used
1756 when present.
1757 """
1758 new_dataset_type_node = None
1759 if parent is not None:
1760 new_dataset_type_node = parent._xgraph.nodes[edge.dataset_type_key].get("instance")
1761 if (existing_dataset_type_state := self._xgraph.nodes.get(edge.dataset_type_key)) is not None:
1762 existing_dataset_type_state["instance"] = new_dataset_type_node
1763 else:
1764 node_data.append(
1765 (
1766 edge.dataset_type_key,
1767 {
1768 "instance": new_dataset_type_node,
1769 "bipartite": NodeType.DATASET_TYPE.bipartite,
1770 },
1771 )
1772 )
1773 edge_data.append(
1774 edge.nodes
1775 + (
1776 edge.connection_name,
1777 {"instance": edge},
1778 )
1779 )
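# The 4-tuples appended to ``edge_data`` match the form that
# ``MultiDiGraph.add_edges_from`` accepts, and the ``node_data`` pairs suit
# ``add_nodes_from``. A toy sketch (hypothetical names, not part of this
# module):
#
#     import networkx
#
#     g = networkx.MultiDiGraph()
#     g.add_nodes_from([("task_a", {"bipartite": 0}), ("dst_d", {"bipartite": 1})])
#     g.add_edges_from([("task_a", "dst_d", "connection_name", {"instance": None})])
#     assert g.has_edge("task_a", "dst_d", key="connection_name")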
1781 def _reorder(self, sorted_keys: Sequence[NodeKey]) -> None:
1782 """Set the order of all views of this graph from the given sorted
1783 sequence of task labels and dataset type names.
1784 """
1785 self._sorted_keys = sorted_keys
1786 self._tasks._reorder(sorted_keys)
1787 self._dataset_types._reorder(sorted_keys)
1789 def _reset(self) -> None:
1790 """Reset the all views of this graph following a modification that
1791 might invalidate them.
1792 """
1793 self._sorted_keys = None
1794 self._tasks._reset()
1795 self._dataset_types._reset()
1797 _xgraph: networkx.MultiDiGraph
1798 _sorted_keys: Sequence[NodeKey] | None
1799 _task_subsets: dict[str, TaskSubset]
1800 _description: str
1801 _tasks: TaskMappingView
1802 _dataset_types: DatasetTypeMappingView
1803 _raw_data_id: dict[str, Any]
1804 _universe: DimensionUniverse | None