Coverage for python/lsst/pipe/base/pipeline_graph/_pipeline_graph.py: 20% of 377 statements (coverage.py v7.4.4, created at 2024-04-06 04:05 -0700)
# This file is part of pipe_base.
#
# Developed for the LSST Data Management System.
# This product includes software developed by the LSST Project
# (http://www.lsst.org).
# See the COPYRIGHT file at the top-level directory of this distribution
# for details of code ownership.
#
# This software is dual licensed under the GNU General Public License and also
# under a 3-clause BSD license. Recipients may choose which of these licenses
# to use; please see the files gpl-3.0.txt and/or bsd_license.txt,
# respectively. If you choose the GPL option then the following text applies
# (but note that there is still no warranty even if you opt for BSD instead):
#
# This program is free software: you can redistribute it and/or modify
# it under the terms of the GNU General Public License as published by
# the Free Software Foundation, either version 3 of the License, or
# (at your option) any later version.
#
# This program is distributed in the hope that it will be useful,
# but WITHOUT ANY WARRANTY; without even the implied warranty of
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
# GNU General Public License for more details.
#
# You should have received a copy of the GNU General Public License
# along with this program. If not, see <http://www.gnu.org/licenses/>.
from __future__ import annotations

__all__ = ("PipelineGraph",)

import gzip
import itertools
import json
from collections.abc import Iterable, Iterator, Mapping, Sequence
from typing import TYPE_CHECKING, Any, BinaryIO, Literal, TypeVar, cast

import networkx
import networkx.algorithms.bipartite
import networkx.algorithms.dag
from lsst.daf.butler import DataCoordinate, DataId, DimensionGroup, DimensionUniverse, Registry
from lsst.resources import ResourcePath, ResourcePathExpression

from ._dataset_types import DatasetTypeNode
from ._edges import Edge, ReadEdge, WriteEdge
from ._exceptions import (
    DuplicateOutputError,
    EdgesChangedError,
    PipelineDataCycleError,
    PipelineGraphError,
    PipelineGraphExceptionSafetyError,
    UnresolvedGraphError,
)
from ._mapping_views import DatasetTypeMappingView, TaskMappingView
from ._nodes import NodeKey, NodeType
from ._task_subsets import TaskSubset
from ._tasks import TaskImportMode, TaskInitNode, TaskNode, _TaskNodeImportedData

if TYPE_CHECKING:
    from ..config import PipelineTaskConfig
    from ..connections import PipelineTaskConnections
    from ..pipeline import TaskDef
    from ..pipelineTask import PipelineTask

_G = TypeVar("_G", bound=networkx.DiGraph | networkx.MultiDiGraph)


class PipelineGraph:
    """A graph representation of a fully-configured pipeline.

    `PipelineGraph` instances are typically constructed by calling
    `.Pipeline.to_graph`, but in rare cases constructing and then populating an
    empty one may be preferable.

    Parameters
    ----------
    description : `str`, optional
        String description for this pipeline.
    universe : `lsst.daf.butler.DimensionUniverse`, optional
        Definitions for all butler dimensions. If not provided, some
        attributes will not be available until `resolve` is called.
    data_id : `lsst.daf.butler.DataCoordinate` or other data ID, optional
        Data ID that represents a constraint on all quanta generated by this
        pipeline. This typically just holds the instrument constraint included
        in the pipeline definition, if there was one.
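
    For example, a minimal sketch of the typical construction path (assuming
    ``pipeline`` is an already-loaded `.Pipeline` instance)::

        graph = pipeline.to_graph()
        for label, task_node in graph.tasks.items():
            print(label, task_node.task_class_name)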
86 """
88 ###########################################################################
89 #
90 # Simple Pipeline Graph Inspection Interface:
91 #
92 # - for inspecting graph structure, not modifying it (except to sort and]
93 # resolve);
94 #
95 # - no NodeKey objects, just string dataset type name and task label keys;
96 #
97 # - graph structure is represented as a pair of mappings, with methods to
98 # find neighbors and edges of nodes.
99 #
100 ###########################################################################

    def __init__(
        self,
        *,
        description: str = "",
        universe: DimensionUniverse | None = None,
        data_id: DataId | None = None,
    ) -> None:
        self._init_from_args(
            xgraph=None,
            sorted_keys=None,
            task_subsets=None,
            description=description,
            universe=universe,
            data_id=data_id,
        )

    def __repr__(self) -> str:
        return f"{type(self).__name__}({self.description!r}, tasks={self.tasks!s})"

    @property
    def description(self) -> str:
        """String description for this pipeline."""
        return self._description

    @description.setter
    def description(self, value: str) -> None:
        # Docstring in getter.
        self._description = value

    @property
    def universe(self) -> DimensionUniverse | None:
        """Definitions for all butler dimensions."""
        return self._universe

    @property
    def data_id(self) -> DataCoordinate:
        """Data ID that represents a constraint on all quanta generated from
        this pipeline.

        This may not be available unless `universe` is not `None`.
        """
        return DataCoordinate.standardize(self._raw_data_id, universe=self.universe)

    @property
    def tasks(self) -> TaskMappingView:
        """A mapping view of the tasks in the graph.

        This mapping has `str` task label keys and `TaskNode` values. Iteration
        is topologically and deterministically ordered if and only if `sort`
        has been called since the last modification to the graph.
        """
        return self._tasks

    @property
    def dataset_types(self) -> DatasetTypeMappingView:
        """A mapping view of the dataset types in the graph.

        This mapping has `str` parent dataset type name keys, but only provides
        access to its `DatasetTypeNode` values if `resolve` has been called
        since the last modification involving a task that uses a dataset type.
        See `DatasetTypeMappingView` for details.
        """
        return self._dataset_types

    @property
    def task_subsets(self) -> Mapping[str, TaskSubset]:
        """A mapping of all labeled subsets of tasks.

        Keys are subset labels, values are sets of task labels. See
        `TaskSubset` for more information.

        Use `add_task_subset` to add a new subset. The subsets themselves may
        be modified in-place.
        """
        return self._task_subsets

    @property
    def is_fully_resolved(self) -> bool:
        """Whether all of this graph's nodes are resolved."""
        return self._universe is not None and all(
            self.dataset_types.is_resolved(k) for k in self.dataset_types
        )

    @property
    def is_sorted(self) -> bool:
        """Whether this graph's tasks and dataset types are topologically
        sorted with the exact same deterministic tiebreakers that `sort` would
        apply.

        This may perform (and then discard) a full sort if `has_been_sorted` is
        `False`. If the goal is to obtain a sorted graph, it is better to just
        call `sort` without guarding that with an ``if not graph.is_sorted``
        check.
        """
        if self._sorted_keys is not None:
            return True
        return all(
            sorted == unsorted
            for sorted, unsorted in zip(
                networkx.lexicographical_topological_sort(self._xgraph), self._xgraph, strict=True
            )
        )

    @property
    def has_been_sorted(self) -> bool:
        """Whether this graph's tasks and dataset types have been
        topologically sorted (with unspecified but deterministic tiebreakers)
        since the last modification to the graph.

        This may return `False` if the graph *happens* to be sorted but `sort`
        was never called, but it is potentially much faster than `is_sorted`,
        which may attempt (and then discard) a full sort if `has_been_sorted`
        is `False`.
        """
        return self._sorted_keys is not None

    def sort(self) -> None:
        """Sort this graph's nodes topologically with deterministic (but
        unspecified) tiebreakers.

        This does nothing if the graph is already known to be sorted.
        """
        if self._sorted_keys is None:
            try:
                sorted_keys: Sequence[NodeKey] = list(networkx.lexicographical_topological_sort(self._xgraph))
            except networkx.NetworkXUnfeasible as err:  # pragma: no cover
                # Shouldn't be possible to get here, because we check for
                # cycles when adding tasks, but we guard against it anyway.
                cycle = networkx.find_cycle(self._xgraph)
                raise PipelineDataCycleError(
                    f"Cycle detected while attempting to sort graph: {cycle}."
                ) from err
            self._reorder(sorted_keys)

    def copy(self) -> PipelineGraph:
        """Return a copy of this graph that copies all mutable state."""
        xgraph = self._xgraph.copy()
        result = PipelineGraph.__new__(PipelineGraph)
        result._init_from_args(
            xgraph,
            self._sorted_keys,
            task_subsets={
                k: TaskSubset(xgraph, v.label, set(v._members), v.description)
                for k, v in self._task_subsets.items()
            },
            description=self._description,
            universe=self.universe,
            data_id=self._raw_data_id,
        )
        return result

    def __copy__(self) -> PipelineGraph:
        # Fully shallow copies are dangerous; we don't want shared mutable
        # state to lead to broken class invariants.
        return self.copy()

    def __deepcopy__(self, memo: dict) -> PipelineGraph:
        # Genuine deep copies are unnecessary, since we should only ever care
        # that mutable state is copied.
        return self.copy()

    def producing_edge_of(self, dataset_type_name: str) -> WriteEdge | None:
        """Return the `WriteEdge` that links the producing task to the named
        dataset type.

        Parameters
        ----------
        dataset_type_name : `str`
            Dataset type name. Must not be a component.

        Returns
        -------
        edge : `WriteEdge` or `None`
            Producing edge or `None` if there isn't one in this graph.

        Raises
        ------
        DuplicateOutputError
            Raised if there are multiple tasks defined to produce this dataset
            type. This is only possible if the graph's dataset types are not
            resolved.

        Notes
        -----
        On resolved graphs, it may be slightly more efficient to use::

            graph.dataset_types[dataset_type_name].producing_edge

        but this method works on graphs with unresolved dataset types as well.
        """
        producer: str | None = None
        producing_edge: WriteEdge | None = None
        for _, _, producing_edge in self._xgraph.in_edges(
            NodeKey(NodeType.DATASET_TYPE, dataset_type_name), data="instance"
        ):
            assert producing_edge is not None, "Should only be None if we never loop."
            if producer is not None:
                raise DuplicateOutputError(
                    f"Dataset type {dataset_type_name!r} is produced by both {producing_edge.task_label!r} "
                    f"and {producer!r}."
                )
            # Remember this producer so a second iteration can raise above.
            producer = producing_edge.task_label
        return producing_edge

    def consuming_edges_of(self, dataset_type_name: str) -> list[ReadEdge]:
        """Return the `ReadEdge` objects that link the named dataset type to
        the tasks that consume it.

        Parameters
        ----------
        dataset_type_name : `str`
            Dataset type name. Must not be a component.

        Returns
        -------
        edges : `list` [ `ReadEdge` ]
            Edges that connect this dataset type to the tasks that consume it.

        Notes
        -----
        On resolved graphs, it may be slightly more efficient to use::

            graph.dataset_types[dataset_type_name].consuming_edges

        but this method works on graphs with unresolved dataset types as well.
        """
        return [
            edge
            for _, _, edge in self._xgraph.out_edges(
                NodeKey(NodeType.DATASET_TYPE, dataset_type_name), data="instance"
            )
        ]

    def producer_of(self, dataset_type_name: str) -> TaskNode | TaskInitNode | None:
        """Return the `TaskNode` or `TaskInitNode` that writes the given
        dataset type.

        Parameters
        ----------
        dataset_type_name : `str`
            Dataset type name. Must not be a component.

        Returns
        -------
        node : `TaskNode`, `TaskInitNode`, or `None`
            Producing node or `None` if there isn't one in this graph.

        Raises
        ------
        DuplicateOutputError
            Raised if there are multiple tasks defined to produce this dataset
            type. This is only possible if the graph's dataset types are not
            resolved.
        """
        if (producing_edge := self.producing_edge_of(dataset_type_name)) is not None:
            return self._xgraph.nodes[producing_edge.task_key]["instance"]
        return None

    def consumers_of(self, dataset_type_name: str) -> list[TaskNode | TaskInitNode]:
        """Return the `TaskNode` and/or `TaskInitNode` objects that read
        the given dataset type.

        Parameters
        ----------
        dataset_type_name : `str`
            Dataset type name. Must not be a component.

        Returns
        -------
        tasks : `list` [ `TaskNode` or `TaskInitNode` ]
            Nodes for the tasks that consume the given dataset type.

        Notes
        -----
        On resolved graphs, it may be slightly more efficient to use::

            graph.dataset_types[dataset_type_name].consuming_edges

        but this method works on graphs with unresolved dataset types as well.
        """
        return [
            self._xgraph.nodes[consuming_edge.task_key]["instance"]
            for consuming_edge in self.consuming_edges_of(dataset_type_name)
        ]

    def inputs_of(self, task_label: str, init: bool = False) -> dict[str, DatasetTypeNode | None]:
        """Return the dataset types that are inputs to a task.

        Parameters
        ----------
        task_label : `str`
            Label for the task in the pipeline.
        init : `bool`, optional
            If `True`, return init-input dataset types instead of runtime
            (including prerequisite) inputs.

        Returns
        -------
        inputs : `dict` [ `str`, `DatasetTypeNode` or `None` ]
            Dictionary with parent dataset type name keys and either
            `DatasetTypeNode` values (if the dataset type has been resolved)
            or `None` values.

        Notes
        -----
        To get the input edges of a task or task init node (which provide
        information about storage class overrides and components) use::

            graph.tasks[task_label].iter_all_inputs()

        or

            graph.tasks[task_label].init.iter_all_inputs()

        or the various mapping attributes of the `TaskNode` and `TaskInitNode`
        class.
        """
        node: TaskNode | TaskInitNode = self.tasks[task_label] if not init else self.tasks[task_label].init
        return {
            edge.parent_dataset_type_name: self._xgraph.nodes[edge.dataset_type_key]["instance"]
            for edge in node.iter_all_inputs()
        }

    def outputs_of(
        self, task_label: str, init: bool = False, include_automatic_connections: bool = True
    ) -> dict[str, DatasetTypeNode | None]:
        """Return the dataset types that are outputs of a task.

        Parameters
        ----------
        task_label : `str`
            Label for the task in the pipeline.
        init : `bool`, optional
            If `True`, return init-output dataset types instead of runtime
            outputs.
        include_automatic_connections : `bool`, optional
            Whether to include automatic connections such as configs, metadata,
            and logs.

        Returns
        -------
        outputs : `dict` [ `str`, `DatasetTypeNode` or `None` ]
            Dictionary with parent dataset type name keys and either
            `DatasetTypeNode` values (if the dataset type has been resolved)
            or `None` values.

        Notes
        -----
        To get the output edges of a task or task init node (which provide
        information about storage class overrides and components) use::

            graph.tasks[task_label].iter_all_outputs()

        or

            graph.tasks[task_label].init.iter_all_outputs()

        or the various mapping attributes of the `TaskNode` and `TaskInitNode`
        class.
        """
        node: TaskNode | TaskInitNode = self.tasks[task_label] if not init else self.tasks[task_label].init
        iterable = node.iter_all_outputs() if include_automatic_connections else node.outputs.values()
        return {
            edge.parent_dataset_type_name: self._xgraph.nodes[edge.dataset_type_key]["instance"]
            for edge in iterable
        }

    def resolve(self, registry: Registry) -> None:
        """Resolve all dimensions and dataset types and check them for
        consistency.

        Resolving a graph also causes it to be sorted.

        Parameters
        ----------
        registry : `lsst.daf.butler.Registry`
            Client for the data repository to resolve against.

        Notes
        -----
        The `universe` attribute is set to ``registry.dimensions`` and used to
        set all `TaskNode.dimensions` attributes. Dataset type nodes are
        resolved by first looking for a registry definition, then using the
        producing task's definition, then looking for consistency between all
        consuming task definitions.

        Raises
        ------
        ConnectionTypeConsistencyError
            Raised if a prerequisite input for one task appears as a different
            kind of connection in any other task.
        DuplicateOutputError
            Raised if multiple tasks have the same dataset type as an output.
        IncompatibleDatasetTypeError
            Raised if different tasks have different definitions of a dataset
            type. Different but compatible storage classes are permitted.
        MissingDatasetTypeError
            Raised if a dataset type definition is required to exist in the
            data repository but none was found. This should only occur for
            dataset types that are not produced by a task in the pipeline and
            are consumed with different storage classes or as components by
            tasks in the pipeline.
        EdgesChangedError
            Raised if ``check_edges_unchanged=True`` and the edges of a task do
            change after import and reconfiguration.
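
        For example, a minimal sketch of resolving against a data repository
        (assuming ``butler`` is an existing `lsst.daf.butler.Butler` client)::

            graph.resolve(butler.registry)
            assert graph.is_fully_resolved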
506 """
507 node_key: NodeKey
508 updates: dict[NodeKey, TaskNode | DatasetTypeNode] = {}
509 for node_key, node_state in self._xgraph.nodes.items():
510 match node_key.node_type:
511 case NodeType.TASK:
512 task_node: TaskNode = node_state["instance"]
513 new_task_node = task_node._resolved(registry.dimensions)
514 if new_task_node is not task_node:
515 updates[node_key] = new_task_node
516 case NodeType.DATASET_TYPE:
517 dataset_type_node: DatasetTypeNode | None = node_state["instance"]
518 new_dataset_type_node = DatasetTypeNode._from_edges(
519 node_key, self._xgraph, registry, previous=dataset_type_node
520 )
521 # Usage of `is`` here is intentional; `_from_edges` returns
522 # `previous=dataset_type_node` if it can determine that it
523 # doesn't need to change.
524 if new_dataset_type_node is not dataset_type_node:
525 updates[node_key] = new_dataset_type_node
526 try:
527 for node_key, node_value in updates.items():
528 self._xgraph.nodes[node_key]["instance"] = node_value
529 except Exception as err: # pragma: no cover
530 # There's no known way to get here, but we want to make it
531 # clear it's a big problem if we do.
532 raise PipelineGraphExceptionSafetyError(
533 "Error during dataset type resolution has left the graph in an inconsistent state."
534 ) from err
535 self.sort()
536 self._universe = registry.dimensions

    ###########################################################################
    #
    # Graph Modification Interface:
    #
    # - methods to add, remove, and replace tasks;
    #
    # - methods to add and remove task subsets.
    #
    # These are all things that are usually done in a Pipeline before making a
    # graph at all, but there may be cases where we want to modify the graph
    # instead. (These are also the methods used to make a graph from a
    # Pipeline, or make a graph from another graph.)
    #
    ###########################################################################

    def add_task(
        self,
        label: str,
        task_class: type[PipelineTask],
        config: PipelineTaskConfig,
        connections: PipelineTaskConnections | None = None,
    ) -> TaskNode:
        """Add a new task to the graph.

        Parameters
        ----------
        label : `str`
            Label for the task in the pipeline.
        task_class : `type` [ `PipelineTask` ]
            Class object for the task.
        config : `PipelineTaskConfig`
            Configuration for the task.
        connections : `PipelineTaskConnections`, optional
            Object that describes the dataset types used by the task. If not
            provided, one will be constructed from the given configuration. If
            provided, it is assumed that ``config`` has already been validated
            and frozen.

        Returns
        -------
        node : `TaskNode`
            The new task node added to the graph.

        Raises
        ------
        ValueError
            Raised if configuration validation failed when constructing
            ``connections``.
        PipelineDataCycleError
            Raised if the graph is cyclic after this addition.
        RuntimeError
            Raised if an unexpected exception (which will be chained) occurred
            at a stage that may have left the graph in an inconsistent state.
            Other exceptions should leave the graph unchanged.

        Notes
        -----
        Checks for dataset type consistency and multiple producers do not occur
        until `resolve` is called, since the resolution depends on both the
        state of the data repository and all contributing tasks.

        Adding new tasks removes any existing resolutions of all dataset types
        it references and marks the graph as unsorted. It is most efficient
        to add all tasks up front and only then resolve and/or sort the graph.
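
        For example, a minimal sketch with a hypothetical task class (not part
        of this package)::

            graph = PipelineGraph()
            graph.add_task("my_task", MyTask, MyTask.ConfigClass())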
602 """
603 task_node = TaskNode._from_imported_data(
604 key=NodeKey(NodeType.TASK, label),
605 init_key=NodeKey(NodeType.TASK_INIT, label),
606 data=_TaskNodeImportedData.configure(label, task_class, config, connections),
607 universe=self.universe,
608 )
609 self.add_task_nodes([task_node])
610 return task_node

    def add_task_nodes(self, nodes: Iterable[TaskNode], parent: PipelineGraph | None = None) -> None:
        """Add one or more existing task nodes to the graph.

        Parameters
        ----------
        nodes : `~collections.abc.Iterable` [ `TaskNode` ]
            Iterable of task nodes to add. If any tasks have resolved
            dimensions, they must have the same dimension universe as the rest
            of the graph.
        parent : `PipelineGraph`, optional
            If provided, another `PipelineGraph` from which these nodes were
            obtained. Any dataset type nodes already present in ``parent``
            that are referenced by the given tasks will be used in this graph
            if they are not already present, preserving any dataset type
            resolutions present in the parent graph. Adding nodes from a
            parent graph after the graph has its own nodes (e.g. from
            `add_task`) or nodes from a third graph may result in invalid
            dataset type resolutions. It is safest to only use this argument
            when populating an empty graph for the first time.

        Raises
        ------
        PipelineDataCycleError
            Raised if the graph is cyclic after this addition.

        Notes
        -----
        Checks for dataset type consistency and multiple producers do not occur
        until `resolve` is called, since the resolution depends on both the
        state of the data repository and all contributing tasks.

        Adding new tasks removes any existing resolutions of all dataset types
        it references (unless ``parent is not None``) and marks the graph as
        unsorted. It is most efficient to add all tasks up front and only then
        resolve and/or sort the graph.
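
        For example, a sketch of copying tasks from an existing graph into a
        new one (``other_graph`` is assumed to be a populated `PipelineGraph`)::

            new_graph = PipelineGraph(universe=other_graph.universe)
            new_graph.add_task_nodes(other_graph.tasks.values(), parent=other_graph)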
647 """
648 node_data: list[tuple[NodeKey, dict[str, Any]]] = []
649 edge_data: list[tuple[NodeKey, NodeKey, str, dict[str, Any]]] = []
650 for task_node in nodes:
651 task_node = task_node._resolved(self._universe)
652 node_data.append(
653 (task_node.key, {"instance": task_node, "bipartite": task_node.key.node_type.bipartite})
654 )
655 node_data.append(
656 (
657 task_node.init.key,
658 {"instance": task_node.init, "bipartite": task_node.init.key.node_type.bipartite},
659 )
660 )
661 # Convert the edge objects attached to the task node to networkx.
662 for read_edge in task_node.init.iter_all_inputs():
663 self._append_graph_data_from_edge(node_data, edge_data, read_edge, parent=parent)
664 for write_edge in task_node.init.iter_all_outputs():
665 self._append_graph_data_from_edge(node_data, edge_data, write_edge, parent=parent)
666 for read_edge in task_node.iter_all_inputs():
667 self._append_graph_data_from_edge(node_data, edge_data, read_edge, parent=parent)
668 for write_edge in task_node.iter_all_outputs():
669 self._append_graph_data_from_edge(node_data, edge_data, write_edge, parent=parent)
670 # Add a special edge (with no Edge instance) that connects the
671 # TaskInitNode to the runtime TaskNode.
672 edge_data.append((task_node.init.key, task_node.key, Edge.INIT_TO_TASK_NAME, {"instance": None}))
673 if not node_data and not edge_data:
674 return
675 # Checks and preparation complete; time to start the actual
676 # modification, during which it's hard to provide strong exception
677 # safety. Start by resetting the sort ordering, if there is one.
678 self._reset()
679 try:
680 self._xgraph.add_nodes_from(node_data)
681 self._xgraph.add_edges_from(edge_data)
682 if not networkx.algorithms.dag.is_directed_acyclic_graph(self._xgraph):
683 cycle = networkx.find_cycle(self._xgraph)
684 raise PipelineDataCycleError(f"Cycle detected while adding tasks: {cycle}.")
685 except Exception:
686 # First try to roll back our changes.
687 try:
688 self._xgraph.remove_edges_from(edge_data)
689 self._xgraph.remove_nodes_from(key for key, _ in node_data)
690 except Exception as err: # pragma: no cover
691 # There's no known way to get here, but we want to make it
692 # clear it's a big problem if we do.
693 raise PipelineGraphExceptionSafetyError(
694 "Error while attempting to revert PipelineGraph modification has left the graph in "
695 "an inconsistent state."
696 ) from err
697 # Successfully rolled back; raise the original exception.
698 raise

    def reconfigure_tasks(
        self,
        *args: tuple[str, PipelineTaskConfig],
        check_edges_unchanged: bool = False,
        assume_edges_unchanged: bool = False,
        **kwargs: PipelineTaskConfig,
    ) -> None:
        """Update the configuration for one or more tasks.

        Parameters
        ----------
        *args : `tuple` [ `str`, `.PipelineTaskConfig` ]
            Positional arguments are each a 2-tuple of task label and new
            config object. Note that the same arguments may also be passed as
            ``**kwargs``, which is usually more readable, but task labels in
            ``*args`` are not required to be valid Python identifiers.
        check_edges_unchanged : `bool`, optional
            If `True`, require the edges (connections) of the modified tasks to
            remain unchanged after the configuration updates, and verify that
            this is the case.
        assume_edges_unchanged : `bool`, optional
            If `True`, the caller declares that the edges (connections) of the
            modified tasks will remain unchanged after the configuration
            updates, and that it is unnecessary to check this.
        **kwargs : `.PipelineTaskConfig`
            New config objects or overrides to apply to copies of the current
            config objects, with task labels as the keywords.

        Raises
        ------
        ValueError
            Raised if ``assume_edges_unchanged`` and ``check_edges_unchanged``
            are both `True`, or if the same task appears twice.
        EdgesChangedError
            Raised if ``check_edges_unchanged=True`` and the edges of a task do
            change.

        Notes
        -----
        If reconfiguring a task causes its edges to change, any dataset type
        nodes connected to that task (not just those whose edges have changed!)
        will be unresolved.
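
        For example, a sketch that applies a new config to one task while
        verifying its connections are unchanged (``new_config`` is assumed to
        be a compatible `.PipelineTaskConfig` instance)::

            graph.reconfigure_tasks(my_task=new_config, check_edges_unchanged=True)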
742 """
743 new_configs: dict[str, PipelineTaskConfig] = {}
744 for task_label, config_update in itertools.chain(args, kwargs.items()):
745 if new_configs.setdefault(task_label, config_update) is not config_update:
746 raise ValueError(f"Config for {task_label!r} provided more than once.")
747 updates = {
748 task_label: self.tasks[task_label]._reconfigured(config, rebuild=not assume_edges_unchanged)
749 for task_label, config in new_configs.items()
750 }
751 self._replace_task_nodes(
752 updates,
753 check_edges_unchanged=check_edges_unchanged,
754 assume_edges_unchanged=assume_edges_unchanged,
755 message_header=(
756 "Unexpected change in edges for task {task_label!r} from original config (A) to "
757 "new configs (B):"
758 ),
759 )

    def remove_tasks(
        self, labels: Iterable[str], drop_from_subsets: bool = True
    ) -> list[tuple[TaskNode, set[str]]]:
        """Remove one or more tasks from the graph.

        Parameters
        ----------
        labels : `~collections.abc.Iterable` [ `str` ]
            Iterable of the labels of the tasks to remove.
        drop_from_subsets : `bool`, optional
            If `True`, drop each removed task from any subset in which it
            currently appears. If `False`, raise `PipelineGraphError` if any
            such subsets exist.

        Returns
        -------
        nodes_and_subsets : `list` [ `tuple` [ `TaskNode`, `set` [ `str` ] ] ]
            List of nodes removed and the labels of task subsets that
            referenced them.

        Raises
        ------
        PipelineGraphError
            Raised if ``drop_from_subsets`` is `False` and the task is still
            part of one or more subsets.

        Notes
        -----
        Removing a task will cause dataset nodes with no other referencing
        tasks to be removed. Any other dataset type nodes referenced by a
        removed task will be reset to an "unresolved" state.
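
        For example, a sketch that drops a single (hypothetical) task and
        reports which subsets referenced it::

            for task_node, subset_labels in graph.remove_tasks(["my_task"]):
                print(task_node.label, subset_labels)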
792 """
793 task_nodes_and_subsets = []
794 dataset_types: set[NodeKey] = set()
795 nodes_to_remove = set()
796 for label in labels:
797 task_node: TaskNode = self._xgraph.nodes[NodeKey(NodeType.TASK, label)]["instance"]
798 # Find task subsets that reference this task.
799 referencing_subsets = {
800 subset_label
801 for subset_label, task_subset in self.task_subsets.items()
802 if label in task_subset
803 }
804 if not drop_from_subsets and referencing_subsets:
805 raise PipelineGraphError(
806 f"Task {label!r} is still referenced by subset(s) {referencing_subsets}."
807 )
808 task_nodes_and_subsets.append((task_node, referencing_subsets))
809 # Find dataset types referenced by this task.
810 dataset_types.update(self._xgraph.predecessors(task_node.key))
811 dataset_types.update(self._xgraph.successors(task_node.key))
812 dataset_types.update(self._xgraph.predecessors(task_node.init.key))
813 dataset_types.update(self._xgraph.successors(task_node.init.key))
814 # Since there's an edge between the task and its init node, we'll
815 # have added those two nodes here, too, and we don't want that.
816 dataset_types.remove(task_node.init.key)
817 dataset_types.remove(task_node.key)
818 # Mark the task node and its init node for removal from the graph.
819 nodes_to_remove.add(task_node.key)
820 nodes_to_remove.add(task_node.init.key)
821 # Process the referenced datasets to see which ones are orphaned and
822 # need to be removed vs. just unresolved.
823 nodes_to_unresolve = []
824 for dataset_type_key in dataset_types:
825 related_tasks = set()
826 related_tasks.update(self._xgraph.predecessors(dataset_type_key))
827 related_tasks.update(self._xgraph.successors(dataset_type_key))
828 related_tasks.difference_update(nodes_to_remove)
829 if not related_tasks:
830 nodes_to_remove.add(dataset_type_key)
831 else:
832 nodes_to_unresolve.append(dataset_type_key)
833 # Checks and preparation complete; time to start the actual
834 # modification, during which it's hard to provide strong exception
835 # safety. Start by resetting the sort ordering.
836 self._reset()
837 try:
838 for dataset_type_key in nodes_to_unresolve:
839 self._xgraph.nodes[dataset_type_key]["instance"] = None
840 for task_node, referencing_subsets in task_nodes_and_subsets:
841 for subset_label in referencing_subsets:
842 self._task_subsets[subset_label].remove(task_node.label)
843 self._xgraph.remove_nodes_from(nodes_to_remove)
844 except Exception as err: # pragma: no cover
845 # There's no known way to get here, but we want to make it
846 # clear it's a big problem if we do.
847 raise PipelineGraphExceptionSafetyError(
848 "Error during task removal has left the graph in an inconsistent state."
849 ) from err
850 return task_nodes_and_subsets

    def add_task_subset(self, subset_label: str, task_labels: Iterable[str], description: str = "") -> None:
        """Add a label for a set of tasks that are already in the pipeline.

        Parameters
        ----------
        subset_label : `str`
            Label for this set of tasks.
        task_labels : `~collections.abc.Iterable` [ `str` ]
            Labels of the tasks to include in the set. All must already be
            included in the graph.
        description : `str`, optional
            String description to associate with this label.
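
        For example, a sketch that groups two hypothetical task labels into a
        named subset::

            graph.add_task_subset("step1", ["isr", "calibrate"], "First processing step.")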
864 """
865 subset = TaskSubset(self._xgraph, subset_label, set(task_labels), description)
866 self._task_subsets[subset_label] = subset
868 def remove_task_subset(self, subset_label: str) -> None:
869 """Remove a labeled set of tasks.
871 Parameters
872 ----------
873 subset_label : `str`
874 Label for this set of tasks.
875 """
876 del self._task_subsets[subset_label]

    ###########################################################################
    #
    # NetworkX Export Interface:
    #
    # - methods to export the PipelineGraph's content (or various subsets
    #   thereof) as NetworkX objects.
    #
    # These are particularly useful when writing tools to visualize the graph,
    # while providing options for which aspects of the graph (tasks, dataset
    # types, or both) to include, since all exported graphs have similar
    # attributes regardless of their structure.
    #
    ###########################################################################

    def make_xgraph(self) -> networkx.MultiDiGraph:
        """Export a networkx representation of the full pipeline graph,
        including both init and runtime edges.

        Returns
        -------
        xgraph : `networkx.MultiDiGraph`
            Directed acyclic graph with parallel edges.

        Notes
        -----
        The returned graph uses `NodeKey` instances for nodes. Parallel edges
        represent the same dataset type appearing in multiple connections for
        the same task, and are hence rare. The connection name is used as the
        edge key to disambiguate those parallel edges.

        Almost all edges connect dataset type nodes to task or task init nodes
        or vice versa, but there is also a special edge that connects each task
        init node to its runtime node. The existence of these edges makes the
        graph not quite bipartite, though its init-only and runtime-only
        subgraphs are bipartite.

        See `TaskNode`, `TaskInitNode`, `DatasetTypeNode`, `ReadEdge`, and
        `WriteEdge` for the descriptive node and edge attributes added.
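
        For example, a sketch that feeds the exported graph to a standard
        networkx algorithm (``"my_task"`` is a hypothetical task label)::

            import networkx

            xgraph = graph.make_xgraph()
            upstream = networkx.ancestors(xgraph, NodeKey(NodeType.TASK, "my_task"))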
916 """
917 return self._transform_xgraph_state(self._xgraph.copy(), skip_edges=False)
919 def make_bipartite_xgraph(self, init: bool = False) -> networkx.MultiDiGraph:
920 """Return a bipartite networkx representation of just the runtime or
921 init-time pipeline graph.
923 Parameters
924 ----------
925 init : `bool`, optional
926 If `True` (`False` is default) return the graph of task
927 initialization nodes and init input/output dataset types, instead
928 of the graph of runtime task nodes and regular
929 input/output/prerequisite dataset types.
931 Returns
932 -------
933 xgraph : `networkx.MultiDiGraph`
934 Directed acyclic graph with parallel edges.
936 Notes
937 -----
938 The returned graph uses `NodeKey` instances for nodes. Parallel edges
939 represent the same dataset type appearing in multiple connections for
940 the same task, and are hence rare. The connection name is used as the
941 edge key to disambiguate those parallel edges.
943 This graph is bipartite because each dataset type node only has edges
944 that connect it to a task [init] node, and vice versa.
946 See `TaskNode`, `TaskInitNode`, `DatasetTypeNode`, `ReadEdge`, and
947 `WriteEdge` for the descriptive node and edge attributes added.
948 """
949 return self._transform_xgraph_state(
950 self._make_bipartite_xgraph_internal(init).copy(), skip_edges=False
951 )

    def make_task_xgraph(self, init: bool = False) -> networkx.DiGraph:
        """Return a networkx representation of just the tasks in the pipeline.

        Parameters
        ----------
        init : `bool`, optional
            If `True` (`False` is default) return the graph of task
            initialization nodes, instead of the graph of runtime task nodes.

        Returns
        -------
        xgraph : `networkx.DiGraph`
            Directed acyclic graph with no parallel edges.

        Notes
        -----
        The returned graph uses `NodeKey` instances for nodes. The dataset
        types that link these tasks are not represented at all; edges have no
        attributes, and there are no parallel edges.

        See `TaskNode` and `TaskInitNode` for the descriptive node attributes
        added.
        """
        bipartite_xgraph = self._make_bipartite_xgraph_internal(init)
        task_keys = [
            key
            for key, bipartite in bipartite_xgraph.nodes(data="bipartite")
            if bipartite == NodeType.TASK.bipartite
        ]
        return self._transform_xgraph_state(
            networkx.algorithms.bipartite.projected_graph(networkx.DiGraph(bipartite_xgraph), task_keys),
            skip_edges=True,
        )

    def make_dataset_type_xgraph(self, init: bool = False) -> networkx.DiGraph:
        """Return a networkx representation of just the dataset types in the
        pipeline.

        Parameters
        ----------
        init : `bool`, optional
            If `True` (`False` is default) return the graph of init input and
            output dataset types, instead of the graph of runtime (input,
            output, prerequisite input) dataset types.

        Returns
        -------
        xgraph : `networkx.DiGraph`
            Directed acyclic graph with no parallel edges.

        Notes
        -----
        The returned graph uses `NodeKey` instances for nodes. The tasks that
        link these dataset types are not represented at all; edges have no
        attributes, and there are no parallel edges.

        See `DatasetTypeNode` for the descriptive node attributes added.
        """
        bipartite_xgraph = self._make_bipartite_xgraph_internal(init)
        dataset_type_keys = [
            key
            for key, bipartite in bipartite_xgraph.nodes(data="bipartite")
            if bipartite == NodeType.DATASET_TYPE.bipartite
        ]
        return self._transform_xgraph_state(
            networkx.algorithms.bipartite.projected_graph(
                networkx.DiGraph(bipartite_xgraph), dataset_type_keys
            ),
            skip_edges=True,
        )

    ###########################################################################
    #
    # Serialization Interface.
    #
    # Serialization of PipelineGraphs is currently experimental and may not be
    # retained in the future. All serialization methods are
    # underscore-prefixed to ensure nobody mistakes them for a stable interface
    # (let alone a stable file format).
    #
    ###########################################################################

    @classmethod
    def _read_stream(
        cls, stream: BinaryIO, import_mode: TaskImportMode = TaskImportMode.REQUIRE_CONSISTENT_EDGES
    ) -> PipelineGraph:
        """Read a serialized `PipelineGraph` from a file-like object.

        Parameters
        ----------
        stream : `BinaryIO`
            File-like object opened for binary reading, containing
            gzip-compressed JSON.
        import_mode : `TaskImportMode`, optional
            Whether to import tasks, and how to reconcile any differences
            between the imported task's connections and those that were
            persisted with the graph. Default is to check that they are the
            same.

        Returns
        -------
        graph : `PipelineGraph`
            Deserialized pipeline graph.

        Raises
        ------
        PipelineGraphReadError
            Raised if the serialized `PipelineGraph` is not self-consistent.
        EdgesChangedError
            Raised if ``import_mode`` is
            `TaskImportMode.REQUIRE_CONSISTENT_EDGES` and the edges of a task
            did change after import and reconfiguration.

        Notes
        -----
        `PipelineGraph` serialization is currently experimental and may be
        removed or significantly changed in the future, with no deprecation
        period.
        """
        from .io import SerializedPipelineGraph

        with gzip.open(stream, "rb") as uncompressed_stream:
            data = json.load(uncompressed_stream)
        serialized_graph = SerializedPipelineGraph.model_validate(data)
        return serialized_graph.deserialize(import_mode)

    @classmethod
    def _read_uri(
        cls,
        uri: ResourcePathExpression,
        import_mode: TaskImportMode = TaskImportMode.REQUIRE_CONSISTENT_EDGES,
    ) -> PipelineGraph:
        """Read a serialized `PipelineGraph` from a file at a URI.

        Parameters
        ----------
        uri : convertible to `lsst.resources.ResourcePath`
            URI to a gzip-compressed JSON file containing a serialized pipeline
            graph.
        import_mode : `TaskImportMode`, optional
            Whether to import tasks, and how to reconcile any differences
            between the imported task's connections and those that were
            persisted with the graph. Default is to check that they are the
            same.

        Returns
        -------
        graph : `PipelineGraph`
            Deserialized pipeline graph.

        Raises
        ------
        PipelineGraphReadError
            Raised if the serialized `PipelineGraph` is not self-consistent.
        EdgesChangedError
            Raised if ``import_mode`` is
            `TaskImportMode.REQUIRE_CONSISTENT_EDGES` and the edges of a task
            did change after import and reconfiguration.

        Notes
        -----
        `PipelineGraph` serialization is currently experimental and may be
        removed or significantly changed in the future, with no deprecation
        period.
        """
        uri = ResourcePath(uri)
        with uri.open("rb") as stream:
            return cls._read_stream(cast(BinaryIO, stream), import_mode=import_mode)

    def _write_stream(self, stream: BinaryIO) -> None:
        """Write the pipeline to a file-like object.

        Parameters
        ----------
        stream : `BinaryIO`
            File-like object opened for binary writing.

        Notes
        -----
        `PipelineGraph` serialization is currently experimental and may be
        removed or significantly changed in the future, with no deprecation
        period.

        The file format is gzipped JSON, and is intended to be human-readable,
        but it should not be considered a stable public interface for outside
        code, which should always use `PipelineGraph` methods (or at least the
        `io.SerializedPipelineGraph` class) to read these files.
        """
        from .io import SerializedPipelineGraph

        with gzip.open(stream, mode="wb") as compressed_stream:
            compressed_stream.write(
                SerializedPipelineGraph.serialize(self).model_dump_json(exclude_defaults=True).encode("utf-8")
            )

    def _write_uri(self, uri: ResourcePathExpression) -> None:
        """Write the pipeline to a file given a URI.

        Parameters
        ----------
        uri : convertible to `lsst.resources.ResourcePath`
            URI to write to. May have ``.json.gz`` or no extension (which
            will cause a ``.json.gz`` extension to be added).

        Notes
        -----
        `PipelineGraph` serialization is currently experimental and may be
        removed or significantly changed in the future, with no deprecation
        period.

        The file format is gzipped JSON, and is intended to be human-readable,
        but it should not be considered a stable public interface for outside
        code, which should always use `PipelineGraph` methods (or at least the
        `io.SerializedPipelineGraph` class) to read these files.
        """
        uri = ResourcePath(uri)
        extension = uri.getExtension()
        if not extension:
            uri = uri.updatedExtension(".json.gz")
        elif extension != ".json.gz":
            raise ValueError("Expanded pipeline files should always have a .json.gz extension.")
        with uri.open(mode="wb") as stream:
            self._write_stream(cast(BinaryIO, stream))

    def _import_and_configure(
        self, import_mode: TaskImportMode = TaskImportMode.REQUIRE_CONSISTENT_EDGES
    ) -> None:
        """Import the `PipelineTask` classes referenced by all task nodes and
        update those nodes accordingly.

        Parameters
        ----------
        import_mode : `TaskImportMode`, optional
            Whether to import tasks, and how to reconcile any differences
            between the imported task's connections and those that were
            persisted with the graph. Default is to check that they are the
            same. This method does nothing if this is
            `TaskImportMode.DO_NOT_IMPORT`.

        Raises
        ------
        EdgesChangedError
            Raised if ``import_mode`` is
            `TaskImportMode.REQUIRE_CONSISTENT_EDGES` and the edges of a task
            did change after import and reconfiguration.

        Notes
        -----
        This method shouldn't need to be called unless the graph was
        deserialized without importing and configuring immediately, which is
        not the default behavior (but it can greatly speed up deserialization).
        If all tasks have already been imported this does nothing.

        Importing and configuring a task can change its
        `~TaskNode.task_class_name` or `~TaskNode.get_config_str` output,
        usually because the software used to read a serialized graph is newer
        than the software used to write it (e.g. a new config option has been
        added, or the task was moved to a new module with a forwarding alias
        left behind). These changes are allowed by
        `TaskImportMode.REQUIRE_CONSISTENT_EDGES`.

        If importing and configuring a task causes its edges to change, any
        dataset type nodes linked to those edges will be reset to the
        unresolved state.
        """
        if import_mode is TaskImportMode.DO_NOT_IMPORT:
            return
        rebuild = (
            import_mode is TaskImportMode.REQUIRE_CONSISTENT_EDGES
            or import_mode is TaskImportMode.OVERRIDE_EDGES
        )
        updates: dict[str, TaskNode] = {}
        node_key: NodeKey
        for node_key, node_state in self._xgraph.nodes.items():
            if node_key.node_type is NodeType.TASK:
                task_node: TaskNode = node_state["instance"]
                new_task_node = task_node._imported_and_configured(rebuild)
                if new_task_node is not task_node:
                    updates[task_node.label] = new_task_node
        self._replace_task_nodes(
            updates,
            check_edges_unchanged=(import_mode is TaskImportMode.REQUIRE_CONSISTENT_EDGES),
            assume_edges_unchanged=(import_mode is TaskImportMode.ASSUME_CONSISTENT_EDGES),
            message_header=(
                "In task with label {task_label!r}, persisted edges (A) "
                "differ from imported and configured edges (B):"
            ),
        )

    ###########################################################################
    #
    # Advanced PipelineGraph Inspection Interface:
    #
    # - methods to iterate over all nodes and edges, utilizing NodeKeys;
    #
    # - methods to find overall inputs and group nodes by their dimensions,
    #   which are important operations for QuantumGraph generation.
    #
    ###########################################################################

    def iter_edges(self, init: bool = False) -> Iterator[Edge]:
        """Iterate over edges in the graph.

        Parameters
        ----------
        init : `bool`, optional
            If `True` (`False` is default) iterate over the edges between task
            initialization node and init input/output dataset types, instead of
            the runtime task nodes and regular input/output/prerequisite
            dataset types.

        Returns
        -------
        edges : `~collections.abc.Iterator` [ `Edge` ]
            A lazy iterator over `Edge` (`WriteEdge` or `ReadEdge`) instances.

        Notes
        -----
        This method always returns *either* init edges or runtime edges, never
        both. The full (internal) graph that contains both also includes a
        special edge that connects each task init node to its runtime node;
        that is also never returned by this method, since it is never a part of
        the init-only or runtime-only subgraphs.
        """
        edge: Edge
        for _, _, edge in self._xgraph.edges(data="instance"):
            if edge is not None and edge.is_init == init:
                yield edge

    def iter_nodes(
        self,
    ) -> Iterator[
        tuple[Literal[NodeType.TASK_INIT], str, TaskInitNode]
        | tuple[Literal[NodeType.TASK], str, TaskNode]
        | tuple[Literal[NodeType.DATASET_TYPE], str, DatasetTypeNode | None]
    ]:
        """Iterate over nodes in the graph.

        Returns
        -------
        nodes : `~collections.abc.Iterator` [ `tuple` ]
            A lazy iterator over all of the nodes in the graph. Each yielded
            element is a tuple of:

            - the node type enum value (`NodeType`);
            - the string name for the node (task label or parent dataset type
              name);
            - the node value (`TaskNode`, `TaskInitNode`, `DatasetTypeNode`,
              or `None` for dataset type nodes that have not been resolved).
        """
        key: NodeKey
        if self._sorted_keys is not None:
            for key in self._sorted_keys:
                yield key.node_type, key.name, self._xgraph.nodes[key]["instance"]  # type: ignore
        else:
            for key, node in self._xgraph.nodes(data="instance"):
                yield key.node_type, key.name, node  # type: ignore

    def iter_overall_inputs(self) -> Iterator[tuple[str, DatasetTypeNode | None]]:
        """Iterate over all of the dataset types that are consumed but not
        produced by the graph.

        Returns
        -------
        dataset_types : `~collections.abc.Iterator` [ `tuple` ]
            A lazy iterator over the overall-input dataset types (including
            overall init inputs and prerequisites). Each yielded element is a
            tuple of:

            - the parent dataset type name;
            - the resolved `DatasetTypeNode`, or `None` if the dataset type has
              not been resolved.
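
        For example, a sketch that collects the names of all overall-input
        dataset types::

            input_names = [name for name, _ in graph.iter_overall_inputs()]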
1325 """
1326 for generation in networkx.algorithms.dag.topological_generations(self._xgraph):
1327 key: NodeKey
1328 for key in generation:
1329 # While we expect all tasks to have at least one input and
1330 # hence never appear in the first topological generation, that
1331 # is not true of task init nodes.
1332 if key.node_type is NodeType.DATASET_TYPE:
1333 yield key.name, self._xgraph.nodes[key]["instance"]
1334 return

    def group_by_dimensions(
        self, prerequisites: bool = False
    ) -> dict[DimensionGroup, tuple[dict[str, TaskNode], dict[str, DatasetTypeNode]]]:
        """Group this graph's tasks and dataset types by their dimensions.

        Parameters
        ----------
        prerequisites : `bool`, optional
            If `True`, include prerequisite dataset types as well as regular
            input and output datasets (including intermediates).

        Returns
        -------
        groups : `dict` [ `DimensionGroup`, `tuple` ]
            A dictionary of groups keyed by `DimensionGroup`, in which each
            value is a tuple of:

            - a `dict` of `TaskNode` instances, keyed by task label;
            - a `dict` of `DatasetTypeNode` instances, keyed by
              dataset type name;

            that have those dimensions.

        Notes
        -----
        Init inputs and outputs are always included, but always have empty
        dimensions and are hence all grouped together.
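
        For example, a sketch that prints the task labels in each dimension
        group of a resolved graph::

            for dimensions, (tasks_by_label, _) in graph.group_by_dimensions().items():
                print(dimensions, sorted(tasks_by_label))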
1363 """
1364 result: dict[DimensionGroup, tuple[dict[str, TaskNode], dict[str, DatasetTypeNode]]] = {}
1365 next_new_value: tuple[dict[str, TaskNode], dict[str, DatasetTypeNode]] = ({}, {})
1366 for task_label, task_node in self.tasks.items():
1367 if task_node.dimensions is None:
1368 raise UnresolvedGraphError(f"Task with label {task_label!r} has not been resolved.")
1369 if (group := result.setdefault(task_node.dimensions, next_new_value)) is next_new_value:
1370 next_new_value = ({}, {}) # make new lists for next time
1371 group[0][task_node.label] = task_node
1372 for dataset_type_name, dataset_type_node in self.dataset_types.items():
1373 if dataset_type_node is None:
1374 raise UnresolvedGraphError(f"Dataset type {dataset_type_name!r} has not been resolved.")
1375 if not dataset_type_node.is_prerequisite or prerequisites:
1376 if (
1377 group := result.setdefault(
1378 dataset_type_node.dataset_type.dimensions.as_group(), next_new_value
1379 )
1380 ) is next_new_value:
1381 next_new_value = ({}, {}) # make new lists for next time
1382 group[1][dataset_type_node.name] = dataset_type_node
1383 return result

    def split_independent(self) -> Iterable[PipelineGraph]:
        """Iterate over independent subgraphs that together comprise this
        pipeline graph.

        Returns
        -------
        subgraphs : `Iterable` [ `PipelineGraph` ]
            An iterable over component subgraphs that could be run
            independently (they have only overall inputs in common). May be a
            lazy iterator.

        Notes
        -----
        All resolved dataset type nodes will be preserved.

        If there is only one component, ``self`` may be returned as the only
        element in the iterable.

        If `has_been_sorted`, all subgraphs will be sorted as well.
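
        For example, a sketch that counts how many independently runnable
        components this graph has::

            n_components = len(list(graph.split_independent()))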
1404 """
1405 # Having an overall input in common isn't enough to make subgraphs
1406 # dependent on each other, so we want to look for connected component
1407 # subgraphs of the task-only projected graph.
1408 bipartite_xgraph = self._make_bipartite_xgraph_internal(init=False)
1409 task_keys = {
1410 key
1411 for key, bipartite in bipartite_xgraph.nodes(data="bipartite")
1412 if bipartite == NodeType.TASK.bipartite
1413 }
1414 task_xgraph = networkx.algorithms.bipartite.projected_graph(
1415 networkx.DiGraph(bipartite_xgraph), task_keys
1416 )
1417 # "Weakly" connected means connected in only one direction, which is
1418 # the only kind of "connected" a DAG can ever be.
1419 for component_task_keys in networkx.algorithms.weakly_connected_components(task_xgraph):
1420 if component_task_keys == task_keys:
1421 yield self
1422 return
1423 else:
1424 component_subgraph = PipelineGraph(universe=self._universe)
1425 component_subgraph.add_task_nodes(
1426 [self._xgraph.nodes[key]["instance"] for key in component_task_keys], parent=self
1427 )
1428 if self.has_been_sorted:
1429 component_subgraph.sort()
1430 yield component_subgraph

    ###########################################################################
    #
    # Class- and Package-Private Methods.
    #
    ###########################################################################

    def _iter_task_defs(self) -> Iterator[TaskDef]:
        """Iterate over this pipeline as a sequence of `TaskDef` instances.

        Notes
        -----
        This is a package-private method intended to aid in the transition to a
        codebase more fully integrated with the `PipelineGraph` class, in which
        both `TaskDef` and `PipelineDatasetTypes` are expected to go away, and
        much of the functionality on the `Pipeline` class will be moved to
        `PipelineGraph` as well.

        Raises
        ------
        TaskNotImportedError
            Raised if `TaskNode.is_imported` is `False` for any task.
        """
        from ..pipeline import TaskDef

        for node in self._tasks.values():
            yield TaskDef(
                config=node.config,
                taskClass=node.task_class,
                label=node.label,
                connections=node._get_imported_data().connections,
            )
1464 def _init_from_args(
1465 self,
1466 xgraph: networkx.MultiDiGraph | None,
1467 sorted_keys: Sequence[NodeKey] | None,
1468 task_subsets: dict[str, TaskSubset] | None,
1469 description: str,
1470 universe: DimensionUniverse | None,
1471 data_id: DataId | None,
1472 ) -> None:
1473 """Initialize the graph with possibly-nontrivial arguments.
1475 Parameters
1476 ----------
1477 xgraph : `networkx.MultiDiGraph` or `None`
1478 The backing networkx graph, or `None` to create an empty one.
1479 This graph has `NodeKey` instances for nodes and the same structure
1480 as the graph exported by `make_xgraph`, but its nodes and edges
1481 have a single ``instance`` attribute that holds a `TaskNode`,
1482 `TaskInitNode`, `DatasetTypeNode` (or `None`), `ReadEdge`, or
1483 `WriteEdge` instance.
1484 sorted_keys : `Sequence` [ `NodeKey` ] or `None`
1485 Topologically sorted sequence of node keys, or `None` if the graph
1486 is not sorted.
1487 task_subsets : `dict` [ `str`, `TaskSubset` ]
1488 Labeled subsets of tasks. Values must be constructed with
1489 ``xgraph`` as their parent graph.
1490 description : `str`
1491 String description for this pipeline.
1492 universe : `lsst.daf.butler.DimensionUniverse` or `None`
1493 Definitions of all dimensions.
1494 data_id : `lsst.daf.butler.DataCoordinate` or other data ID mapping.
1495 Data ID that represents a constraint on all quanta generated from
1496 this pipeline.
1498 Notes
1499 -----
1500 Only empty `PipelineGraph` instances should be constructed directly by
1501 users, which sets the signature of ``__init__`` itself, but methods on
1502 `PipelineGraph` and its helper classes need to be able to create them
1503 with state. Those methods can call this after calling ``__new__``
1504 manually, skipping ``__init__``.
1505 """
1506 self._xgraph = xgraph if xgraph is not None else networkx.MultiDiGraph()
1507 self._sorted_keys: Sequence[NodeKey] | None = None
1508 self._task_subsets = task_subsets if task_subsets is not None else {}
1509 self._description = description
1510 self._tasks = TaskMappingView(self._xgraph)
1511 self._dataset_types = DatasetTypeMappingView(self._xgraph)
1512 self._raw_data_id: dict[str, Any]
1513 if isinstance(data_id, DataCoordinate):
1514 if universe is None:
1515 universe = data_id.universe
1516 else:
1517 assert universe is data_id.universe, "data_id.universe and given universe differ"
1518 self._raw_data_id = dict(data_id.required)
1519 elif data_id is None:
1520 self._raw_data_id = {}
1521 else:
1522 self._raw_data_id = dict(data_id)
1523 self._universe = universe
1524 if sorted_keys is not None:
1525 self._reorder(sorted_keys)
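# A sketch of the construction pattern described in the Notes above:
# helpers that need a populated graph bypass ``__init__`` and install
# state directly. ``backing_xgraph`` and ``universe`` are placeholders
# for values the caller already holds.
#
#     new = PipelineGraph.__new__(PipelineGraph)
#     new._init_from_args(
#         xgraph=backing_xgraph,
#         sorted_keys=None,
#         task_subsets=None,
#         description="copied pipeline",
#         universe=universe,
#         data_id=None,
#     )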
1527 def _make_bipartite_xgraph_internal(self, init: bool) -> networkx.MultiDiGraph:
1528 """Make a bipartite init-only or runtime-only internal subgraph.
1530 See `make_bipartite_xgraph` for parameters and return values.
1532 Notes
1533 -----
1534 This method returns a view of the `PipelineGraph` object's internal
1535 backing graph, and hence should only be called in methods that copy the
1536 result either explicitly or by running a copying algorithm before
1537 returning it to the user.
1538 """
1539 return self._xgraph.edge_subgraph([edge.key for edge in self.iter_edges(init)])
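# `networkx.MultiDiGraph.edge_subgraph` returns a read-only view backed
# by the original graph, which is why the docstring above insists on
# copying before export; a sketch of the expected calling pattern:
#
#     view = graph._make_bipartite_xgraph_internal(init=False)
#     exported = view.copy()  # detach the result from the backing graph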
1541 def _transform_xgraph_state(self, xgraph: _G, skip_edges: bool) -> _G:
1542 """Transform networkx graph attributes in-place from the internal
1543 "instance" attributes to the documented exported attributes.
1545 Parameters
1546 ----------
1547 xgraph : `networkx.DiGraph` or `networkx.MultiDiGraph`
1548 Graph whose state should be transformed.
1549 skip_edges : `bool`
1550 If `True`, do not transform edge state.
1552 Returns
1553 -------
1554 xgraph : `networkx.DiGraph` or `networkx.MultiDiGraph`
1555 The same object passed in, after modification.
1557 Notes
1558 -----
1559 This should be called after making a copy of the internal graph but
1560 before any projection down to just task or dataset type nodes, since
1561 it assumes stateful edges.
1562 """
1563 state: dict[str, Any]
1564 for state in xgraph.nodes.values():
1565 node_value: TaskInitNode | TaskNode | DatasetTypeNode | None = state.pop("instance")
1566 if node_value is not None:
1567 state.update(node_value._to_xgraph_state())
1568 else:
1569 # This is a dataset type node that is not resolved.
1570 state["bipartite"] = NodeType.DATASET_TYPE.bipartite
1571 if not skip_edges:
1572 for _, _, state in xgraph.edges(data=True):
1573 edge: Edge | None = state.pop("instance", None)
1574 if edge is not None:
1575 state.update(edge._to_xgraph_state())
1576 return xgraph
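# The copy-then-transform discipline this method assumes, sketched with
# a hypothetical caller (the public export methods on this class follow
# the same shape):
#
#     exported = graph._xgraph.copy()  # never mutate the internal graph
#     exported = graph._transform_xgraph_state(exported, skip_edges=False)
#     # each node/edge now carries the documented exported attributes
#     # instead of a single "instance" attribute.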
1578 def _replace_task_nodes(
1579 self,
1580 updates: Mapping[str, TaskNode],
1581 check_edges_unchanged: bool,
1582 assume_edges_unchanged: bool,
1583 message_header: str,
1584 ) -> None:
1585 """Replace task nodes and update edges and dataset type nodes
1586 accordingly.
1588 Parameters
1589 ----------
1590 updates : `Mapping` [ `str`, `TaskNode` ]
1591 New task nodes with task label keys. All keys must be task labels
1592 that are already present in the graph.
1593 check_edges_unchanged : `bool`, optional
1594 If `True`, require the edges (connections) of the modified tasks to
1595 remain unchanged after importing and configuring each task, and
1596 verify that this is the case.
1597 assume_edges_unchanged : `bool`, optional
1598 If `True`, the caller declares that the edges (connections) of the
1599 modified tasks will remain unchanged after importing and configuring each
1600 task, and that it is unnecessary to check this.
1601 message_header : `str`
1602 Template for `str.format` with a single ``task_label`` placeholder
1603 to use as the first line in `EdgesChangedError` messages that show
1604 the differences between new task edges and old task edges. Should
1605 include the fact that the rest of the message will refer to the old
1606 task as "A" and the new task as "B", and end with a colon.
1608 Raises
1609 ------
1610 ValueError
1611 Raised if ``assume_edges_unchanged`` and ``check_edges_unchanged``
1612 are both `True`, or if a full config is provided for a task after
1613 another full config or an override has already been provided.
1614 EdgesChangedError
1615 Raised if ``check_edges_unchanged=True`` and the edges of a task do
1616 change.
1617 """
1618 deep: dict[str, TaskNode] = {}
1619 shallow: dict[str, TaskNode] = {}
1620 if assume_edges_unchanged:
1621 if check_edges_unchanged:
1622 raise ValueError("Cannot simultaneously assume and check that edges have not changed.")
1623 shallow.update(updates)
1624 else:
1625 for task_label, new_task_node in updates.items():
1626 old_task_node = self.tasks[task_label]
1627 messages = old_task_node.diff_edges(new_task_node)
1628 if messages:
1629 if check_edges_unchanged:
1630 messages.insert(0, message_header.format(task_label=task_label))
1631 raise EdgesChangedError("\n".join(messages))
1632 else:
1633 deep[task_label] = new_task_node
1634 else:
1635 shallow[task_label] = new_task_node
1636 try:
1637 if deep:
1638 removed = self.remove_tasks(deep.keys(), drop_from_subsets=True)
1639 self.add_task_nodes(deep.values())
1640 for replaced_task_node, referencing_subsets in removed:
1641 for subset_label in referencing_subsets:
1642 self._task_subsets[subset_label].add(replaced_task_node.label)
1643 for task_node in shallow.values():
1644 self._xgraph.nodes[task_node.key]["instance"] = task_node
1645 self._xgraph.nodes[task_node.init.key]["instance"] = task_node.init
1646 except PipelineGraphExceptionSafetyError: # pragma: no cover
1647 raise
1648 except Exception as err: # pragma: no cover
1649 # There's no known way to get here, but we want to make it clear
1650 # it's a big problem if we do.
1651 raise PipelineGraphExceptionSafetyError(
1652 "Error while replacing tasks has left the graph in an inconsistent state."
1653 ) from err
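# A sketch of a ``message_header`` value satisfying the contract above
# (single ``task_label`` placeholder, "A"/"B" convention, trailing
# colon); the wording is illustrative, not a string taken from this
# package:
#
#     header = (
#         "Edges of task {task_label!r} changed "
#         "(A is the old task, B is the new task):"
#     )
#     graph._replace_task_nodes(
#         updates,
#         check_edges_unchanged=True,
#         assume_edges_unchanged=False,
#         message_header=header,
#     )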
1655 def _append_graph_data_from_edge(
1656 self,
1657 node_data: list[tuple[NodeKey, dict[str, Any]]],
1658 edge_data: list[tuple[NodeKey, NodeKey, str, dict[str, Any]]],
1659 edge: Edge,
1660 parent: PipelineGraph | None,
1661 ) -> None:
1662 """Append networkx state dictionaries for an edge and the corresponding
1663 dataset type node.
1665 Parameters
1666 ----------
1667 node_data : `list`
1668 List of node keys and state dictionaries. A node is appended if
1669 one does not already exist for this dataset type.
1670 edge_data : `list`
1671 List of node key pairs, connection names, and state dictionaries
1672 for edges.
1673 edge : `Edge`
1674 New edge being processed.
1675 parent : `PipelineGraph` or `None`
1676 Another pipeline graph whose dataset type nodes should be used
1677 when present.
1678 """
1679 new_dataset_type_node = None
1680 if parent is not None:
1681 new_dataset_type_node = parent._xgraph.nodes[edge.dataset_type_key].get("instance")
1682 if (existing_dataset_type_state := self._xgraph.nodes.get(edge.dataset_type_key)) is not None:
1683 existing_dataset_type_state["instance"] = new_dataset_type_node
1684 else:
1685 node_data.append(
1686 (
1687 edge.dataset_type_key,
1688 {
1689 "instance": new_dataset_type_node,
1690 "bipartite": NodeType.DATASET_TYPE.bipartite,
1691 },
1692 )
1693 )
1694 edge_data.append(
1695 edge.nodes
1696 + (
1697 edge.connection_name,
1698 {"instance": edge},
1699 )
1700 )
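# The accumulated lists are shaped for networkx bulk insertion: on a
# `MultiDiGraph`, ``add_nodes_from`` accepts ``(node, attrs)`` pairs and
# ``add_edges_from`` accepts ``(u, v, key, attrs)`` 4-tuples. A sketch
# of how a caller might apply them (``edges`` is a placeholder):
#
#     node_data: list[tuple[NodeKey, dict[str, Any]]] = []
#     edge_data: list[tuple[NodeKey, NodeKey, str, dict[str, Any]]] = []
#     for edge in edges:
#         graph._append_graph_data_from_edge(node_data, edge_data, edge, parent=None)
#     graph._xgraph.add_nodes_from(node_data)
#     graph._xgraph.add_edges_from(edge_data)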
1702 def _reorder(self, sorted_keys: Sequence[NodeKey]) -> None:
1703 """Set the order of all views of this graph from the given sorted
1704 sequence of task labels and dataset type names.
1705 """
1706 self._sorted_keys = sorted_keys
1707 self._tasks._reorder(sorted_keys)
1708 self._dataset_types._reorder(sorted_keys)
1710 def _reset(self) -> None:
1711 """Reset the all views of this graph following a modification that
1712 might invalidate them.
1713 """
1714 self._sorted_keys = None
1715 self._tasks._reset()
1716 self._dataset_types._reset()
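# The discipline these two helpers encode, sketched: mutations that can
# change topology call ``_reset`` to invalidate cached order, while a
# sort recomputes a topological order and installs it via ``_reorder``
# (plain ``topological_sort`` is used here only as an illustration):
#
#     order = list(networkx.algorithms.dag.topological_sort(graph._xgraph))
#     graph._reorder(order)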
1718 _xgraph: networkx.MultiDiGraph
1719 _sorted_keys: Sequence[NodeKey] | None
1720 _task_subsets: dict[str, TaskSubset]
1721 _description: str
1722 _tasks: TaskMappingView
1723 _dataset_types: DatasetTypeMappingView
1724 _raw_data_id: dict[str, Any]
1725 _universe: DimensionUniverse | None