# This file is part of pipe_base.
#
# Developed for the LSST Data Management System.
# This product includes software developed by the LSST Project
# (http://www.lsst.org).
# See the COPYRIGHT file at the top-level directory of this distribution
# for details of code ownership.
#
# This software is dual licensed under the GNU General Public License and also
# under a 3-clause BSD license. Recipients may choose which of these licenses
# to use; please see the files gpl-3.0.txt and/or bsd_license.txt,
# respectively. If you choose the GPL option then the following text applies
# (but note that there is still no warranty even if you opt for BSD instead):
#
# This program is free software: you can redistribute it and/or modify
# it under the terms of the GNU General Public License as published by
# the Free Software Foundation, either version 3 of the License, or
# (at your option) any later version.
#
# This program is distributed in the hope that it will be useful,
# but WITHOUT ANY WARRANTY; without even the implied warranty of
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
# GNU General Public License for more details.
#
# You should have received a copy of the GNU General Public License
# along with this program. If not, see <http://www.gnu.org/licenses/>.
from __future__ import annotations

__all__ = ("PipelineGraph",)

import gzip
import itertools
import json
from collections.abc import Iterable, Iterator, Mapping, Sequence
from typing import TYPE_CHECKING, Any, BinaryIO, Literal, TypeVar, cast

import networkx
import networkx.algorithms.bipartite
import networkx.algorithms.dag
from lsst.daf.butler import DataCoordinate, DataId, DimensionGroup, DimensionUniverse, Registry
from lsst.resources import ResourcePath, ResourcePathExpression

from ._dataset_types import DatasetTypeNode
from ._edges import Edge, ReadEdge, WriteEdge
from ._exceptions import (
    DuplicateOutputError,
    EdgesChangedError,
    PipelineDataCycleError,
    PipelineGraphError,
    PipelineGraphExceptionSafetyError,
    UnresolvedGraphError,
)
from ._mapping_views import DatasetTypeMappingView, TaskMappingView
from ._nodes import NodeKey, NodeType
from ._task_subsets import TaskSubset
from ._tasks import TaskImportMode, TaskInitNode, TaskNode, _TaskNodeImportedData

if TYPE_CHECKING:
    from ..config import PipelineTaskConfig
    from ..connections import PipelineTaskConnections
    from ..pipeline import TaskDef
    from ..pipelineTask import PipelineTask


_G = TypeVar("_G", bound=networkx.DiGraph | networkx.MultiDiGraph)


class PipelineGraph:
    """A graph representation of a fully-configured pipeline.

    `PipelineGraph` instances are typically constructed by calling
    `.Pipeline.to_graph`, but in rare cases constructing and then populating
    an empty one may be preferable.

    Parameters
    ----------
    description : `str`, optional
        String description for this pipeline.
    universe : `lsst.daf.butler.DimensionUniverse`, optional
        Definitions for all butler dimensions. If not provided, some
        attributes will not be available until `resolve` is called.
    data_id : `lsst.daf.butler.DataCoordinate` or other data ID, optional
        Data ID that represents a constraint on all quanta generated by this
        pipeline. This typically just holds the instrument constraint included
        in the pipeline definition, if there was one.
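
    Examples
    --------
    A minimal sketch of building and resolving a graph by hand, assuming
    ``MyTask`` is some concrete `PipelineTask` subclass and ``butler`` is a
    `~lsst.daf.butler.Butler` client (both hypothetical here, not provided by
    this module)::

        graph = PipelineGraph(description="example")
        graph.add_task("my_task", MyTask, MyTask.ConfigClass())
        graph.resolve(butler.registry)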
86 """
88 ###########################################################################
89 #
90 # Simple Pipeline Graph Inspection Interface:
91 #
92 # - for inspecting graph structure, not modifying it (except to sort and]
93 # resolve);
94 #
95 # - no NodeKey objects, just string dataset type name and task label keys;
96 #
97 # - graph structure is represented as a pair of mappings, with methods to
98 # find neighbors and edges of nodes.
99 #
100 ###########################################################################

    def __init__(
        self,
        *,
        description: str = "",
        universe: DimensionUniverse | None = None,
        data_id: DataId | None = None,
    ) -> None:
        self._init_from_args(
            xgraph=None,
            sorted_keys=None,
            task_subsets=None,
            description=description,
            universe=universe,
            data_id=data_id,
        )

    def __repr__(self) -> str:
        return f"{type(self).__name__}({self.description!r}, tasks={self.tasks!s})"

    @property
    def description(self) -> str:
        """String description for this pipeline."""
        return self._description

    @description.setter
    def description(self, value: str) -> None:
        # Docstring in getter.
        self._description = value

    @property
    def universe(self) -> DimensionUniverse | None:
        """Definitions for all butler dimensions."""
        return self._universe

    @property
    def data_id(self) -> DataCoordinate:
        """Data ID that represents a constraint on all quanta generated from
        this pipeline.

        This may not be available unless `universe` is not `None`.
        """
        return DataCoordinate.standardize(self._raw_data_id, universe=self.universe)

    @property
    def tasks(self) -> TaskMappingView:
        """A mapping view of the tasks in the graph.

        This mapping has `str` task label keys and `TaskNode` values. Iteration
        is topologically and deterministically ordered if and only if `sort`
        has been called since the last modification to the graph.
        """
        return self._tasks

    @property
    def dataset_types(self) -> DatasetTypeMappingView:
        """A mapping view of the dataset types in the graph.

        This mapping has `str` parent dataset type name keys, but only provides
        access to its `DatasetTypeNode` values if `resolve` has been called
        since the last modification involving a task that uses a dataset type.
        See `DatasetTypeMappingView` for details.
        """
        return self._dataset_types

    @property
    def task_subsets(self) -> Mapping[str, TaskSubset]:
        """A mapping of all labeled subsets of tasks.

        Keys are subset labels, values are sets of task labels. See
        `TaskSubset` for more information.

        Use `add_task_subset` to add a new subset. The subsets themselves may
        be modified in-place.
        """
        return self._task_subsets

    @property
    def is_fully_resolved(self) -> bool:
        """Whether all of this graph's nodes are resolved."""
        return self._universe is not None and all(
            self.dataset_types.is_resolved(k) for k in self.dataset_types
        )

    @property
    def is_sorted(self) -> bool:
        """Whether this graph's tasks and dataset types are topologically
        sorted with the exact same deterministic tiebreakers that `sort` would
        apply.

        This may perform (and then discard) a full sort if `has_been_sorted` is
        `False`. If the goal is to obtain a sorted graph, it is better to just
        call `sort` without guarding that with an ``if not graph.is_sorted``
        check.
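
        For example, this makes the following guard unnecessary (and
        potentially slower than just calling `sort` unconditionally)::

            if not graph.is_sorted:
                graph.sort()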
195 """
196 if self._sorted_keys is not None:
197 return True
198 return all(
199 sorted == unsorted
200 for sorted, unsorted in zip(
201 networkx.lexicographical_topological_sort(self._xgraph), self._xgraph, strict=True
202 )
203 )
205 @property
206 def has_been_sorted(self) -> bool:
207 """Whether this graph's tasks and dataset types have been
208 topologically sorted (with unspecified but deterministic tiebreakers)
209 since the last modification to the graph.
211 This may return `False` if the graph *happens* to be sorted but `sort`
212 was never called, but it is potentially much faster than `is_sorted`,
213 which may attempt (and then discard) a full sort if `has_been_sorted`
214 is `False`.
215 """
216 return self._sorted_keys is not None
218 def sort(self) -> None:
219 """Sort this graph's nodes topologically with deterministic (but
220 unspecified) tiebreakers.
222 This does nothing if the graph is already known to be sorted.
223 """
224 if self._sorted_keys is None:
225 try:
226 sorted_keys: Sequence[NodeKey] = list(networkx.lexicographical_topological_sort(self._xgraph))
227 except networkx.NetworkXUnfeasible as err: # pragma: no cover
228 # Should't be possible to get here, because we check for cycles
229 # when adding tasks, but we guard against it anyway.
230 cycle = networkx.find_cycle(self._xgraph)
231 raise PipelineDataCycleError(
232 f"Cycle detected while attempting to sort graph: {cycle}."
233 ) from err
234 self._reorder(sorted_keys)
236 def copy(self) -> PipelineGraph:
237 """Return a copy of this graph that copies all mutable state."""
238 xgraph = self._xgraph.copy()
239 result = PipelineGraph.__new__(PipelineGraph)
240 result._init_from_args(
241 xgraph,
242 self._sorted_keys,
243 task_subsets={
244 k: TaskSubset(xgraph, v.label, set(v._members), v.description)
245 for k, v in self._task_subsets.items()
246 },
247 description=self._description,
248 universe=self.universe,
249 data_id=self._raw_data_id,
250 )
251 return result
253 def __copy__(self) -> PipelineGraph:
254 # Fully shallow copies are dangerous; we don't want shared mutable
255 # state to lead to broken class invariants.
256 return self.copy()
258 def __deepcopy__(self, memo: dict) -> PipelineGraph:
259 # Genuine deep copies are unnecessary, since we should only ever care
260 # that mutable state is copied.
261 return self.copy()

    def producing_edge_of(self, dataset_type_name: str) -> WriteEdge | None:
        """Return the `WriteEdge` that links the producing task to the named
        dataset type.

        Parameters
        ----------
        dataset_type_name : `str`
            Dataset type name. Must not be a component.

        Returns
        -------
        edge : `WriteEdge` or `None`
            Producing edge or `None` if there isn't one in this graph.

        Raises
        ------
        DuplicateOutputError
            Raised if there are multiple tasks defined to produce this dataset
            type. This is only possible if the graph's dataset types are not
            resolved.

        Notes
        -----
        On resolved graphs, it may be slightly more efficient to use::

            graph.dataset_types[dataset_type_name].producing_edge

        but this method works on graphs with unresolved dataset types as well.
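
        For example, with a hypothetical dataset type name ``"coadd"``::

            edge = graph.producing_edge_of("coadd")
            if edge is not None:
                print(f"coadd is written by task {edge.task_label!r}")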
291 """
292 producer: str | None = None
293 producing_edge: WriteEdge | None = None
294 for _, _, producing_edge in self._xgraph.in_edges(
295 NodeKey(NodeType.DATASET_TYPE, dataset_type_name), data="instance"
296 ):
297 assert producing_edge is not None, "Should only be None if we never loop."
298 if producer is not None:
299 raise DuplicateOutputError(
300 f"Dataset type {dataset_type_name!r} is produced by both {producing_edge.task_label!r} "
301 f"and {producer!r}."
302 )
303 return producing_edge

    def consuming_edges_of(self, dataset_type_name: str) -> list[ReadEdge]:
        """Return the `ReadEdge` objects that link the named dataset type to
        the tasks that consume it.

        Parameters
        ----------
        dataset_type_name : `str`
            Dataset type name. Must not be a component.

        Returns
        -------
        edges : `list` [ `ReadEdge` ]
            Edges that connect this dataset type to the tasks that consume it.

        Notes
        -----
        On resolved graphs, it may be slightly more efficient to use::

            graph.dataset_types[dataset_type_name].consuming_edges

        but this method works on graphs with unresolved dataset types as well.
        """
        return [
            edge
            for _, _, edge in self._xgraph.out_edges(
                NodeKey(NodeType.DATASET_TYPE, dataset_type_name), data="instance"
            )
        ]

    def producer_of(self, dataset_type_name: str) -> TaskNode | TaskInitNode | None:
        """Return the `TaskNode` or `TaskInitNode` that writes the given
        dataset type.

        Parameters
        ----------
        dataset_type_name : `str`
            Dataset type name. Must not be a component.

        Returns
        -------
        node : `TaskNode`, `TaskInitNode`, or `None`
            Producing node or `None` if there isn't one in this graph.

        Raises
        ------
        DuplicateOutputError
            Raised if there are multiple tasks defined to produce this dataset
            type. This is only possible if the graph's dataset types are not
            resolved.
        """
        if (producing_edge := self.producing_edge_of(dataset_type_name)) is not None:
            return self._xgraph.nodes[producing_edge.task_key]["instance"]
        return None

    def consumers_of(self, dataset_type_name: str) -> list[TaskNode | TaskInitNode]:
        """Return the `TaskNode` and/or `TaskInitNode` objects that read
        the given dataset type.

        Parameters
        ----------
        dataset_type_name : `str`
            Dataset type name. Must not be a component.

        Returns
        -------
        nodes : `list` [ `TaskNode` or `TaskInitNode` ]
            Nodes for the tasks that consume this dataset type.

        Notes
        -----
        On resolved graphs, it may be slightly more efficient to use::

            graph.dataset_types[dataset_type_name].consuming_edges

        but this method works on graphs with unresolved dataset types as well.
        """
        return [
            self._xgraph.nodes[consuming_edge.task_key]["instance"]
            for consuming_edge in self.consuming_edges_of(dataset_type_name)
        ]

    def inputs_of(self, task_label: str, init: bool = False) -> dict[str, DatasetTypeNode | None]:
        """Return the dataset types that are inputs to a task.

        Parameters
        ----------
        task_label : `str`
            Label for the task in the pipeline.
        init : `bool`, optional
            If `True`, return init-input dataset types instead of runtime
            (including prerequisite) inputs.

        Returns
        -------
        inputs : `dict` [ `str`, `DatasetTypeNode` or `None` ]
            Dictionary with parent dataset type name keys and either
            `DatasetTypeNode` values (if the dataset type has been resolved)
            or `None` values.

        Notes
        -----
        To get the input edges of a task or task init node (which provide
        information about storage class overrides and components) use::

            graph.tasks[task_label].iter_all_inputs()

        or::

            graph.tasks[task_label].init.iter_all_inputs()

        or the various mapping attributes of the `TaskNode` and `TaskInitNode`
        class.
        """
        node: TaskNode | TaskInitNode = self.tasks[task_label] if not init else self.tasks[task_label].init
        return {
            edge.parent_dataset_type_name: self._xgraph.nodes[edge.dataset_type_key]["instance"]
            for edge in node.iter_all_inputs()
        }

    def outputs_of(
        self, task_label: str, init: bool = False, include_automatic_connections: bool = True
    ) -> dict[str, DatasetTypeNode | None]:
        """Return the dataset types that are outputs of a task.

        Parameters
        ----------
        task_label : `str`
            Label for the task in the pipeline.
        init : `bool`, optional
            If `True`, return init-output dataset types instead of runtime
            outputs.
        include_automatic_connections : `bool`, optional
            Whether to include automatic connections such as configs, metadata,
            and logs.

        Returns
        -------
        outputs : `dict` [ `str`, `DatasetTypeNode` or `None` ]
            Dictionary with parent dataset type name keys and either
            `DatasetTypeNode` values (if the dataset type has been resolved)
            or `None` values.

        Notes
        -----
        To get the output edges of a task or task init node (which provide
        information about storage class overrides and components) use::

            graph.tasks[task_label].iter_all_outputs()

        or::

            graph.tasks[task_label].init.iter_all_outputs()

        or the various mapping attributes of the `TaskNode` and `TaskInitNode`
        class.
        """
        node: TaskNode | TaskInitNode = self.tasks[task_label] if not init else self.tasks[task_label].init
        iterable = node.iter_all_outputs() if include_automatic_connections else node.outputs.values()
        return {
            edge.parent_dataset_type_name: self._xgraph.nodes[edge.dataset_type_key]["instance"]
            for edge in iterable
        }

    def resolve(self, registry: Registry) -> None:
        """Resolve all dimensions and dataset types and check them for
        consistency.

        Resolving a graph also causes it to be sorted.

        Parameters
        ----------
        registry : `lsst.daf.butler.Registry`
            Client for the data repository to resolve against.

        Notes
        -----
        The `universe` attribute is set to ``registry.dimensions`` and used to
        set all `TaskNode.dimensions` attributes. Dataset type nodes are
        resolved by first looking for a registry definition, then using the
        producing task's definition, then looking for consistency between all
        consuming task definitions.

        Raises
        ------
        ConnectionTypeConsistencyError
            Raised if a prerequisite input for one task appears as a different
            kind of connection in any other task.
        DuplicateOutputError
            Raised if multiple tasks have the same dataset type as an output.
        IncompatibleDatasetTypeError
            Raised if different tasks have different definitions of a dataset
            type. Different but compatible storage classes are permitted.
        MissingDatasetTypeError
            Raised if a dataset type definition is required to exist in the
            data repository but none was found. This should only occur for
            dataset types that are not produced by a task in the pipeline and
            are consumed with different storage classes or as components by
            tasks in the pipeline.
        """
        node_key: NodeKey
        updates: dict[NodeKey, TaskNode | DatasetTypeNode] = {}
        for node_key, node_state in self._xgraph.nodes.items():
            match node_key.node_type:
                case NodeType.TASK:
                    task_node: TaskNode = node_state["instance"]
                    new_task_node = task_node._resolved(registry.dimensions)
                    if new_task_node is not task_node:
                        updates[node_key] = new_task_node
                case NodeType.DATASET_TYPE:
                    dataset_type_node: DatasetTypeNode | None = node_state["instance"]
                    new_dataset_type_node = DatasetTypeNode._from_edges(
                        node_key, self._xgraph, registry, previous=dataset_type_node
                    )
                    # Usage of `is` here is intentional; `_from_edges` returns
                    # `previous=dataset_type_node` if it can determine that it
                    # doesn't need to change.
                    if new_dataset_type_node is not dataset_type_node:
                        updates[node_key] = new_dataset_type_node
        try:
            for node_key, node_value in updates.items():
                self._xgraph.nodes[node_key]["instance"] = node_value
        except Exception as err:  # pragma: no cover
            # There's no known way to get here, but we want to make it
            # clear it's a big problem if we do.
            raise PipelineGraphExceptionSafetyError(
                "Error during dataset type resolution has left the graph in an inconsistent state."
            ) from err
        self.sort()
        self._universe = registry.dimensions

    ###########################################################################
    #
    # Graph Modification Interface:
    #
    # - methods to add, remove, and replace tasks;
    #
    # - methods to add and remove task subsets.
    #
    # These are all things that are usually done in a Pipeline before making a
    # graph at all, but there may be cases where we want to modify the graph
    # instead. (These are also the methods used to make a graph from a
    # Pipeline, or make a graph from another graph.)
    #
    ###########################################################################

    def add_task(
        self,
        label: str,
        task_class: type[PipelineTask],
        config: PipelineTaskConfig,
        connections: PipelineTaskConnections | None = None,
    ) -> TaskNode:
        """Add a new task to the graph.

        Parameters
        ----------
        label : `str`
            Label for the task in the pipeline.
        task_class : `type` [ `PipelineTask` ]
            Class object for the task.
        config : `PipelineTaskConfig`
            Configuration for the task.
        connections : `PipelineTaskConnections`, optional
            Object that describes the dataset types used by the task. If not
            provided, one will be constructed from the given configuration. If
            provided, it is assumed that ``config`` has already been validated
            and frozen.

        Returns
        -------
        node : `TaskNode`
            The new task node added to the graph.

        Raises
        ------
        ValueError
            Raised if configuration validation failed when constructing
            ``connections``.
        PipelineDataCycleError
            Raised if the graph is cyclic after this addition.
        RuntimeError
            Raised if an unexpected exception (which will be chained) occurred
            at a stage that may have left the graph in an inconsistent state.
            Other exceptions should leave the graph unchanged.

        Notes
        -----
        Checks for dataset type consistency and multiple producers do not occur
        until `resolve` is called, since the resolution depends on both the
        state of the data repository and all contributing tasks.

        Adding new tasks removes any existing resolutions of all dataset types
        it references and marks the graph as unsorted. It is most efficient
        to add all tasks up front and only then resolve and/or sort the graph.
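
        Examples
        --------
        A minimal sketch, again assuming some hypothetical concrete task
        class ``MyTask``::

            config = MyTask.ConfigClass()
            node = graph.add_task("my_task", MyTask, config)
            assert graph.tasks["my_task"] is node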
602 """
603 task_node = TaskNode._from_imported_data(
604 key=NodeKey(NodeType.TASK, label),
605 init_key=NodeKey(NodeType.TASK_INIT, label),
606 data=_TaskNodeImportedData.configure(label, task_class, config, connections),
607 universe=self.universe,
608 )
609 self.add_task_nodes([task_node])
610 return task_node

    def add_task_nodes(self, nodes: Iterable[TaskNode], parent: PipelineGraph | None = None) -> None:
        """Add one or more existing task nodes to the graph.

        Parameters
        ----------
        nodes : `~collections.abc.Iterable` [ `TaskNode` ]
            Iterable of task nodes to add. If any tasks have resolved
            dimensions, they must have the same dimension universe as the rest
            of the graph.
        parent : `PipelineGraph`, optional
            If provided, another `PipelineGraph` from which these nodes were
            obtained. Any dataset type nodes already present in ``parent``
            that are referenced by the given tasks will be used in this graph
            if they are not already present, preserving any dataset type
            resolutions present in the parent graph. Adding nodes from a
            parent graph after the graph has its own nodes (e.g. from
            `add_task`) or nodes from a third graph may result in invalid
            dataset type resolutions. It is safest to only use this argument
            when populating an empty graph for the first time.

        Raises
        ------
        PipelineDataCycleError
            Raised if the graph is cyclic after this addition.

        Notes
        -----
        Checks for dataset type consistency and multiple producers do not occur
        until `resolve` is called, since the resolution depends on both the
        state of the data repository and all contributing tasks.

        Adding new tasks removes any existing resolutions of all dataset types
        it references (unless ``parent is not None``) and marks the graph as
        unsorted. It is most efficient to add all tasks up front and only then
        resolve and/or sort the graph.
        """
        node_data: list[tuple[NodeKey, dict[str, Any]]] = []
        edge_data: list[tuple[NodeKey, NodeKey, str, dict[str, Any]]] = []
        for task_node in nodes:
            task_node = task_node._resolved(self._universe)
            node_data.append(
                (task_node.key, {"instance": task_node, "bipartite": task_node.key.node_type.bipartite})
            )
            node_data.append(
                (
                    task_node.init.key,
                    {"instance": task_node.init, "bipartite": task_node.init.key.node_type.bipartite},
                )
            )
            # Convert the edge objects attached to the task node to networkx.
            for read_edge in task_node.init.iter_all_inputs():
                self._append_graph_data_from_edge(node_data, edge_data, read_edge, parent=parent)
            for write_edge in task_node.init.iter_all_outputs():
                self._append_graph_data_from_edge(node_data, edge_data, write_edge, parent=parent)
            for read_edge in task_node.iter_all_inputs():
                self._append_graph_data_from_edge(node_data, edge_data, read_edge, parent=parent)
            for write_edge in task_node.iter_all_outputs():
                self._append_graph_data_from_edge(node_data, edge_data, write_edge, parent=parent)
            # Add a special edge (with no Edge instance) that connects the
            # TaskInitNode to the runtime TaskNode.
            edge_data.append((task_node.init.key, task_node.key, Edge.INIT_TO_TASK_NAME, {"instance": None}))
        if not node_data and not edge_data:
            return
        # Checks and preparation complete; time to start the actual
        # modification, during which it's hard to provide strong exception
        # safety. Start by resetting the sort ordering, if there is one.
        self._reset()
        try:
            self._xgraph.add_nodes_from(node_data)
            self._xgraph.add_edges_from(edge_data)
            if not networkx.algorithms.dag.is_directed_acyclic_graph(self._xgraph):
                cycle = networkx.find_cycle(self._xgraph)
                raise PipelineDataCycleError(f"Cycle detected while adding tasks: {cycle}.")
        except Exception:
            # First try to roll back our changes.
            try:
                self._xgraph.remove_edges_from(edge_data)
                self._xgraph.remove_nodes_from(key for key, _ in node_data)
            except Exception as err:  # pragma: no cover
                # There's no known way to get here, but we want to make it
                # clear it's a big problem if we do.
                raise PipelineGraphExceptionSafetyError(
                    "Error while attempting to revert PipelineGraph modification has left the graph in "
                    "an inconsistent state."
                ) from err
            # Successfully rolled back; raise the original exception.
            raise

    def reconfigure_tasks(
        self,
        *args: tuple[str, PipelineTaskConfig],
        check_edges_unchanged: bool = False,
        assume_edges_unchanged: bool = False,
        **kwargs: PipelineTaskConfig,
    ) -> None:
        """Update the configuration for one or more tasks.

        Parameters
        ----------
        *args : `tuple` [ `str`, `.PipelineTaskConfig` ]
            Positional arguments are each a 2-tuple of task label and new
            config object. Note that the same arguments may also be passed as
            ``**kwargs``, which is usually more readable, but task labels in
            ``*args`` are not required to be valid Python identifiers.
        check_edges_unchanged : `bool`, optional
            If `True`, require the edges (connections) of the modified tasks to
            remain unchanged after the configuration updates, and verify that
            this is the case.
        assume_edges_unchanged : `bool`, optional
            If `True`, the caller declares that the edges (connections) of the
            modified tasks will remain unchanged after the configuration
            updates, and that it is unnecessary to check this.
        **kwargs : `.PipelineTaskConfig`
            New config objects or overrides to apply to copies of the current
            config objects, with task labels as the keywords.

        Raises
        ------
        ValueError
            Raised if ``assume_edges_unchanged`` and ``check_edges_unchanged``
            are both `True`, or if the same task appears twice.
        EdgesChangedError
            Raised if ``check_edges_unchanged=True`` and the edges of a task do
            change.

        Notes
        -----
        If reconfiguring a task causes its edges to change, any dataset type
        nodes connected to that task (not just those whose edges have changed!)
        will be unresolved.
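
        Examples
        --------
        The following two calls are equivalent, assuming ``new_config`` is a
        `.PipelineTaskConfig` instance appropriate for a task labeled
        ``"my_task"``::

            graph.reconfigure_tasks(("my_task", new_config))
            graph.reconfigure_tasks(my_task=new_config)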
742 """
743 new_configs: dict[str, PipelineTaskConfig] = {}
744 for task_label, config_update in itertools.chain(args, kwargs.items()):
745 if new_configs.setdefault(task_label, config_update) is not config_update:
746 raise ValueError(f"Config for {task_label!r} provided more than once.")
747 updates = {
748 task_label: self.tasks[task_label]._reconfigured(config, rebuild=not assume_edges_unchanged)
749 for task_label, config in new_configs.items()
750 }
751 self._replace_task_nodes(
752 updates,
753 check_edges_unchanged=check_edges_unchanged,
754 assume_edges_unchanged=assume_edges_unchanged,
755 message_header=(
756 "Unexpected change in edges for task {task_label!r} from original config (A) to "
757 "new configs (B):"
758 ),
759 )

    def remove_tasks(
        self, labels: Iterable[str], drop_from_subsets: bool = True
    ) -> list[tuple[TaskNode, set[str]]]:
        """Remove one or more tasks from the graph.

        Parameters
        ----------
        labels : `~collections.abc.Iterable` [ `str` ]
            Iterable of the labels of the tasks to remove.
        drop_from_subsets : `bool`, optional
            If `True`, drop each removed task from any subset in which it
            currently appears. If `False`, raise `PipelineGraphError` if any
            such subsets exist.

        Returns
        -------
        nodes_and_subsets : `list` [ `tuple` [ `TaskNode`, `set` [ `str` ] ] ]
            List of nodes removed and the labels of task subsets that
            referenced them.

        Raises
        ------
        PipelineGraphError
            Raised if ``drop_from_subsets`` is `False` and the task is still
            part of one or more subsets.

        Notes
        -----
        Removing a task will cause dataset nodes with no other referencing
        tasks to be removed. Any other dataset type nodes referenced by a
        removed task will be reset to an "unresolved" state.
        """
        task_nodes_and_subsets = []
        dataset_types: set[NodeKey] = set()
        nodes_to_remove = set()
        for label in labels:
            task_node: TaskNode = self._xgraph.nodes[NodeKey(NodeType.TASK, label)]["instance"]
            # Find task subsets that reference this task.
            referencing_subsets = {
                subset_label
                for subset_label, task_subset in self.task_subsets.items()
                if label in task_subset
            }
            if not drop_from_subsets and referencing_subsets:
                raise PipelineGraphError(
                    f"Task {label!r} is still referenced by subset(s) {referencing_subsets}."
                )
            task_nodes_and_subsets.append((task_node, referencing_subsets))
            # Find dataset types referenced by this task.
            dataset_types.update(self._xgraph.predecessors(task_node.key))
            dataset_types.update(self._xgraph.successors(task_node.key))
            dataset_types.update(self._xgraph.predecessors(task_node.init.key))
            dataset_types.update(self._xgraph.successors(task_node.init.key))
            # Since there's an edge between the task and its init node, we'll
            # have added those two nodes here, too, and we don't want that.
            dataset_types.remove(task_node.init.key)
            dataset_types.remove(task_node.key)
            # Mark the task node and its init node for removal from the graph.
            nodes_to_remove.add(task_node.key)
            nodes_to_remove.add(task_node.init.key)
        # Process the referenced datasets to see which ones are orphaned and
        # need to be removed vs. just unresolved.
        nodes_to_unresolve = []
        for dataset_type_key in dataset_types:
            related_tasks = set()
            related_tasks.update(self._xgraph.predecessors(dataset_type_key))
            related_tasks.update(self._xgraph.successors(dataset_type_key))
            related_tasks.difference_update(nodes_to_remove)
            if not related_tasks:
                nodes_to_remove.add(dataset_type_key)
            else:
                nodes_to_unresolve.append(dataset_type_key)
        # Checks and preparation complete; time to start the actual
        # modification, during which it's hard to provide strong exception
        # safety. Start by resetting the sort ordering.
        self._reset()
        try:
            for dataset_type_key in nodes_to_unresolve:
                self._xgraph.nodes[dataset_type_key]["instance"] = None
            for task_node, referencing_subsets in task_nodes_and_subsets:
                for subset_label in referencing_subsets:
                    self._task_subsets[subset_label].remove(task_node.label)
            self._xgraph.remove_nodes_from(nodes_to_remove)
        except Exception as err:  # pragma: no cover
            # There's no known way to get here, but we want to make it
            # clear it's a big problem if we do.
            raise PipelineGraphExceptionSafetyError(
                "Error during task removal has left the graph in an inconsistent state."
            ) from err
        return task_nodes_and_subsets

    def add_task_subset(self, subset_label: str, task_labels: Iterable[str], description: str = "") -> None:
        """Add a label for a set of tasks that are already in the pipeline.

        Parameters
        ----------
        subset_label : `str`
            Label for this set of tasks.
        task_labels : `~collections.abc.Iterable` [ `str` ]
            Labels of the tasks to include in the set. All must already be
            included in the graph.
        description : `str`, optional
            String description to associate with this label.
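
        Examples
        --------
        A sketch with hypothetical task labels::

            graph.add_task_subset("step1", ["isr", "calibrate"], "initial processing")
            assert "isr" in graph.task_subsets["step1"]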
864 """
865 subset = TaskSubset(self._xgraph, subset_label, set(task_labels), description)
866 self._task_subsets[subset_label] = subset
868 def remove_task_subset(self, subset_label: str) -> None:
869 """Remove a labeled set of tasks."""
870 del self._task_subsets[subset_label]

    ###########################################################################
    #
    # NetworkX Export Interface:
    #
    # - methods to export the PipelineGraph's content (or various subsets
    #   thereof) as NetworkX objects.
    #
    # These are particularly useful when writing tools to visualize the graph,
    # while providing options for which aspects of the graph (tasks, dataset
    # types, or both) to include, since all exported graphs have similar
    # attributes regardless of their structure.
    #
    ###########################################################################

    def make_xgraph(self) -> networkx.MultiDiGraph:
        """Export a networkx representation of the full pipeline graph,
        including both init and runtime edges.

        Returns
        -------
        xgraph : `networkx.MultiDiGraph`
            Directed acyclic graph with parallel edges.

        Notes
        -----
        The returned graph uses `NodeKey` instances for nodes. Parallel edges
        represent the same dataset type appearing in multiple connections for
        the same task, and are hence rare. The connection name is used as the
        edge key to disambiguate those parallel edges.

        Almost all edges connect dataset type nodes to task or task init nodes
        or vice versa, but there is also a special edge that connects each task
        init node to its runtime node. The existence of these edges makes the
        graph not quite bipartite, though its init-only and runtime-only
        subgraphs are bipartite.

        See `TaskNode`, `TaskInitNode`, `DatasetTypeNode`, `ReadEdge`, and
        `WriteEdge` for the descriptive node and edge attributes added.
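
        For example, to list every node and its type::

            xgraph = graph.make_xgraph()
            for key in xgraph:
                print(key.node_type, key.name)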
910 """
911 return self._transform_xgraph_state(self._xgraph.copy(), skip_edges=False)

    def make_bipartite_xgraph(self, init: bool = False) -> networkx.MultiDiGraph:
        """Return a bipartite networkx representation of just the runtime or
        init-time pipeline graph.

        Parameters
        ----------
        init : `bool`, optional
            If `True` (`False` is default) return the graph of task
            initialization nodes and init input/output dataset types, instead
            of the graph of runtime task nodes and regular
            input/output/prerequisite dataset types.

        Returns
        -------
        xgraph : `networkx.MultiDiGraph`
            Directed acyclic graph with parallel edges.

        Notes
        -----
        The returned graph uses `NodeKey` instances for nodes. Parallel edges
        represent the same dataset type appearing in multiple connections for
        the same task, and are hence rare. The connection name is used as the
        edge key to disambiguate those parallel edges.

        This graph is bipartite because each dataset type node only has edges
        that connect it to a task [init] node, and vice versa.

        See `TaskNode`, `TaskInitNode`, `DatasetTypeNode`, `ReadEdge`, and
        `WriteEdge` for the descriptive node and edge attributes added.
        """
        return self._transform_xgraph_state(
            self._make_bipartite_xgraph_internal(init).copy(), skip_edges=False
        )

    def make_task_xgraph(self, init: bool = False) -> networkx.DiGraph:
        """Return a networkx representation of just the tasks in the pipeline.

        Parameters
        ----------
        init : `bool`, optional
            If `True` (`False` is default) return the graph of task
            initialization nodes, instead of the graph of runtime task nodes.

        Returns
        -------
        xgraph : `networkx.DiGraph`
            Directed acyclic graph with no parallel edges.

        Notes
        -----
        The returned graph uses `NodeKey` instances for nodes. The dataset
        types that link these tasks are not represented at all; edges have no
        attributes, and there are no parallel edges.

        See `TaskNode` and `TaskInitNode` for the descriptive node attributes
        added.
        """
        bipartite_xgraph = self._make_bipartite_xgraph_internal(init)
        task_keys = [
            key
            for key, bipartite in bipartite_xgraph.nodes(data="bipartite")
            if bipartite == NodeType.TASK.bipartite
        ]
        return self._transform_xgraph_state(
            networkx.algorithms.bipartite.projected_graph(networkx.DiGraph(bipartite_xgraph), task_keys),
            skip_edges=True,
        )

    def make_dataset_type_xgraph(self, init: bool = False) -> networkx.DiGraph:
        """Return a networkx representation of just the dataset types in the
        pipeline.

        Parameters
        ----------
        init : `bool`, optional
            If `True` (`False` is default) return the graph of init input and
            output dataset types, instead of the graph of runtime (input,
            output, prerequisite input) dataset types.

        Returns
        -------
        xgraph : `networkx.DiGraph`
            Directed acyclic graph with no parallel edges.

        Notes
        -----
        The returned graph uses `NodeKey` instances for nodes. The tasks that
        link these dataset types are not represented at all; edges have no
        attributes, and there are no parallel edges.

        See `DatasetTypeNode` for the descriptive node attributes added.
        """
        bipartite_xgraph = self._make_bipartite_xgraph_internal(init)
        dataset_type_keys = [
            key
            for key, bipartite in bipartite_xgraph.nodes(data="bipartite")
            if bipartite == NodeType.DATASET_TYPE.bipartite
        ]
        return self._transform_xgraph_state(
            networkx.algorithms.bipartite.projected_graph(
                networkx.DiGraph(bipartite_xgraph), dataset_type_keys
            ),
            skip_edges=True,
        )

    ###########################################################################
    #
    # Serialization Interface.
    #
    # Serialization of PipelineGraphs is currently experimental and may not be
    # retained in the future. All serialization methods are
    # underscore-prefixed to ensure nobody mistakes them for a stable
    # interface (let alone a stable file format).
    #
    ###########################################################################

    @classmethod
    def _read_stream(
        cls, stream: BinaryIO, import_mode: TaskImportMode = TaskImportMode.REQUIRE_CONSISTENT_EDGES
    ) -> PipelineGraph:
        """Read a serialized `PipelineGraph` from a file-like object.

        Parameters
        ----------
        stream : `BinaryIO`
            File-like object opened for binary reading, containing
            gzip-compressed JSON.
        import_mode : `TaskImportMode`, optional
            Whether to import tasks, and how to reconcile any differences
            between the imported task's connections and those that were
            persisted with the graph. Default is to check that they are the
            same.

        Returns
        -------
        graph : `PipelineGraph`
            Deserialized pipeline graph.

        Raises
        ------
        PipelineGraphReadError
            Raised if the serialized `PipelineGraph` is not self-consistent.
        EdgesChangedError
            Raised if ``import_mode`` is
            `TaskImportMode.REQUIRE_CONSISTENT_EDGES` and the edges of a task
            did change after import and reconfiguration.

        Notes
        -----
        `PipelineGraph` serialization is currently experimental and may be
        removed or significantly changed in the future, with no deprecation
        period.
        """
        from .io import SerializedPipelineGraph

        with gzip.open(stream, "rb") as uncompressed_stream:
            data = json.load(uncompressed_stream)
        serialized_graph = SerializedPipelineGraph.parse_obj(data)
        return serialized_graph.deserialize(import_mode)

    @classmethod
    def _read_uri(
        cls,
        uri: ResourcePathExpression,
        import_mode: TaskImportMode = TaskImportMode.REQUIRE_CONSISTENT_EDGES,
    ) -> PipelineGraph:
        """Read a serialized `PipelineGraph` from a file at a URI.

        Parameters
        ----------
        uri : convertible to `lsst.resources.ResourcePath`
            URI to a gzip-compressed JSON file containing a serialized pipeline
            graph.
        import_mode : `TaskImportMode`, optional
            Whether to import tasks, and how to reconcile any differences
            between the imported task's connections and those that were
            persisted with the graph. Default is to check that they are the
            same.

        Returns
        -------
        graph : `PipelineGraph`
            Deserialized pipeline graph.

        Raises
        ------
        PipelineGraphReadError
            Raised if the serialized `PipelineGraph` is not self-consistent.
        EdgesChangedError
            Raised if ``import_mode`` is
            `TaskImportMode.REQUIRE_CONSISTENT_EDGES` and the edges of a task
            did change after import and reconfiguration.

        Notes
        -----
        `PipelineGraph` serialization is currently experimental and may be
        removed or significantly changed in the future, with no deprecation
        period.
        """
        uri = ResourcePath(uri)
        with uri.open("rb") as stream:
            return cls._read_stream(cast(BinaryIO, stream), import_mode=import_mode)

    def _write_stream(self, stream: BinaryIO) -> None:
        """Write the pipeline to a file-like object.

        Parameters
        ----------
        stream : `BinaryIO`
            File-like object opened for binary writing.

        Notes
        -----
        `PipelineGraph` serialization is currently experimental and may be
        removed or significantly changed in the future, with no deprecation
        period.

        The file format is gzipped JSON, and is intended to be human-readable,
        but it should not be considered a stable public interface for outside
        code, which should always use `PipelineGraph` methods (or at least the
        `io.SerializedPipelineGraph` class) to read these files.
        """
        from .io import SerializedPipelineGraph

        with gzip.open(stream, mode="wb") as compressed_stream:
            compressed_stream.write(
                SerializedPipelineGraph.serialize(self).json(exclude_defaults=True).encode("utf-8")
            )

    def _write_uri(self, uri: ResourcePathExpression) -> None:
        """Write the pipeline to a file given a URI.

        Parameters
        ----------
        uri : convertible to `lsst.resources.ResourcePath`
            URI to write to. May have a ``.json.gz`` extension or no extension
            (in which case a ``.json.gz`` extension will be added).

        Notes
        -----
        `PipelineGraph` serialization is currently experimental and may be
        removed or significantly changed in the future, with no deprecation
        period.

        The file format is gzipped JSON, and is intended to be human-readable,
        but it should not be considered a stable public interface for outside
        code, which should always use `PipelineGraph` methods (or at least the
        `io.SerializedPipelineGraph` class) to read these files.
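
        For example, a sketch of a round trip through this experimental
        interface::

            graph._write_uri("pipeline.json.gz")
            restored = PipelineGraph._read_uri("pipeline.json.gz")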
1161 """
1162 uri = ResourcePath(uri)
1163 extension = uri.getExtension()
1164 if not extension:
1165 uri = uri.updatedExtension(".json.gz")
1166 elif extension != ".json.gz":
1167 raise ValueError("Expanded pipeline files should always have a .json.gz extension.")
1168 with uri.open(mode="wb") as stream:
1169 self._write_stream(cast(BinaryIO, stream))

    def _import_and_configure(
        self, import_mode: TaskImportMode = TaskImportMode.REQUIRE_CONSISTENT_EDGES
    ) -> None:
        """Import the `PipelineTask` classes referenced by all task nodes and
        update those nodes accordingly.

        Parameters
        ----------
        import_mode : `TaskImportMode`, optional
            Whether to import tasks, and how to reconcile any differences
            between the imported task's connections and those that were
            persisted with the graph. Default is to check that they are the
            same. This method does nothing if this is
            `TaskImportMode.DO_NOT_IMPORT`.

        Raises
        ------
        EdgesChangedError
            Raised if ``import_mode`` is
            `TaskImportMode.REQUIRE_CONSISTENT_EDGES` and the edges of a task
            did change after import and reconfiguration.

        Notes
        -----
        This method shouldn't need to be called unless the graph was
        deserialized without importing and configuring immediately, which is
        not the default behavior (but it can greatly speed up deserialization).
        If all tasks have already been imported this does nothing.

        Importing and configuring a task can change its
        `~TaskNode.task_class_name` or `~TaskNode.get_config_str` output,
        usually because the software used to read a serialized graph is newer
        than the software used to write it (e.g. a new config option has been
        added, or the task was moved to a new module with a forwarding alias
        left behind). These changes are allowed by
        `TaskImportMode.REQUIRE_CONSISTENT_EDGES`.

        If importing and configuring a task causes its edges to change, any
        dataset type nodes linked to those edges will be reset to the
        unresolved state.
        """
        if import_mode is TaskImportMode.DO_NOT_IMPORT:
            return
        rebuild = (
            import_mode is TaskImportMode.REQUIRE_CONSISTENT_EDGES
            or import_mode is TaskImportMode.OVERRIDE_EDGES
        )
        updates: dict[str, TaskNode] = {}
        node_key: NodeKey
        for node_key, node_state in self._xgraph.nodes.items():
            if node_key.node_type is NodeType.TASK:
                task_node: TaskNode = node_state["instance"]
                new_task_node = task_node._imported_and_configured(rebuild)
                if new_task_node is not task_node:
                    updates[task_node.label] = new_task_node
        self._replace_task_nodes(
            updates,
            check_edges_unchanged=(import_mode is TaskImportMode.REQUIRE_CONSISTENT_EDGES),
            assume_edges_unchanged=(import_mode is TaskImportMode.ASSUME_CONSISTENT_EDGES),
            message_header=(
                "In task with label {task_label!r}, persisted edges (A) "
                "differ from imported and configured edges (B):"
            ),
        )

    ###########################################################################
    #
    # Advanced PipelineGraph Inspection Interface:
    #
    # - methods to iterate over all nodes and edges, utilizing NodeKeys;
    #
    # - methods to find overall inputs and group nodes by their dimensions,
    #   which are important operations for QuantumGraph generation.
    #
    ###########################################################################

    def iter_edges(self, init: bool = False) -> Iterator[Edge]:
        """Iterate over edges in the graph.

        Parameters
        ----------
        init : `bool`, optional
            If `True` (`False` is default) iterate over the edges between task
            initialization node and init input/output dataset types, instead of
            the runtime task nodes and regular input/output/prerequisite
            dataset types.

        Returns
        -------
        edges : `~collections.abc.Iterator` [ `Edge` ]
            A lazy iterator over `Edge` (`WriteEdge` or `ReadEdge`) instances.

        Notes
        -----
        This method always returns *either* init edges or runtime edges, never
        both. The full (internal) graph that contains both also includes a
        special edge that connects each task init node to its runtime node;
        that is also never returned by this method, since it is never a part of
        the init-only or runtime-only subgraphs.
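
        For example, to print the runtime connections in the graph::

            for edge in graph.iter_edges():
                print(edge.task_label, edge.parent_dataset_type_name)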
1270 """
1271 edge: Edge
1272 for _, _, edge in self._xgraph.edges(data="instance"):
1273 if edge is not None and edge.is_init == init:
1274 yield edge

    def iter_nodes(
        self,
    ) -> Iterator[
        tuple[Literal[NodeType.TASK_INIT], str, TaskInitNode]
        | tuple[Literal[NodeType.TASK], str, TaskNode]
        | tuple[Literal[NodeType.DATASET_TYPE], str, DatasetTypeNode | None]
    ]:
        """Iterate over nodes in the graph.

        Returns
        -------
        nodes : `~collections.abc.Iterator` [ `tuple` ]
            A lazy iterator over all of the nodes in the graph. Each yielded
            element is a tuple of:

            - the node type enum value (`NodeType`);
            - the string name for the node (task label or parent dataset type
              name);
            - the node value (`TaskNode`, `TaskInitNode`, `DatasetTypeNode`,
              or `None` for dataset type nodes that have not been resolved).
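
        For example::

            for node_type, name, value in graph.iter_nodes():
                if value is None:
                    print(f"dataset type {name!r} is not resolved")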
1296 """
1297 key: NodeKey
1298 if self._sorted_keys is not None:
1299 for key in self._sorted_keys:
1300 yield key.node_type, key.name, self._xgraph.nodes[key]["instance"] # type: ignore
1301 else:
1302 for key, node in self._xgraph.nodes(data="instance"):
1303 yield key.node_type, key.name, node # type: ignore

    def iter_overall_inputs(self) -> Iterator[tuple[str, DatasetTypeNode | None]]:
        """Iterate over all of the dataset types that are consumed but not
        produced by the graph.

        Returns
        -------
        dataset_types : `~collections.abc.Iterator` [ `tuple` ]
            A lazy iterator over the overall-input dataset types (including
            overall init inputs and prerequisites). Each yielded element is a
            tuple of:

            - the parent dataset type name;
            - the resolved `DatasetTypeNode`, or `None` if the dataset type
              has not been resolved.
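
        For example, to list the names of all overall inputs::

            names = [name for name, _ in graph.iter_overall_inputs()]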
1319 """
1320 for generation in networkx.algorithms.dag.topological_generations(self._xgraph):
1321 key: NodeKey
1322 for key in generation:
1323 # While we expect all tasks to have at least one input and
1324 # hence never appear in the first topological generation, that
1325 # is not true of task init nodes.
1326 if key.node_type is NodeType.DATASET_TYPE:
1327 yield key.name, self._xgraph.nodes[key]["instance"]
1328 return

    def group_by_dimensions(
        self, prerequisites: bool = False
    ) -> dict[DimensionGroup, tuple[dict[str, TaskNode], dict[str, DatasetTypeNode]]]:
        """Group this graph's tasks and dataset types by their dimensions.

        Parameters
        ----------
        prerequisites : `bool`, optional
            If `True`, include prerequisite dataset types as well as regular
            input and output datasets (including intermediates).

        Returns
        -------
        groups : `dict` [ `DimensionGroup`, `tuple` ]
            A dictionary of groups keyed by `DimensionGroup`, in which each
            value is a tuple of:

            - a `dict` of `TaskNode` instances, keyed by task label;
            - a `dict` of `DatasetTypeNode` instances, keyed by dataset type
              name;

            that have those dimensions.

        Notes
        -----
        Init inputs and outputs are always included, but always have empty
        dimensions and hence are all grouped together.
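
        Examples
        --------
        A sketch of walking the grouped result (the graph must be resolved
        first)::

            for dimensions, (tasks, dataset_types) in graph.group_by_dimensions().items():
                print(dimensions, sorted(tasks), sorted(dataset_types))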
1357 """
1358 result: dict[DimensionGroup, tuple[dict[str, TaskNode], dict[str, DatasetTypeNode]]] = {}
1359 next_new_value: tuple[dict[str, TaskNode], dict[str, DatasetTypeNode]] = ({}, {})
1360 for task_label, task_node in self.tasks.items():
1361 if task_node.dimensions is None:
1362 raise UnresolvedGraphError(f"Task with label {task_label!r} has not been resolved.")
1363 if (group := result.setdefault(task_node.dimensions, next_new_value)) is next_new_value:
1364 next_new_value = ({}, {}) # make new lists for next time
1365 group[0][task_node.label] = task_node
1366 for dataset_type_name, dataset_type_node in self.dataset_types.items():
1367 if dataset_type_node is None:
1368 raise UnresolvedGraphError(f"Dataset type {dataset_type_name!r} has not been resolved.")
1369 if not dataset_type_node.is_prerequisite or prerequisites:
1370 if (
1371 group := result.setdefault(
1372 dataset_type_node.dataset_type.dimensions.as_group(), next_new_value
1373 )
1374 ) is next_new_value:
1375 next_new_value = ({}, {}) # make new lists for next time
1376 group[1][dataset_type_node.name] = dataset_type_node
1377 return result

    def split_independent(self) -> Iterable[PipelineGraph]:
        """Iterate over independent subgraphs that together comprise this
        pipeline graph.

        Returns
        -------
        subgraphs : `Iterable` [ `PipelineGraph` ]
            An iterable over component subgraphs that could be run
            independently (they have only overall inputs in common). May be a
            lazy iterator.

        Notes
        -----
        All resolved dataset type nodes will be preserved.

        If there is only one component, ``self`` may be returned as the only
        element in the iterable.

        If `has_been_sorted`, all subgraphs will be sorted as well.
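
        For example, to see which tasks end up in each independent piece::

            for subgraph in graph.split_independent():
                print(list(subgraph.tasks))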
1398 """
1399 # Having an overall input in common isn't enough to make subgraphs
1400 # dependent on each other, so we want to look for connected component
1401 # subgraphs of the task-only projected graph.
1402 bipartite_xgraph = self._make_bipartite_xgraph_internal(init=False)
1403 task_keys = {
1404 key
1405 for key, bipartite in bipartite_xgraph.nodes(data="bipartite")
1406 if bipartite == NodeType.TASK.bipartite
1407 }
1408 task_xgraph = networkx.algorithms.bipartite.projected_graph(
1409 networkx.DiGraph(bipartite_xgraph), task_keys
1410 )
1411 # "Weakly" connected means connected in only one direction, which is
1412 # the only kind of "connected" a DAG can ever be.
1413 for component_task_keys in networkx.algorithms.weakly_connected_components(task_xgraph):
1414 if component_task_keys == task_keys:
1415 yield self
1416 return
1417 else:
1418 component_subgraph = PipelineGraph(universe=self._universe)
1419 component_subgraph.add_task_nodes(
1420 [self._xgraph.nodes[key]["instance"] for key in component_task_keys], parent=self
1421 )
1422 if self.has_been_sorted:
1423 component_subgraph.sort()
1424 yield component_subgraph

    ###########################################################################
    #
    # Class- and Package-Private Methods.
    #
    ###########################################################################

    def _iter_task_defs(self) -> Iterator[TaskDef]:
        """Iterate over this pipeline as a sequence of `TaskDef` instances.

        Notes
        -----
        This is a package-private method intended to aid in the transition to a
        codebase more fully integrated with the `PipelineGraph` class, in which
        both `TaskDef` and `PipelineDatasetTypes` are expected to go away, and
        much of the functionality on the `Pipeline` class will be moved to
        `PipelineGraph` as well.

        Raises
        ------
        TaskNotImportedError
            Raised if `TaskNode.is_imported` is `False` for any task.
        """
        from ..pipeline import TaskDef

        for node in self._tasks.values():
            yield TaskDef(
                config=node.config,
                taskClass=node.task_class,
                label=node.label,
                connections=node._get_imported_data().connections,
            )
1458 def _init_from_args(
1459 self,
1460 xgraph: networkx.MultiDiGraph | None,
1461 sorted_keys: Sequence[NodeKey] | None,
1462 task_subsets: dict[str, TaskSubset] | None,
1463 description: str,
1464 universe: DimensionUniverse | None,
1465 data_id: DataId | None,
1466 ) -> None:
1467 """Initialize the graph with possibly-nontrivial arguments.
1469 Parameters
1470 ----------
1471 xgraph : `networkx.MultiDiGraph` or `None`
1472 The backing networkx graph, or `None` to create an empty one.
1473 This graph has `NodeKey` instances for nodes and the same structure
1474 as the graph exported by `make_xgraph`, but its nodes and edges
1475 have a single ``instance`` attribute that holds a `TaskNode`,
1476 `TaskInitNode`, `DatasetTypeNode` (or `None`), `ReadEdge`, or
1477 `WriteEdge` instance.
1478 sorted_keys : `Sequence` [ `NodeKey` ] or `None`
1479 Topologically sorted sequence of node keys, or `None` if the graph
1480 is not sorted.
1481 task_subsets : `dict` [ `str`, `TaskSubset` ]
1482 Labeled subsets of tasks. Values must be constructed with
1483 ``xgraph`` as their parent graph.
1484 description : `str`
1485 String description for this pipeline.
1486 universe : `lsst.daf.butler.DimensionUniverse` or `None`
1487 Definitions of all dimensions.
1488 data_id : `lsst.daf.butler.DataCoordinate` or other data ID mapping.
1489 Data ID that represents a constraint on all quanta generated from
1490 this pipeline.
1492 Notes
1493 -----
1494 Only empty `PipelineGraph` instances should be constructed directly by
1495 users, which sets the signature of ``__init__`` itself, but methods on
1496 `PipelineGraph` and its helper classes need to be able to create them
1497 with state. Those methods can call this after calling ``__new__``
1498 manually, skipping ``__init__``.
1499 """
1500 self._xgraph = xgraph if xgraph is not None else networkx.MultiDiGraph()
1501 self._sorted_keys: Sequence[NodeKey] | None = None
1502 self._task_subsets = task_subsets if task_subsets is not None else {}
1503 self._description = description
1504 self._tasks = TaskMappingView(self._xgraph)
1505 self._dataset_types = DatasetTypeMappingView(self._xgraph)
1506 self._raw_data_id: dict[str, Any]
1507 if isinstance(data_id, DataCoordinate):
1508 if universe is None:
1509 universe = data_id.universe
1510 else:
1511 assert universe is data_id.universe, "data_id.universe and given universe differ"
1512 self._raw_data_id = dict(data_id.required)
1513 elif data_id is None:
1514 self._raw_data_id = {}
1515 else:
1516 self._raw_data_id = dict(data_id)
1517 self._universe = universe
1518 if sorted_keys is not None:
1519 self._reorder(sorted_keys)
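# Minimal sketch of the construction pattern described in the Notes above
# (``backing_xgraph`` and ``universe`` are hypothetical, prebuilt values):
#
#     graph = PipelineGraph.__new__(PipelineGraph)
#     graph._init_from_args(
#         xgraph=backing_xgraph,   # a populated networkx.MultiDiGraph
#         sorted_keys=None,        # graph not (yet) topologically sorted
#         task_subsets={},         # no labeled task subsets
#         description="copied pipeline",
#         universe=universe,
#         data_id=None,
#     )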
1521 def _make_bipartite_xgraph_internal(self, init: bool) -> networkx.MultiDiGraph:
1522 """Make a bipartite init-only or runtime-only internal subgraph.
1524 See `make_bipartite_xgraph` for parameters and return values.
1526 Notes
1527 -----
1528 This method returns a view of the `PipelineGraph` object's internal
1529 backing graph, and hence should only be called in methods that copy the
1530 result either explicitly or by running a copying algorithm before
1531 returning it to the user.
1532 """
1533 return self._xgraph.edge_subgraph([edge.key for edge in self.iter_edges(init)])
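# Sketch of the copy-before-return contract from the Notes above; a public
# wrapper might look roughly like this (an assumption about the caller,
# not necessarily the actual public method):
#
#     def make_bipartite_xgraph(self, init: bool = False) -> networkx.MultiDiGraph:
#         return self._transform_xgraph_state(
#             self._make_bipartite_xgraph_internal(init).copy(),
#             skip_edges=False,
#         )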
1535 def _transform_xgraph_state(self, xgraph: _G, skip_edges: bool) -> _G:
1536 """Transform networkx graph attributes in-place from the internal
1537 "instance" attributes to the documented exported attributes.
1539 Parameters
1540 ----------
1541 xgraph : `networkx.DiGraph` or `networkx.MultiDiGraph`
1542 Graph whose state should be transformed.
1543 skip_edges : `bool`
1544 If `True`, do not transform edge state.
1546 Returns
1547 -------
1548 xgraph : `networkx.DiGraph` or `networkx.MultiDiGraph`
1549 The same object passed in, after modification.
1551 Notes
1552 -----
1553 This should be called after making a copy of the internal graph but
1554 before any projection down to just task or dataset type nodes, since
1555 it assumes stateful edges.
1556 """
1557 state: dict[str, Any]
1558 for state in xgraph.nodes.values():
1559 node_value: TaskInitNode | TaskNode | DatasetTypeNode | None = state.pop("instance")
1560 if node_value is not None:
1561 state.update(node_value._to_xgraph_state())
1562 else:
1563 # This is a dataset type node that is not resolved.
1564 state["bipartite"] = NodeType.DATASET_TYPE.bipartite
1565 if not skip_edges:
1566 for _, _, state in xgraph.edges(data=True):
1567 edge: Edge | None = state.pop("instance", None)
1568 if edge is not None:
1569 state.update(edge._to_xgraph_state())
1570 return xgraph
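# Illustrative before/after of the node-state rewrite performed above
# (attribute names other than "instance" and "bipartite" are assumptions
# about what ``_to_xgraph_state`` returns):
#
#     before: {"instance": <TaskNode>}
#     after:  {"bipartite": ..., <other documented attributes>}
#
# Unresolved dataset type nodes (``instance is None``) keep only the
# "bipartite" marker, which is the attribute networkx's bipartite
# algorithms use to partition nodes.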
1572 def _replace_task_nodes(
1573 self,
1574 updates: Mapping[str, TaskNode],
1575 check_edges_unchanged: bool,
1576 assume_edges_unchanged: bool,
1577 message_header: str,
1578 ) -> None:
1579 """Replace task nodes and update edges and dataset type nodes
1580 accordingly.
1582 Parameters
1583 ----------
1584 updates : `Mapping` [ `str`, `TaskNode` ]
1585 New task nodes, keyed by task label. All keys must be task labels
1586 that are already present in the graph.
1587 check_edges_unchanged : `bool`
1588 If `True`, require the edges (connections) of the modified tasks to
1589 remain unchanged after importing and configuring each task, and
1590 verify that this is the case.
1591 assume_edges_unchanged : `bool`
1592 If `True`, the caller declares that the edges (connections) of the
1593 modified tasks will remain unchanged after importing and
1594 configuring each task, and that it is unnecessary to check this.
1595 message_header : `str`
1596 Template for `str.format` with a single ``task_label`` placeholder
1597 to use as the first line in `EdgesChangedError` messages that show
1598 the differences between new task edges and old task edges. Should
1599 include the fact that the rest of the message will refer to the old
1600 task as "A" and the new task as "B", and end with a colon.
1602 Raises
1603 ------
1604 ValueError
1605 Raised if ``assume_edges_unchanged`` and ``check_edges_unchanged``
1606 are both `True`.
1608 EdgesChangedError
1609 Raised if ``check_edges_unchanged=True`` and the edges of a task do
1610 change.
1611 """
1612 deep: dict[str, TaskNode] = {}
1613 shallow: dict[str, TaskNode] = {}
1614 if assume_edges_unchanged:
1615 if check_edges_unchanged:
1616 raise ValueError("Cannot simultaneously assume and check that edges have not changed.")
1617 shallow.update(updates)
1618 else:
1619 for task_label, new_task_node in updates.items():
1620 old_task_node = self.tasks[task_label]
1621 messages = old_task_node.diff_edges(new_task_node)
1622 if messages:
1623 if check_edges_unchanged:
1624 messages.insert(0, message_header.format(task_label=task_label))
1625 raise EdgesChangedError("\n".join(messages))
1626 else:
1627 deep[task_label] = new_task_node
1628 else:
1629 shallow[task_label] = new_task_node
1630 try:
1631 if deep:
1632 removed = self.remove_tasks(deep.keys(), drop_from_subsets=True)
1633 self.add_task_nodes(deep.values())
1634 for replaced_task_node, referencing_subsets in removed:
1635 for subset_label in referencing_subsets:
1636 self._task_subsets[subset_label].add(replaced_task_node.label)
1637 for task_node in shallow.values():
1638 self._xgraph.nodes[task_node.key]["instance"] = task_node
1639 self._xgraph.nodes[task_node.init.key]["instance"] = task_node.init
1640 except PipelineGraphExceptionSafetyError: # pragma: no cover
1641 raise
1642 except Exception as err: # pragma: no cover
1643 # There's no known way to get here, but we want to make it clear
1644 # it's a big problem if we do.
1645 raise PipelineGraphExceptionSafetyError(
1646 "Error while replacing tasks has left the graph in an inconsistent state."
1647 ) from err
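# Hypothetical call illustrating the two paths above: with
# ``check_edges_unchanged=True``, a task whose connections differ takes
# the error branch rather than the "deep" remove-and-re-add path, while
# unchanged tasks are swapped in place ("shallow"):
#
#     graph._replace_task_nodes(
#         {"isr": new_isr_node},   # "isr" and new_isr_node are hypothetical
#         check_edges_unchanged=True,
#         assume_edges_unchanged=False,
#         message_header="Edges of task {task_label!r} changed (A=old, B=new):",
#     )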
1649 def _append_graph_data_from_edge(
1650 self,
1651 node_data: list[tuple[NodeKey, dict[str, Any]]],
1652 edge_data: list[tuple[NodeKey, NodeKey, str, dict[str, Any]]],
1653 edge: Edge,
1654 parent: PipelineGraph | None,
1655 ) -> None:
1656 """Append networkx state dictionaries for an edge and the corresponding
1657 dataset type node.
1659 Parameters
1660 ----------
1661 node_data : `list`
1662 List of node keys and state dictionaries. An entry is appended
1663 only if a node does not already exist for this edge's dataset type.
1664 edge_data : `list`
1665 List of node key pairs, connection names, and state dictionaries
1666 for edges.
1667 edge : `Edge`
1668 New edge being processed.
1669 parent : `PipelineGraph` or `None`
1670 Another pipeline graph whose dataset type nodes should be used
1671 when present.
1672 """
1673 new_dataset_type_node = None
1674 if parent is not None:
1675 new_dataset_type_node = parent._xgraph.nodes[edge.dataset_type_key].get("instance")
1676 if (existing_dataset_type_state := self._xgraph.nodes.get(edge.dataset_type_key)) is not None:
1677 existing_dataset_type_state["instance"] = new_dataset_type_node
1678 else:
1679 node_data.append(
1680 (
1681 edge.dataset_type_key,
1682 {
1683 "instance": new_dataset_type_node,
1684 "bipartite": NodeType.DATASET_TYPE.bipartite,
1685 },
1686 )
1687 )
1688 edge_data.append(
1689 edge.nodes
1690 + (
1691 edge.connection_name,
1692 {"instance": edge},
1693 )
1694 )
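# Sketch of the caller-side pattern this helper supports (the loop over
# edges is an assumption; the networkx bulk-add calls are real API):
#
#     node_data: list[tuple[NodeKey, dict[str, Any]]] = []
#     edge_data: list[tuple[NodeKey, NodeKey, str, dict[str, Any]]] = []
#     for edge in edges:   # hypothetical iterable of `Edge` instances
#         self._append_graph_data_from_edge(node_data, edge_data, edge, parent)
#     self._xgraph.add_nodes_from(node_data)   # (key, attrs) 2-tuples
#     self._xgraph.add_edges_from(edge_data)   # (u, v, key, attrs) 4-tuples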
1696 def _reorder(self, sorted_keys: Sequence[NodeKey]) -> None:
1697 """Set the order of all views of this graph from the given sorted
1698 sequence of task labels and dataset type names.
1699 """
1700 self._sorted_keys = sorted_keys
1701 self._tasks._reorder(sorted_keys)
1702 self._dataset_types._reorder(sorted_keys)
1704 def _reset(self) -> None:
1705 """Reset the all views of this graph following a modification that
1706 might invalidate them.
1707 """
1708 self._sorted_keys = None
1709 self._tasks._reset()
1710 self._dataset_types._reset()
1712 _xgraph: networkx.MultiDiGraph
1713 _sorted_keys: Sequence[NodeKey] | None
1714 _task_subsets: dict[str, TaskSubset]
1715 _description: str
1716 _tasks: TaskMappingView
1717 _dataset_types: DatasetTypeMappingView
1718 _raw_data_id: dict[str, Any]
1719 _universe: DimensionUniverse | None