# This file is part of pipe_base.
#
# Developed for the LSST Data Management System.
# This product includes software developed by the LSST Project
# (http://www.lsst.org).
# See the COPYRIGHT file at the top-level directory of this distribution
# for details of code ownership.
#
# This program is free software: you can redistribute it and/or modify
# it under the terms of the GNU General Public License as published by
# the Free Software Foundation, either version 3 of the License, or
# (at your option) any later version.
#
# This program is distributed in the hope that it will be useful,
# but WITHOUT ANY WARRANTY; without even the implied warranty of
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
# GNU General Public License for more details.
#
# You should have received a copy of the GNU General Public License
# along with this program. If not, see <http://www.gnu.org/licenses/>.
from __future__ import annotations
__all__ = ("PipelineGraph",)

import gzip
import itertools
import json
from collections.abc import Iterable, Iterator, Mapping, Sequence
from typing import TYPE_CHECKING, Any, BinaryIO, Literal, TypeVar, cast

import networkx
import networkx.algorithms.bipartite
import networkx.algorithms.dag
from lsst.daf.butler import DataCoordinate, DataId, DimensionGraph, DimensionUniverse, Registry
from lsst.resources import ResourcePath, ResourcePathExpression

from ._dataset_types import DatasetTypeNode
from ._edges import Edge, ReadEdge, WriteEdge
from ._exceptions import (
    DuplicateOutputError,
    EdgesChangedError,
    PipelineDataCycleError,
    PipelineGraphError,
    PipelineGraphExceptionSafetyError,
    UnresolvedGraphError,
)
from ._mapping_views import DatasetTypeMappingView, TaskMappingView
from ._nodes import NodeKey, NodeType
from ._task_subsets import TaskSubset
from ._tasks import TaskImportMode, TaskInitNode, TaskNode, _TaskNodeImportedData

if TYPE_CHECKING:
    from ..config import PipelineTaskConfig
    from ..connections import PipelineTaskConnections
    from ..pipeline import TaskDef
    from ..pipelineTask import PipelineTask


_G = TypeVar("_G", bound=networkx.DiGraph | networkx.MultiDiGraph)


class PipelineGraph:
    """A graph representation of a fully-configured pipeline.

    `PipelineGraph` instances are typically constructed by calling
    `.Pipeline.to_graph`, but in rare cases constructing and then populating
    an empty one may be preferable.

    Parameters
    ----------
    description : `str`, optional
        String description for this pipeline.
    universe : `lsst.daf.butler.DimensionUniverse`, optional
        Definitions for all butler dimensions. If not provided, some
        attributes will not be available until `resolve` is called.
    data_id : `lsst.daf.butler.DataCoordinate` or other data ID, optional
        Data ID that represents a constraint on all quanta generated by this
        pipeline. This typically just holds the instrument constraint included
        in the pipeline definition, if there was one.
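
    Examples
    --------
    A minimal population sketch; ``MyTask`` stands in for any concrete
    `PipelineTask` subclass (it is not defined in this module)::

        graph = PipelineGraph(description="example")
        graph.add_task("my_task", MyTask, MyTask.ConfigClass())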
80 """
82 ###########################################################################
83 #
84 # Simple Pipeline Graph Inspection Interface:
85 #
86 # - for inspecting graph structure, not modifying it (except to sort and]
87 # resolve);
88 #
89 # - no NodeKey objects, just string dataset type name and task label keys;
90 #
91 # - graph structure is represented as a pair of mappings, with methods to
92 # find neighbors and edges of nodes.
93 #
94 ###########################################################################
96 def __init__(
97 self,
98 *,
99 description: str = "",
100 universe: DimensionUniverse | None = None,
101 data_id: DataId | None = None,
102 ) -> None:
103 self._init_from_args(
104 xgraph=None,
105 sorted_keys=None,
106 task_subsets=None,
107 description=description,
108 universe=universe,
109 data_id=data_id,
110 )

    def __repr__(self) -> str:
        return f"{type(self).__name__}({self.description!r}, tasks={self.tasks!s})"

    @property
    def description(self) -> str:
        """String description for this pipeline."""
        return self._description

    @description.setter
    def description(self, value: str) -> None:
        # Docstring in setter.
        self._description = value

    @property
    def universe(self) -> DimensionUniverse | None:
        """Definitions for all butler dimensions."""
        return self._universe

    @property
    def data_id(self) -> DataCoordinate:
        """Data ID that represents a constraint on all quanta generated from
        this pipeline.

        This may not be available unless `universe` is not `None`.
        """
        return DataCoordinate.standardize(self._raw_data_id, universe=self.universe)

    @property
    def tasks(self) -> TaskMappingView:
        """A mapping view of the tasks in the graph.

        This mapping has `str` task label keys and `TaskNode` values.
        Iteration is topologically and deterministically ordered if and only
        if `sort` has been called since the last modification to the graph.
        """
        return self._tasks

    @property
    def dataset_types(self) -> DatasetTypeMappingView:
        """A mapping view of the dataset types in the graph.

        This mapping has `str` parent dataset type name keys, but only
        provides access to its `DatasetTypeNode` values if `resolve` has been
        called since the last modification involving a task that uses a
        dataset type. See `DatasetTypeMappingView` for details.
        """
        return self._dataset_types

    @property
    def task_subsets(self) -> Mapping[str, TaskSubset]:
        """A mapping of all labeled subsets of tasks.

        Keys are subset labels, values are sets of task labels. See
        `TaskSubset` for more information.

        Use `add_task_subset` to add a new subset. The subsets themselves may
        be modified in-place.
        """
        return self._task_subsets

    @property
    def is_sorted(self) -> bool:
        """Whether this graph's tasks and dataset types are topologically
        sorted with the exact same deterministic tiebreakers that `sort` would
        apply.

        This may perform (and then discard) a full sort if `has_been_sorted`
        is `False`. If the goal is to obtain a sorted graph, it is better to
        just call `sort` without guarding that with an
        ``if not graph.is_sorted`` check.
        """
        if self._sorted_keys is not None:
            return True
        return all(
            sorted == unsorted
            for sorted, unsorted in zip(
                networkx.lexicographical_topological_sort(self._xgraph), self._xgraph, strict=True
            )
        )

    @property
    def has_been_sorted(self) -> bool:
        """Whether this graph's tasks and dataset types have been
        topologically sorted (with unspecified but deterministic tiebreakers)
        since the last modification to the graph.

        This may return `False` if the graph *happens* to be sorted but `sort`
        was never called, but it is potentially much faster than `is_sorted`,
        which may attempt (and then discard) a full sort if `has_been_sorted`
        is `False`.
        """
        return self._sorted_keys is not None

    def sort(self) -> None:
        """Sort this graph's nodes topologically with deterministic (but
        unspecified) tiebreakers.

        This does nothing if the graph is already known to be sorted.
        """
        if self._sorted_keys is None:
            try:
                sorted_keys: Sequence[NodeKey] = list(networkx.lexicographical_topological_sort(self._xgraph))
            except networkx.NetworkXUnfeasible as err:  # pragma: no cover
                # Shouldn't be possible to get here, because we check for
                # cycles when adding tasks, but we guard against it anyway.
                cycle = networkx.find_cycle(self._xgraph)
                raise PipelineDataCycleError(
                    f"Cycle detected while attempting to sort graph: {cycle}."
                ) from err
            self._reorder(sorted_keys)

    def copy(self) -> PipelineGraph:
        """Return a copy of this graph that copies all mutable state."""
        xgraph = self._xgraph.copy()
        result = PipelineGraph.__new__(PipelineGraph)
        result._init_from_args(
            xgraph,
            self._sorted_keys,
            task_subsets={
                k: TaskSubset(xgraph, v.label, set(v._members), v.description)
                for k, v in self._task_subsets.items()
            },
            description=self._description,
            universe=self.universe,
            data_id=self._raw_data_id,
        )
        return result

    def __copy__(self) -> PipelineGraph:
        # Fully shallow copies are dangerous; we don't want shared mutable
        # state to lead to broken class invariants.
        return self.copy()

    def __deepcopy__(self, memo: dict) -> PipelineGraph:
        # Genuine deep copies are unnecessary, since we should only ever care
        # that mutable state is copied.
        return self.copy()

    def producing_edge_of(self, dataset_type_name: str) -> WriteEdge | None:
        """Return the `WriteEdge` that links the producing task to the named
        dataset type.

        Parameters
        ----------
        dataset_type_name : `str`
            Dataset type name. Must not be a component.

        Returns
        -------
        edge : `WriteEdge` or `None`
            Producing edge or `None` if there isn't one in this graph.

        Raises
        ------
        DuplicateOutputError
            Raised if there are multiple tasks defined to produce this dataset
            type. This is only possible if the graph's dataset types are not
            resolved.

        Notes
        -----
        On resolved graphs, it may be slightly more efficient to use::

            graph.dataset_types[dataset_type_name].producing_edge

        but this method works on graphs with unresolved dataset types as well.
        """
        producer: str | None = None
        producing_edge: WriteEdge | None = None
        for _, _, producing_edge in self._xgraph.in_edges(
            NodeKey(NodeType.DATASET_TYPE, dataset_type_name), data="instance"
        ):
            assert producing_edge is not None, "Should only be None if we never loop."
            if producer is not None:
                raise DuplicateOutputError(
                    f"Dataset type {dataset_type_name!r} is produced by both {producing_edge.task_label!r} "
                    f"and {producer!r}."
                )
            # Remember this producer so a second pass through the loop can
            # detect a duplicate output.
            producer = producing_edge.task_label
        return producing_edge

    def consuming_edges_of(self, dataset_type_name: str) -> list[ReadEdge]:
        """Return the `ReadEdge` objects that link the named dataset type to
        the tasks that consume it.

        Parameters
        ----------
        dataset_type_name : `str`
            Dataset type name. Must not be a component.

        Returns
        -------
        edges : `list` [ `ReadEdge` ]
            Edges that connect this dataset type to the tasks that consume it.

        Notes
        -----
        On resolved graphs, it may be slightly more efficient to use::

            graph.dataset_types[dataset_type_name].consuming_edges

        but this method works on graphs with unresolved dataset types as well.
        """
        return [
            edge
            for _, _, edge in self._xgraph.out_edges(
                NodeKey(NodeType.DATASET_TYPE, dataset_type_name), data="instance"
            )
        ]

    def producer_of(self, dataset_type_name: str) -> TaskNode | TaskInitNode | None:
        """Return the `TaskNode` or `TaskInitNode` that writes the given
        dataset type.

        Parameters
        ----------
        dataset_type_name : `str`
            Dataset type name. Must not be a component.

        Returns
        -------
        node : `TaskNode`, `TaskInitNode`, or `None`
            Producing node or `None` if there isn't one in this graph.

        Raises
        ------
        DuplicateOutputError
            Raised if there are multiple tasks defined to produce this dataset
            type. This is only possible if the graph's dataset types are not
            resolved.
        """
        if (producing_edge := self.producing_edge_of(dataset_type_name)) is not None:
            return self._xgraph.nodes[producing_edge.task_key]["instance"]
        return None

    def consumers_of(self, dataset_type_name: str) -> list[TaskNode | TaskInitNode]:
        """Return the `TaskNode` and/or `TaskInitNode` objects that read
        the given dataset type.

        Parameters
        ----------
        dataset_type_name : `str`
            Dataset type name. Must not be a component.

        Returns
        -------
        tasks : `list` [ `TaskNode` or `TaskInitNode` ]
            Task nodes that read this dataset type.

        Notes
        -----
        On resolved graphs, it may be slightly more efficient to use::

            graph.dataset_types[dataset_type_name].consuming_edges

        but this method works on graphs with unresolved dataset types as well.
        """
        return [
            self._xgraph.nodes[consuming_edge.task_key]["instance"]
            for consuming_edge in self.consuming_edges_of(dataset_type_name)
        ]

    def inputs_of(self, task_label: str, init: bool = False) -> dict[str, DatasetTypeNode | None]:
        """Return the dataset types that are inputs to a task.

        Parameters
        ----------
        task_label : `str`
            Label for the task in the pipeline.
        init : `bool`, optional
            If `True`, return init-input dataset types instead of runtime
            (including prerequisite) inputs.

        Returns
        -------
        inputs : `dict` [ `str`, `DatasetTypeNode` or `None` ]
            Dictionary with parent dataset type name keys and either
            `DatasetTypeNode` values (if the dataset type has been resolved)
            or `None` values.

        Notes
        -----
        To get the input edges of a task or task init node (which provide
        information about storage class overrides and components) use::

            graph.tasks[task_label].iter_all_inputs()

        or

            graph.tasks[task_label].init.iter_all_inputs()

        or the various mapping attributes of the `TaskNode` and `TaskInitNode`
        class.
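
        Examples
        --------
        A sketch of printing each input of a task; the ``"isr"`` label is
        illustrative and could be any task label in the graph::

            for name, node in graph.inputs_of("isr").items():
                if node is None:
                    print(f"{name}: not resolved")
                else:
                    print(f"{name}: {node.dataset_type}")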
404 """
405 node: TaskNode | TaskInitNode = self.tasks[task_label] if not init else self.tasks[task_label].init
406 return {
407 edge.parent_dataset_type_name: self._xgraph.nodes[edge.dataset_type_key]["instance"]
408 for edge in node.iter_all_inputs()
409 }

    def outputs_of(
        self, task_label: str, init: bool = False, include_automatic_connections: bool = True
    ) -> dict[str, DatasetTypeNode | None]:
        """Return the dataset types that are outputs of a task.

        Parameters
        ----------
        task_label : `str`
            Label for the task in the pipeline.
        init : `bool`, optional
            If `True`, return init-output dataset types instead of runtime
            outputs.
        include_automatic_connections : `bool`, optional
            Whether to include automatic connections such as configs,
            metadata, and logs.

        Returns
        -------
        outputs : `dict` [ `str`, `DatasetTypeNode` or `None` ]
            Dictionary with parent dataset type name keys and either
            `DatasetTypeNode` values (if the dataset type has been resolved)
            or `None` values.

        Notes
        -----
        To get the output edges of a task or task init node (which provide
        information about storage class overrides and components) use::

            graph.tasks[task_label].iter_all_outputs()

        or

            graph.tasks[task_label].init.iter_all_outputs()

        or the various mapping attributes of the `TaskNode` and `TaskInitNode`
        class.
        """
        node: TaskNode | TaskInitNode = self.tasks[task_label] if not init else self.tasks[task_label].init
        iterable = node.iter_all_outputs() if include_automatic_connections else node.outputs.values()
        return {
            edge.parent_dataset_type_name: self._xgraph.nodes[edge.dataset_type_key]["instance"]
            for edge in iterable
        }

    def resolve(self, registry: Registry) -> None:
        """Resolve all dimensions and dataset types and check them for
        consistency.

        Resolving a graph also causes it to be sorted.

        Parameters
        ----------
        registry : `lsst.daf.butler.Registry`
            Client for the data repository to resolve against.

        Notes
        -----
        The `universe` attribute is set to ``registry.dimensions`` and used to
        set all `TaskNode.dimensions` attributes. Dataset type nodes are
        resolved by first looking for a registry definition, then using the
        producing task's definition, then looking for consistency between all
        consuming task definitions.

        Raises
        ------
        ConnectionTypeConsistencyError
            Raised if a prerequisite input for one task appears as a different
            kind of connection in any other task.
        DuplicateOutputError
            Raised if multiple tasks have the same dataset type as an output.
        IncompatibleDatasetTypeError
            Raised if different tasks have different definitions of a dataset
            type. Different but compatible storage classes are permitted.
        MissingDatasetTypeError
            Raised if a dataset type definition is required to exist in the
            data repository but none was found. This should only occur for
            dataset types that are not produced by a task in the pipeline and
            are consumed with different storage classes or as components by
            tasks in the pipeline.
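
        Examples
        --------
        Resolving against an existing data repository, assuming an
        already-constructed `~lsst.daf.butler.Butler` client named ``butler``
        (not defined here)::

            graph.resolve(butler.registry)
            assert graph.is_sorted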
493 """
494 node_key: NodeKey
495 updates: dict[NodeKey, TaskNode | DatasetTypeNode] = {}
496 for node_key, node_state in self._xgraph.nodes.items():
497 match node_key.node_type:
498 case NodeType.TASK:
499 task_node: TaskNode = node_state["instance"]
500 new_task_node = task_node._resolved(registry.dimensions)
501 if new_task_node is not task_node:
502 updates[node_key] = new_task_node
503 case NodeType.DATASET_TYPE:
504 dataset_type_node: DatasetTypeNode | None = node_state["instance"]
505 new_dataset_type_node = DatasetTypeNode._from_edges(
506 node_key, self._xgraph, registry, previous=dataset_type_node
507 )
508 # Usage of `is`` here is intentional; `_from_edges` returns
509 # `previous=dataset_type_node` if it can determine that it
510 # doesn't need to change.
511 if new_dataset_type_node is not dataset_type_node:
512 updates[node_key] = new_dataset_type_node
513 try:
514 for node_key, node_value in updates.items():
515 self._xgraph.nodes[node_key]["instance"] = node_value
516 except Exception as err: # pragma: no cover
517 # There's no known way to get here, but we want to make it
518 # clear it's a big problem if we do.
519 raise PipelineGraphExceptionSafetyError(
520 "Error during dataset type resolution has left the graph in an inconsistent state."
521 ) from err
522 self.sort()
523 self._universe = registry.dimensions

    ###########################################################################
    #
    # Graph Modification Interface:
    #
    # - methods to add, remove, and replace tasks;
    #
    # - methods to add and remove task subsets.
    #
    # These are all things that are usually done in a Pipeline before making a
    # graph at all, but there may be cases where we want to modify the graph
    # instead. (These are also the methods used to make a graph from a
    # Pipeline, or make a graph from another graph.)
    #
    ###########################################################################

    def add_task(
        self,
        label: str,
        task_class: type[PipelineTask],
        config: PipelineTaskConfig,
        connections: PipelineTaskConnections | None = None,
    ) -> TaskNode:
        """Add a new task to the graph.

        Parameters
        ----------
        label : `str`
            Label for the task in the pipeline.
        task_class : `type` [ `PipelineTask` ]
            Class object for the task.
        config : `PipelineTaskConfig`
            Configuration for the task.
        connections : `PipelineTaskConnections`, optional
            Object that describes the dataset types used by the task. If not
            provided, one will be constructed from the given configuration. If
            provided, it is assumed that ``config`` has already been validated
            and frozen.

        Returns
        -------
        node : `TaskNode`
            The new task node added to the graph.

        Raises
        ------
        ValueError
            Raised if configuration validation failed when constructing
            ``connections``.
        PipelineDataCycleError
            Raised if the graph is cyclic after this addition.
        RuntimeError
            Raised if an unexpected exception (which will be chained) occurred
            at a stage that may have left the graph in an inconsistent state.
            Other exceptions should leave the graph unchanged.

        Notes
        -----
        Checks for dataset type consistency and multiple producers do not
        occur until `resolve` is called, since the resolution depends on both
        the state of the data repository and all contributing tasks.

        Adding new tasks removes any existing resolutions of all dataset types
        they reference and marks the graph as unsorted. It is most efficient
        to add all tasks up front and only then resolve and/or sort the graph.
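
        Examples
        --------
        A sketch of adding a task and inspecting the result; ``MyTask`` is a
        hypothetical `PipelineTask` subclass, not defined here::

            node = graph.add_task("my_task", MyTask, MyTask.ConfigClass())
            for edge in node.iter_all_inputs():
                print(edge.parent_dataset_type_name)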
589 """
590 task_node = TaskNode._from_imported_data(
591 key=NodeKey(NodeType.TASK, label),
592 init_key=NodeKey(NodeType.TASK_INIT, label),
593 data=_TaskNodeImportedData.configure(label, task_class, config, connections),
594 universe=self.universe,
595 )
596 self.add_task_nodes([task_node])
597 return task_node

    def add_task_nodes(self, nodes: Iterable[TaskNode], parent: PipelineGraph | None = None) -> None:
        """Add one or more existing task nodes to the graph.

        Parameters
        ----------
        nodes : `~collections.abc.Iterable` [ `TaskNode` ]
            Iterable of task nodes to add. If any tasks have resolved
            dimensions, they must have the same dimension universe as the rest
            of the graph.
        parent : `PipelineGraph`, optional
            If provided, another `PipelineGraph` from which these nodes were
            obtained. Any dataset type nodes already present in ``parent``
            that are referenced by the given tasks will be used in this graph
            if they are not already present, preserving any dataset type
            resolutions present in the parent graph. Adding nodes from a
            parent graph after the graph has its own nodes (e.g. from
            `add_task`) or nodes from a third graph may result in invalid
            dataset type resolutions. It is safest to only use this argument
            when populating an empty graph for the first time.

        Raises
        ------
        PipelineDataCycleError
            Raised if the graph is cyclic after this addition.

        Notes
        -----
        Checks for dataset type consistency and multiple producers do not
        occur until `resolve` is called, since the resolution depends on both
        the state of the data repository and all contributing tasks.

        Adding new tasks removes any existing resolutions of all dataset types
        they reference (unless ``parent is not None``) and marks the graph as
        unsorted. It is most efficient to add all tasks up front and only then
        resolve and/or sort the graph.
        """
        node_data: list[tuple[NodeKey, dict[str, Any]]] = []
        edge_data: list[tuple[NodeKey, NodeKey, str, dict[str, Any]]] = []
        for task_node in nodes:
            task_node = task_node._resolved(self._universe)
            node_data.append(
                (task_node.key, {"instance": task_node, "bipartite": task_node.key.node_type.bipartite})
            )
            node_data.append(
                (
                    task_node.init.key,
                    {"instance": task_node.init, "bipartite": task_node.init.key.node_type.bipartite},
                )
            )
            # Convert the edge objects attached to the task node to networkx.
            for read_edge in task_node.init.iter_all_inputs():
                self._append_graph_data_from_edge(node_data, edge_data, read_edge, parent=parent)
            for write_edge in task_node.init.iter_all_outputs():
                self._append_graph_data_from_edge(node_data, edge_data, write_edge, parent=parent)
            for read_edge in task_node.iter_all_inputs():
                self._append_graph_data_from_edge(node_data, edge_data, read_edge, parent=parent)
            for write_edge in task_node.iter_all_outputs():
                self._append_graph_data_from_edge(node_data, edge_data, write_edge, parent=parent)
            # Add a special edge (with no Edge instance) that connects the
            # TaskInitNode to the runtime TaskNode.
            edge_data.append((task_node.init.key, task_node.key, Edge.INIT_TO_TASK_NAME, {"instance": None}))
        if not node_data and not edge_data:
            return
        # Checks and preparation complete; time to start the actual
        # modification, during which it's hard to provide strong exception
        # safety. Start by resetting the sort ordering, if there is one.
        self._reset()
        try:
            self._xgraph.add_nodes_from(node_data)
            self._xgraph.add_edges_from(edge_data)
            if not networkx.algorithms.dag.is_directed_acyclic_graph(self._xgraph):
                cycle = networkx.find_cycle(self._xgraph)
                raise PipelineDataCycleError(f"Cycle detected while adding tasks: {cycle}.")
        except Exception:
            # First try to roll back our changes.
            try:
                self._xgraph.remove_edges_from(edge_data)
                self._xgraph.remove_nodes_from(key for key, _ in node_data)
            except Exception as err:  # pragma: no cover
                # There's no known way to get here, but we want to make it
                # clear it's a big problem if we do.
                raise PipelineGraphExceptionSafetyError(
                    "Error while attempting to revert PipelineGraph modification has left the graph in "
                    "an inconsistent state."
                ) from err
            # Successfully rolled back; raise the original exception.
            raise

    def reconfigure_tasks(
        self,
        *args: tuple[str, PipelineTaskConfig],
        check_edges_unchanged: bool = False,
        assume_edges_unchanged: bool = False,
        **kwargs: PipelineTaskConfig,
    ) -> None:
        """Update the configuration for one or more tasks.

        Parameters
        ----------
        *args : `tuple` [ `str`, `.PipelineTaskConfig` ]
            Positional arguments are each a 2-tuple of task label and new
            config object. Note that the same arguments may also be passed as
            ``**kwargs``, which is usually more readable, but task labels in
            ``*args`` are not required to be valid Python identifiers.
        check_edges_unchanged : `bool`, optional
            If `True`, require the edges (connections) of the modified tasks
            to remain unchanged after the configuration updates, and verify
            that this is the case.
        assume_edges_unchanged : `bool`, optional
            If `True`, the caller declares that the edges (connections) of the
            modified tasks will remain unchanged after the configuration
            updates, and that it is unnecessary to check this.
        **kwargs : `.PipelineTaskConfig`
            New config objects or overrides to apply to copies of the current
            config objects, with task labels as the keywords.

        Raises
        ------
        ValueError
            Raised if ``assume_edges_unchanged`` and ``check_edges_unchanged``
            are both `True`, or if the same task appears twice.
        EdgesChangedError
            Raised if ``check_edges_unchanged=True`` and the edges of a task
            do change.

        Notes
        -----
        If reconfiguring a task causes its edges to change, any dataset type
        nodes connected to that task (not just those whose edges have
        changed!) will be unresolved.
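
        Examples
        --------
        Equivalent positional and keyword forms, assuming a task labeled
        ``"my_task"`` and a new config object ``new_config`` for it::

            graph.reconfigure_tasks(("my_task", new_config))
            graph.reconfigure_tasks(my_task=new_config)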
729 """
730 new_configs: dict[str, PipelineTaskConfig] = {}
731 for task_label, config_update in itertools.chain(args, kwargs.items()):
732 if new_configs.setdefault(task_label, config_update) is not config_update:
733 raise ValueError(f"Config for {task_label!r} provided more than once.")
734 updates = {
735 task_label: self.tasks[task_label]._reconfigured(config, rebuild=not assume_edges_unchanged)
736 for task_label, config in new_configs.items()
737 }
738 self._replace_task_nodes(
739 updates,
740 check_edges_unchanged=check_edges_unchanged,
741 assume_edges_unchanged=assume_edges_unchanged,
742 message_header=(
743 "Unexpected change in edges for task {task_label!r} from original config (A) to "
744 "new configs (B):"
745 ),
746 )

    def remove_tasks(
        self, labels: Iterable[str], drop_from_subsets: bool = True
    ) -> list[tuple[TaskNode, set[str]]]:
        """Remove one or more tasks from the graph.

        Parameters
        ----------
        labels : `~collections.abc.Iterable` [ `str` ]
            Iterable of the labels of the tasks to remove.
        drop_from_subsets : `bool`, optional
            If `True`, drop each removed task from any subset in which it
            currently appears. If `False`, raise `PipelineGraphError` if any
            such subsets exist.

        Returns
        -------
        nodes_and_subsets : `list` [ `tuple` [ `TaskNode`, `set` [ `str` ] ] ]
            List of nodes removed and the labels of task subsets that
            referenced them.

        Raises
        ------
        PipelineGraphError
            Raised if ``drop_from_subsets`` is `False` and the task is still
            part of one or more subsets.

        Notes
        -----
        Removing a task will cause dataset nodes with no other referencing
        tasks to be removed. Any other dataset type nodes referenced by a
        removed task will be reset to an "unresolved" state.
        """
        task_nodes_and_subsets = []
        dataset_types: set[NodeKey] = set()
        nodes_to_remove = set()
        for label in labels:
            task_node: TaskNode = self._xgraph.nodes[NodeKey(NodeType.TASK, label)]["instance"]
            # Find task subsets that reference this task.
            referencing_subsets = {
                subset_label
                for subset_label, task_subset in self.task_subsets.items()
                if label in task_subset
            }
            if not drop_from_subsets and referencing_subsets:
                raise PipelineGraphError(
                    f"Task {label!r} is still referenced by subset(s) {referencing_subsets}."
                )
            task_nodes_and_subsets.append((task_node, referencing_subsets))
            # Find dataset types referenced by this task.
            dataset_types.update(self._xgraph.predecessors(task_node.key))
            dataset_types.update(self._xgraph.successors(task_node.key))
            dataset_types.update(self._xgraph.predecessors(task_node.init.key))
            dataset_types.update(self._xgraph.successors(task_node.init.key))
            # Since there's an edge between the task and its init node, we'll
            # have added those two nodes here, too, and we don't want that.
            dataset_types.remove(task_node.init.key)
            dataset_types.remove(task_node.key)
            # Mark the task node and its init node for removal from the graph.
            nodes_to_remove.add(task_node.key)
            nodes_to_remove.add(task_node.init.key)
        # Process the referenced datasets to see which ones are orphaned and
        # need to be removed vs. just unresolved.
        nodes_to_unresolve = []
        for dataset_type_key in dataset_types:
            related_tasks = set()
            related_tasks.update(self._xgraph.predecessors(dataset_type_key))
            related_tasks.update(self._xgraph.successors(dataset_type_key))
            related_tasks.difference_update(nodes_to_remove)
            if not related_tasks:
                nodes_to_remove.add(dataset_type_key)
            else:
                nodes_to_unresolve.append(dataset_type_key)
        # Checks and preparation complete; time to start the actual
        # modification, during which it's hard to provide strong exception
        # safety. Start by resetting the sort ordering.
        self._reset()
        try:
            for dataset_type_key in nodes_to_unresolve:
                self._xgraph.nodes[dataset_type_key]["instance"] = None
            for task_node, referencing_subsets in task_nodes_and_subsets:
                for subset_label in referencing_subsets:
                    self._task_subsets[subset_label].remove(task_node.label)
            self._xgraph.remove_nodes_from(nodes_to_remove)
        except Exception as err:  # pragma: no cover
            # There's no known way to get here, but we want to make it
            # clear it's a big problem if we do.
            raise PipelineGraphExceptionSafetyError(
                "Error during task removal has left the graph in an inconsistent state."
            ) from err
        return task_nodes_and_subsets

    def add_task_subset(self, subset_label: str, task_labels: Iterable[str], description: str = "") -> None:
        """Add a label for a set of tasks that are already in the pipeline.

        Parameters
        ----------
        subset_label : `str`
            Label for this set of tasks.
        task_labels : `~collections.abc.Iterable` [ `str` ]
            Labels of the tasks to include in the set. All must already be
            included in the graph.
        description : `str`, optional
            String description to associate with this label.
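
        Examples
        --------
        Grouping two previously-added tasks; the labels here are purely
        illustrative::

            graph.add_task_subset("step1", ["isr", "characterizeImage"], "early processing")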
851 """
852 subset = TaskSubset(self._xgraph, subset_label, set(task_labels), description)
853 self._task_subsets[subset_label] = subset
855 def remove_task_subset(self, subset_label: str) -> None:
856 """Remove a labeled set of tasks."""
857 del self._task_subsets[subset_label]

    ###########################################################################
    #
    # NetworkX Export Interface:
    #
    # - methods to export the PipelineGraph's content (or various subsets
    #   thereof) as NetworkX objects.
    #
    # These are particularly useful when writing tools to visualize the graph,
    # while providing options for which aspects of the graph (tasks, dataset
    # types, or both) to include, since all exported graphs have similar
    # attributes regardless of their structure.
    #
    ###########################################################################

    def make_xgraph(self) -> networkx.MultiDiGraph:
        """Export a networkx representation of the full pipeline graph,
        including both init and runtime edges.

        Returns
        -------
        xgraph : `networkx.MultiDiGraph`
            Directed acyclic graph with parallel edges.

        Notes
        -----
        The returned graph uses `NodeKey` instances for nodes. Parallel edges
        represent the same dataset type appearing in multiple connections for
        the same task, and are hence rare. The connection name is used as the
        edge key to disambiguate those parallel edges.

        Almost all edges connect dataset type nodes to task or task init nodes
        or vice versa, but there is also a special edge that connects each task
        init node to its runtime node. The existence of these edges makes the
        graph not quite bipartite, though its init-only and runtime-only
        subgraphs are bipartite.

        See `TaskNode`, `TaskInitNode`, `DatasetTypeNode`, `ReadEdge`, and
        `WriteEdge` for the descriptive node and edge attributes added.
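
        Examples
        --------
        A small sketch of exporting the graph and walking it with plain
        networkx calls::

            xgraph = graph.make_xgraph()
            for key in networkx.topological_sort(xgraph):
                print(key.node_type, key.name)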
897 """
898 return self._transform_xgraph_state(self._xgraph.copy(), skip_edges=False)
900 def make_bipartite_xgraph(self, init: bool = False) -> networkx.MultiDiGraph:
901 """Return a bipartite networkx representation of just the runtime or
902 init-time pipeline graph.
904 Parameters
905 ----------
906 init : `bool`, optional
907 If `True` (`False` is default) return the graph of task
908 initialization nodes and init input/output dataset types, instead
909 of the graph of runtime task nodes and regular
910 input/output/prerequisite dataset types.
912 Returns
913 -------
914 xgraph : `networkx.MultiDiGraph`
915 Directed acyclic graph with parallel edges.
917 Notes
918 -----
919 The returned graph uses `NodeKey` instances for nodes. Parallel edges
920 represent the same dataset type appearing in multiple connections for
921 the same task, and are hence rare. The connection name is used as the
922 edge key to disambiguate those parallel edges.
924 This graph is bipartite because each dataset type node only has edges
925 that connect it to a task [init] node, and vice versa.
927 See `TaskNode`, `TaskInitNode`, `DatasetTypeNode`, `ReadEdge`, and
928 `WriteEdge` for the descriptive node and edge attributes added.
929 """
930 return self._transform_xgraph_state(
931 self._make_bipartite_xgraph_internal(init).copy(), skip_edges=False
932 )

    def make_task_xgraph(self, init: bool = False) -> networkx.DiGraph:
        """Return a networkx representation of just the tasks in the pipeline.

        Parameters
        ----------
        init : `bool`, optional
            If `True` (`False` is default) return the graph of task
            initialization nodes, instead of the graph of runtime task nodes.

        Returns
        -------
        xgraph : `networkx.DiGraph`
            Directed acyclic graph with no parallel edges.

        Notes
        -----
        The returned graph uses `NodeKey` instances for nodes. The dataset
        types that link these tasks are not represented at all; edges have no
        attributes, and there are no parallel edges.

        See `TaskNode` and `TaskInitNode` for the descriptive node attributes
        added.
        """
        bipartite_xgraph = self._make_bipartite_xgraph_internal(init)
        task_keys = [
            key
            for key, bipartite in bipartite_xgraph.nodes(data="bipartite")
            if bipartite == NodeType.TASK.bipartite
        ]
        return self._transform_xgraph_state(
            networkx.algorithms.bipartite.projected_graph(networkx.DiGraph(bipartite_xgraph), task_keys),
            skip_edges=True,
        )

    def make_dataset_type_xgraph(self, init: bool = False) -> networkx.DiGraph:
        """Return a networkx representation of just the dataset types in the
        pipeline.

        Parameters
        ----------
        init : `bool`, optional
            If `True` (`False` is default) return the graph of init input and
            output dataset types, instead of the graph of runtime (input,
            output, prerequisite input) dataset types.

        Returns
        -------
        xgraph : `networkx.DiGraph`
            Directed acyclic graph with no parallel edges.

        Notes
        -----
        The returned graph uses `NodeKey` instances for nodes. The tasks that
        link these dataset types are not represented at all; edges have no
        attributes, and there are no parallel edges.

        See `DatasetTypeNode` for the descriptive node attributes added.
        """
        bipartite_xgraph = self._make_bipartite_xgraph_internal(init)
        dataset_type_keys = [
            key
            for key, bipartite in bipartite_xgraph.nodes(data="bipartite")
            if bipartite == NodeType.DATASET_TYPE.bipartite
        ]
        return self._transform_xgraph_state(
            networkx.algorithms.bipartite.projected_graph(
                networkx.DiGraph(bipartite_xgraph), dataset_type_keys
            ),
            skip_edges=True,
        )

    ###########################################################################
    #
    # Serialization Interface.
    #
    # Serialization of PipelineGraphs is currently experimental and may not be
    # retained in the future. All serialization methods are
    # underscore-prefixed to ensure nobody mistakes them for a stable
    # interface (let alone a stable file format).
    #
    ###########################################################################

    @classmethod
    def _read_stream(
        cls, stream: BinaryIO, import_mode: TaskImportMode = TaskImportMode.REQUIRE_CONSISTENT_EDGES
    ) -> PipelineGraph:
        """Read a serialized `PipelineGraph` from a file-like object.

        Parameters
        ----------
        stream : `BinaryIO`
            File-like object opened for binary reading, containing
            gzip-compressed JSON.
        import_mode : `TaskImportMode`, optional
            Whether to import tasks, and how to reconcile any differences
            between the imported task's connections and those that were
            persisted with the graph. Default is to check that they are the
            same.

        Returns
        -------
        graph : `PipelineGraph`
            Deserialized pipeline graph.

        Raises
        ------
        PipelineGraphReadError
            Raised if the serialized `PipelineGraph` is not self-consistent.
        EdgesChangedError
            Raised if ``import_mode`` is
            `TaskImportMode.REQUIRE_CONSISTENT_EDGES` and the edges of a task
            did change after import and reconfiguration.

        Notes
        -----
        `PipelineGraph` serialization is currently experimental and may be
        removed or significantly changed in the future, with no deprecation
        period.
        """
        from .io import SerializedPipelineGraph

        with gzip.open(stream, "rb") as uncompressed_stream:
            data = json.load(uncompressed_stream)
        serialized_graph = SerializedPipelineGraph.parse_obj(data)
        return serialized_graph.deserialize(import_mode)

    @classmethod
    def _read_uri(
        cls,
        uri: ResourcePathExpression,
        import_mode: TaskImportMode = TaskImportMode.REQUIRE_CONSISTENT_EDGES,
    ) -> PipelineGraph:
        """Read a serialized `PipelineGraph` from a file at a URI.

        Parameters
        ----------
        uri : convertible to `lsst.resources.ResourcePath`
            URI to a gzip-compressed JSON file containing a serialized
            pipeline graph.
        import_mode : `TaskImportMode`, optional
            Whether to import tasks, and how to reconcile any differences
            between the imported task's connections and those that were
            persisted with the graph. Default is to check that they are the
            same.

        Returns
        -------
        graph : `PipelineGraph`
            Deserialized pipeline graph.

        Raises
        ------
        PipelineGraphReadError
            Raised if the serialized `PipelineGraph` is not self-consistent.
        EdgesChangedError
            Raised if ``import_mode`` is
            `TaskImportMode.REQUIRE_CONSISTENT_EDGES` and the edges of a task
            did change after import and reconfiguration.

        Notes
        -----
        `PipelineGraph` serialization is currently experimental and may be
        removed or significantly changed in the future, with no deprecation
        period.
        """
        uri = ResourcePath(uri)
        with uri.open("rb") as stream:
            return cls._read_stream(cast(BinaryIO, stream), import_mode=import_mode)

    def _write_stream(self, stream: BinaryIO) -> None:
        """Write the pipeline graph to a file-like object.

        Parameters
        ----------
        stream : `BinaryIO`
            File-like object opened for binary writing.

        Notes
        -----
        `PipelineGraph` serialization is currently experimental and may be
        removed or significantly changed in the future, with no deprecation
        period.

        The file format is gzipped JSON, and is intended to be human-readable,
        but it should not be considered a stable public interface for outside
        code, which should always use `PipelineGraph` methods (or at least the
        `io.SerializedPipelineGraph` class) to read these files.
        """
        from .io import SerializedPipelineGraph

        with gzip.open(stream, mode="wb") as compressed_stream:
            compressed_stream.write(
                SerializedPipelineGraph.serialize(self).json(exclude_defaults=True).encode("utf-8")
            )

    def _write_uri(self, uri: ResourcePathExpression) -> None:
        """Write the pipeline graph to a file given a URI.

        Parameters
        ----------
        uri : convertible to `lsst.resources.ResourcePath`
            URI to write to. May have a ``.json.gz`` extension or no extension
            (which will cause a ``.json.gz`` extension to be added).

        Notes
        -----
        `PipelineGraph` serialization is currently experimental and may be
        removed or significantly changed in the future, with no deprecation
        period.

        The file format is gzipped JSON, and is intended to be human-readable,
        but it should not be considered a stable public interface for outside
        code, which should always use `PipelineGraph` methods (or at least the
        `io.SerializedPipelineGraph` class) to read these files.
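
        Examples
        --------
        A round-trip sketch (the path is purely illustrative)::

            graph._write_uri("/tmp/pipeline_graph.json.gz")
            roundtripped = PipelineGraph._read_uri("/tmp/pipeline_graph.json.gz")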
1148 """
1149 uri = ResourcePath(uri)
1150 extension = uri.getExtension()
1151 if not extension:
1152 uri = uri.updatedExtension(".json.gz")
1153 elif extension != ".json.gz":
1154 raise ValueError("Expanded pipeline files should always have a .json.gz extension.")
1155 with uri.open(mode="wb") as stream:
1156 self._write_stream(cast(BinaryIO, stream))

    def _import_and_configure(
        self, import_mode: TaskImportMode = TaskImportMode.REQUIRE_CONSISTENT_EDGES
    ) -> None:
        """Import the `PipelineTask` classes referenced by all task nodes and
        update those nodes accordingly.

        Parameters
        ----------
        import_mode : `TaskImportMode`, optional
            Whether to import tasks, and how to reconcile any differences
            between the imported task's connections and those that were
            persisted with the graph. Default is to check that they are the
            same. This method does nothing if this is
            `TaskImportMode.DO_NOT_IMPORT`.

        Raises
        ------
        EdgesChangedError
            Raised if ``import_mode`` is
            `TaskImportMode.REQUIRE_CONSISTENT_EDGES` and the edges of a task
            did change after import and reconfiguration.

        Notes
        -----
        This method shouldn't need to be called unless the graph was
        deserialized without importing and configuring immediately, which is
        not the default behavior (but it can greatly speed up
        deserialization). If all tasks have already been imported this does
        nothing.

        Importing and configuring a task can change its
        `~TaskNode.task_class_name` or `~TaskNode.get_config_str` output,
        usually because the software used to read a serialized graph is newer
        than the software used to write it (e.g. a new config option has been
        added, or the task was moved to a new module with a forwarding alias
        left behind). These changes are allowed by
        `TaskImportMode.REQUIRE_CONSISTENT_EDGES`.

        If importing and configuring a task causes its edges to change, any
        dataset type nodes linked to those edges will be reset to the
        unresolved state.
        """
        if import_mode is TaskImportMode.DO_NOT_IMPORT:
            return
        rebuild = (
            import_mode is TaskImportMode.REQUIRE_CONSISTENT_EDGES
            or import_mode is TaskImportMode.OVERRIDE_EDGES
        )
        updates: dict[str, TaskNode] = {}
        node_key: NodeKey
        for node_key, node_state in self._xgraph.nodes.items():
            if node_key.node_type is NodeType.TASK:
                task_node: TaskNode = node_state["instance"]
                new_task_node = task_node._imported_and_configured(rebuild)
                if new_task_node is not task_node:
                    updates[task_node.label] = new_task_node
        self._replace_task_nodes(
            updates,
            check_edges_unchanged=(import_mode is TaskImportMode.REQUIRE_CONSISTENT_EDGES),
            assume_edges_unchanged=(import_mode is TaskImportMode.ASSUME_CONSISTENT_EDGES),
            message_header=(
                "In task with label {task_label!r}, persisted edges (A) "
                "differ from imported and configured edges (B):"
            ),
        )

    ###########################################################################
    #
    # Advanced PipelineGraph Inspection Interface:
    #
    # - methods to iterate over all nodes and edges, utilizing NodeKeys;
    #
    # - methods to find overall inputs and group nodes by their dimensions,
    #   which are important operations for QuantumGraph generation.
    #
    ###########################################################################

    def iter_edges(self, init: bool = False) -> Iterator[Edge]:
        """Iterate over edges in the graph.

        Parameters
        ----------
        init : `bool`, optional
            If `True` (`False` is default) iterate over the edges between task
            initialization nodes and init input/output dataset types, instead
            of the runtime task nodes and regular input/output/prerequisite
            dataset types.

        Returns
        -------
        edges : `~collections.abc.Iterator` [ `Edge` ]
            A lazy iterator over `Edge` (`WriteEdge` or `ReadEdge`) instances.

        Notes
        -----
        This method always returns *either* init edges or runtime edges, never
        both. The full (internal) graph that contains both also includes a
        special edge that connects each task init node to its runtime node;
        that is also never returned by this method, since it is never a part
        of the init-only or runtime-only subgraphs.
        """
        edge: Edge
        for _, _, edge in self._xgraph.edges(data="instance"):
            if edge is not None and edge.is_init == init:
                yield edge

    def iter_nodes(
        self,
    ) -> Iterator[
        tuple[Literal[NodeType.TASK_INIT], str, TaskInitNode]
        | tuple[Literal[NodeType.TASK], str, TaskNode]
        | tuple[Literal[NodeType.DATASET_TYPE], str, DatasetTypeNode | None]
    ]:
        """Iterate over nodes in the graph.

        Returns
        -------
        nodes : `~collections.abc.Iterator` [ `tuple` ]
            A lazy iterator over all of the nodes in the graph. Each yielded
            element is a tuple of:

            - the node type enum value (`NodeType`);
            - the string name for the node (task label or parent dataset type
              name);
            - the node value (`TaskNode`, `TaskInitNode`, `DatasetTypeNode`,
              or `None` for dataset type nodes that have not been resolved).
1283 """
1284 key: NodeKey
1285 if self._sorted_keys is not None:
1286 for key in self._sorted_keys:
1287 yield key.node_type, key.name, self._xgraph.nodes[key]["instance"] # type: ignore
1288 else:
1289 for key, node in self._xgraph.nodes(data="instance"):
1290 yield key.node_type, key.name, node # type: ignore

    def iter_overall_inputs(self) -> Iterator[tuple[str, DatasetTypeNode | None]]:
        """Iterate over all of the dataset types that are consumed but not
        produced by the graph.

        Returns
        -------
        dataset_types : `~collections.abc.Iterator` [ `tuple` ]
            A lazy iterator over the overall-input dataset types (including
            overall init inputs and prerequisites). Each yielded element is a
            tuple of:

            - the parent dataset type name;
            - the resolved `DatasetTypeNode`, or `None` if the dataset type
              has not been resolved.
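
        Examples
        --------
        Listing the overall inputs by name::

            for name, node in graph.iter_overall_inputs():
                print(name)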
1306 """
1307 for generation in networkx.algorithms.dag.topological_generations(self._xgraph):
1308 key: NodeKey
1309 for key in generation:
1310 # While we expect all tasks to have at least one input and
1311 # hence never appear in the first topological generation, that
1312 # is not true of task init nodes.
1313 if key.node_type is NodeType.DATASET_TYPE:
1314 yield key.name, self._xgraph.nodes[key]["instance"]
1315 return

    def group_by_dimensions(
        self, prerequisites: bool = False
    ) -> dict[DimensionGraph, tuple[dict[str, TaskNode], dict[str, DatasetTypeNode]]]:
        """Group this graph's tasks and dataset types by their dimensions.

        Parameters
        ----------
        prerequisites : `bool`, optional
            If `True`, include prerequisite dataset types as well as regular
            input and output datasets (including intermediates).

        Returns
        -------
        groups : `dict` [ `DimensionGraph`, `tuple` ]
            A dictionary of groups keyed by `DimensionGraph`, in which each
            value is a tuple of:

            - a `dict` of `TaskNode` instances, keyed by task label;
            - a `dict` of `DatasetTypeNode` instances, keyed by dataset type
              name;

            that have those dimensions.

        Notes
        -----
        Init inputs and outputs are always included, but always have empty
        dimensions and hence are all grouped together.
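
        Examples
        --------
        Printing a summary of each group (the graph must be resolved first)::

            for dimensions, (tasks, dataset_types) in graph.group_by_dimensions().items():
                print(dimensions, len(tasks), len(dataset_types))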
1344 """
1345 result: dict[DimensionGraph, tuple[dict[str, TaskNode], dict[str, DatasetTypeNode]]] = {}
1346 next_new_value: tuple[dict[str, TaskNode], dict[str, DatasetTypeNode]] = ({}, {})
1347 for task_label, task_node in self.tasks.items():
1348 if task_node.dimensions is None:
1349 raise UnresolvedGraphError(f"Task with label {task_label!r} has not been resolved.")
1350 if (group := result.setdefault(task_node.dimensions, next_new_value)) is next_new_value:
1351 next_new_value = ({}, {}) # make new lists for next time
1352 group[0][task_node.label] = task_node
1353 for dataset_type_name, dataset_type_node in self.dataset_types.items():
1354 if dataset_type_node is None:
1355 raise UnresolvedGraphError(f"Dataset type {dataset_type_name!r} has not been resolved.")
1356 if not dataset_type_node.is_prerequisite or prerequisites:
1357 if (
1358 group := result.setdefault(dataset_type_node.dataset_type.dimensions, next_new_value)
1359 ) is next_new_value:
1360 next_new_value = ({}, {}) # make new lists for next time
1361 group[1][dataset_type_node.name] = dataset_type_node
1362 return result

    def split_independent(self) -> Iterable[PipelineGraph]:
        """Iterate over independent subgraphs that together comprise this
        pipeline graph.

        Returns
        -------
        subgraphs : `Iterable` [ `PipelineGraph` ]
            An iterable over component subgraphs that could be run
            independently (they have only overall inputs in common). May be a
            lazy iterator.

        Notes
        -----
        All resolved dataset type nodes will be preserved.

        If there is only one component, ``self`` may be returned as the only
        element in the iterable.

        If `has_been_sorted`, all subgraphs will be sorted as well.
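
        Examples
        --------
        Processing each independent component separately::

            for subgraph in graph.split_independent():
                print(list(subgraph.tasks))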
1383 """
1384 # Having an overall input in common isn't enough to make subgraphs
1385 # dependent on each other, so we want to look for connected component
1386 # subgraphs of the task-only projected graph.
1387 bipartite_xgraph = self._make_bipartite_xgraph_internal(init=False)
1388 task_keys = {
1389 key
1390 for key, bipartite in bipartite_xgraph.nodes(data="bipartite")
1391 if bipartite == NodeType.TASK.bipartite
1392 }
1393 task_xgraph = networkx.algorithms.bipartite.projected_graph(
1394 networkx.DiGraph(bipartite_xgraph), task_keys
1395 )
1396 # "Weakly" connected means connected in only one direction, which is
1397 # the only kind of "connected" a DAG can ever be.
1398 for component_task_keys in networkx.algorithms.weakly_connected_components(task_xgraph):
1399 if component_task_keys == task_keys:
1400 yield self
1401 return
1402 else:
1403 component_subgraph = PipelineGraph(universe=self._universe)
1404 component_subgraph.add_task_nodes(
1405 [self._xgraph.nodes[key]["instance"] for key in component_task_keys], parent=self
1406 )
1407 if self.has_been_sorted:
1408 component_subgraph.sort()
1409 yield component_subgraph

    ###########################################################################
    #
    # Class- and Package-Private Methods.
    #
    ###########################################################################

    def _iter_task_defs(self) -> Iterator[TaskDef]:
        """Iterate over this pipeline as a sequence of `TaskDef` instances.

        Notes
        -----
        This is a package-private method intended to aid in the transition to
        a codebase more fully integrated with the `PipelineGraph` class, in
        which both `TaskDef` and `PipelineDatasetTypes` are expected to go
        away, and much of the functionality on the `Pipeline` class will be
        moved to `PipelineGraph` as well.

        Raises
        ------
        TaskNotImportedError
            Raised if `TaskNode.is_imported` is `False` for any task.
        """
        from ..pipeline import TaskDef

        for node in self._tasks.values():
            yield TaskDef(
                config=node.config,
                taskClass=node.task_class,
                label=node.label,
                connections=node._get_imported_data().connections,
            )

    def _init_from_args(
        self,
        xgraph: networkx.MultiDiGraph | None,
        sorted_keys: Sequence[NodeKey] | None,
        task_subsets: dict[str, TaskSubset] | None,
        description: str,
        universe: DimensionUniverse | None,
        data_id: DataId | None,
    ) -> None:
        """Initialize the graph with possibly-nontrivial arguments.

        Parameters
        ----------
        xgraph : `networkx.MultiDiGraph` or `None`
            The backing networkx graph, or `None` to create an empty one.
            This graph has `NodeKey` instances for nodes and the same
            structure as the graph exported by `make_xgraph`, but its nodes
            and edges have a single ``instance`` attribute that holds a
            `TaskNode`, `TaskInitNode`, `DatasetTypeNode` (or `None`),
            `ReadEdge`, or `WriteEdge` instance.
        sorted_keys : `~collections.abc.Sequence` [ `NodeKey` ] or `None`
            Topologically sorted sequence of node keys, or `None` if the graph
            is not sorted.
        task_subsets : `dict` [ `str`, `TaskSubset` ] or `None`
            Labeled subsets of tasks. Values must be constructed with
            ``xgraph`` as their parent graph.
        description : `str`
            String description for this pipeline.
        universe : `lsst.daf.butler.DimensionUniverse` or `None`
            Definitions of all dimensions.
        data_id : `lsst.daf.butler.DataCoordinate` or other data ID mapping
            Data ID that represents a constraint on all quanta generated from
            this pipeline.

        Notes
        -----
        Only empty `PipelineGraph` instances should be constructed directly by
        users, which sets the signature of ``__init__`` itself, but methods on
        `PipelineGraph` and its helper classes need to be able to create them
        with state. Those methods can call this after calling ``__new__``
        manually, skipping ``__init__``.
        """
        self._xgraph = xgraph if xgraph is not None else networkx.MultiDiGraph()
        self._sorted_keys: Sequence[NodeKey] | None = None
        self._task_subsets = task_subsets if task_subsets is not None else {}
        self._description = description
        self._tasks = TaskMappingView(self._xgraph)
        self._dataset_types = DatasetTypeMappingView(self._xgraph)
        self._raw_data_id: dict[str, Any]
        if isinstance(data_id, DataCoordinate):
            if universe is None:
                universe = data_id.universe
            else:
                assert universe is data_id.universe, "data_id.universe and given universe differ"
            self._raw_data_id = data_id.byName()
        elif data_id is None:
            self._raw_data_id = {}
        else:
            self._raw_data_id = dict(data_id)
        self._universe = universe
        if sorted_keys is not None:
            self._reorder(sorted_keys)

    def _make_bipartite_xgraph_internal(self, init: bool) -> networkx.MultiDiGraph:
        """Make a bipartite init-only or runtime-only internal subgraph.

        See `make_bipartite_xgraph` for parameters and return values.

        Notes
        -----
        This method returns a view of the `PipelineGraph` object's internal
        backing graph, and hence should only be called in methods that copy
        the result either explicitly or by running a copying algorithm before
        returning it to the user.
        """
        return self._xgraph.edge_subgraph([edge.key for edge in self.iter_edges(init)])

    def _transform_xgraph_state(self, xgraph: _G, skip_edges: bool) -> _G:
        """Transform networkx graph attributes in-place from the internal
        "instance" attributes to the documented exported attributes.

        Parameters
        ----------
        xgraph : `networkx.DiGraph` or `networkx.MultiDiGraph`
            Graph whose state should be transformed.
        skip_edges : `bool`
            If `True`, do not transform edge state.

        Returns
        -------
        xgraph : `networkx.DiGraph` or `networkx.MultiDiGraph`
            The same object passed in, after modification.

        Notes
        -----
        This should be called after making a copy of the internal graph but
        before any projection down to just task or dataset type nodes, since
        it assumes stateful edges.
        """
        state: dict[str, Any]
        for state in xgraph.nodes.values():
            node_value: TaskInitNode | TaskNode | DatasetTypeNode | None = state.pop("instance")
            if node_value is not None:
                state.update(node_value._to_xgraph_state())
        if not skip_edges:
            for _, _, state in xgraph.edges(data=True):
                edge: Edge | None = state.pop("instance", None)
                if edge is not None:
                    state.update(edge._to_xgraph_state())
        return xgraph

    def _replace_task_nodes(
        self,
        updates: Mapping[str, TaskNode],
        check_edges_unchanged: bool,
        assume_edges_unchanged: bool,
        message_header: str,
    ) -> None:
        """Replace task nodes and update edges and dataset type nodes
        accordingly.

        Parameters
        ----------
        updates : `~collections.abc.Mapping` [ `str`, `TaskNode` ]
            New task nodes with task label keys. All keys must be task labels
            that are already present in the graph.
        check_edges_unchanged : `bool`, optional
            If `True`, require the edges (connections) of the modified tasks
            to remain unchanged after importing and configuring each task, and
            verify that this is the case.
        assume_edges_unchanged : `bool`, optional
            If `True`, the caller declares that the edges (connections) of the
            modified tasks will remain unchanged after importing and
            configuring each task, and that it is unnecessary to check this.
        message_header : `str`
            Template for `str.format` with a single ``task_label`` placeholder
            to use as the first line in `EdgesChangedError` messages that show
            the differences between new task edges and old task edges. Should
            include the fact that the rest of the message will refer to the
            old task as "A" and the new task as "B", and end with a colon.

        Raises
        ------
        ValueError
            Raised if ``assume_edges_unchanged`` and ``check_edges_unchanged``
            are both `True`, or if a full config is provided for a task after
            another full config or an override has already been provided.
        EdgesChangedError
            Raised if ``check_edges_unchanged=True`` and the edges of a task
            do change.
        """
        deep: dict[str, TaskNode] = {}
        shallow: dict[str, TaskNode] = {}
        if assume_edges_unchanged:
            if check_edges_unchanged:
                raise ValueError("Cannot simultaneously assume and check that edges have not changed.")
            shallow.update(updates)
        else:
            for task_label, new_task_node in updates.items():
                old_task_node = self.tasks[task_label]
                messages = old_task_node.diff_edges(new_task_node)
                if messages:
                    if check_edges_unchanged:
                        messages.insert(0, message_header.format(task_label=task_label))
                        raise EdgesChangedError("\n".join(messages))
                    else:
                        deep[task_label] = new_task_node
                else:
                    shallow[task_label] = new_task_node
        try:
            if deep:
                removed = self.remove_tasks(deep.keys(), drop_from_subsets=True)
                self.add_task_nodes(deep.values())
                for replaced_task_node, referencing_subsets in removed:
                    for subset_label in referencing_subsets:
                        self._task_subsets[subset_label].add(replaced_task_node.label)
            for task_node in shallow.values():
                self._xgraph.nodes[task_node.key]["instance"] = task_node
                self._xgraph.nodes[task_node.init.key]["instance"] = task_node.init
        except PipelineGraphExceptionSafetyError:  # pragma: no cover
            raise
        except Exception as err:  # pragma: no cover
            # There's no known way to get here, but we want to make it clear
            # it's a big problem if we do.
            raise PipelineGraphExceptionSafetyError(
                "Error while replacing tasks has left the graph in an inconsistent state."
            ) from err

    def _append_graph_data_from_edge(
        self,
        node_data: list[tuple[NodeKey, dict[str, Any]]],
        edge_data: list[tuple[NodeKey, NodeKey, str, dict[str, Any]]],
        edge: Edge,
        parent: PipelineGraph | None,
    ) -> None:
        """Append networkx state dictionaries for an edge and the
        corresponding dataset type node.

        Parameters
        ----------
        node_data : `list`
            List of node keys and state dictionaries. A node is appended if
            one does not already exist for this dataset type.
        edge_data : `list`
            List of node key pairs, connection names, and state dictionaries
            for edges.
        edge : `Edge`
            New edge being processed.
        parent : `PipelineGraph` or `None`
            Another pipeline graph whose dataset type nodes should be used
            when present.
        """
        new_dataset_type_node = None
        if parent is not None:
            new_dataset_type_node = parent._xgraph.nodes[edge.dataset_type_key].get("instance")
        if (existing_dataset_type_state := self._xgraph.nodes.get(edge.dataset_type_key)) is not None:
            existing_dataset_type_state["instance"] = new_dataset_type_node
        else:
            node_data.append(
                (
                    edge.dataset_type_key,
                    {
                        "instance": new_dataset_type_node,
                        "bipartite": NodeType.DATASET_TYPE.bipartite,
                    },
                )
            )
        edge_data.append(
            edge.nodes
            + (
                edge.connection_name,
                {"instance": edge},
            )
        )

    def _reorder(self, sorted_keys: Sequence[NodeKey]) -> None:
        """Set the order of all views of this graph from the given sorted
        sequence of task labels and dataset type names.
        """
        self._sorted_keys = sorted_keys
        self._tasks._reorder(sorted_keys)
        self._dataset_types._reorder(sorted_keys)

    def _reset(self) -> None:
        """Reset all views of this graph following a modification that might
        invalidate them.
        """
        self._sorted_keys = None
        self._tasks._reset()
        self._dataset_types._reset()

    _xgraph: networkx.MultiDiGraph
    _sorted_keys: Sequence[NodeKey] | None
    _task_subsets: dict[str, TaskSubset]
    _description: str
    _tasks: TaskMappingView
    _dataset_types: DatasetTypeMappingView
    _raw_data_id: dict[str, Any]
    _universe: DimensionUniverse | None