Coverage for python/lsst/pipe/base/pipeline_graph/_pipeline_graph.py: 20%
357 statements
coverage.py v7.3.0, created at 2023-08-23 10:31 +0000
1# This file is part of pipe_base.
2#
3# Developed for the LSST Data Management System.
4# This product includes software developed by the LSST Project
5# (http://www.lsst.org).
6# See the COPYRIGHT file at the top-level directory of this distribution
7# for details of code ownership.
8#
9# This program is free software: you can redistribute it and/or modify
10# it under the terms of the GNU General Public License as published by
11# the Free Software Foundation, either version 3 of the License, or
12# (at your option) any later version.
13#
14# This program is distributed in the hope that it will be useful,
15# but WITHOUT ANY WARRANTY; without even the implied warranty of
16# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
17# GNU General Public License for more details.
18#
19# You should have received a copy of the GNU General Public License
20# along with this program. If not, see <http://www.gnu.org/licenses/>.
21from __future__ import annotations
23__all__ = ("PipelineGraph",)
25import gzip
26import itertools
27import json
28from collections.abc import Iterable, Iterator, Mapping, Sequence
29from typing import TYPE_CHECKING, Any, BinaryIO, Literal, TypeVar, cast
31import networkx
32import networkx.algorithms.bipartite
33import networkx.algorithms.dag
34from lsst.daf.butler import DataCoordinate, DataId, DimensionGraph, DimensionUniverse, Registry
35from lsst.resources import ResourcePath, ResourcePathExpression
37from ._dataset_types import DatasetTypeNode
38from ._edges import Edge, ReadEdge, WriteEdge
39from ._exceptions import (
40 DuplicateOutputError,
41 EdgesChangedError,
42 PipelineDataCycleError,
43 PipelineGraphError,
44 PipelineGraphExceptionSafetyError,
45 UnresolvedGraphError,
46)
47from ._mapping_views import DatasetTypeMappingView, TaskMappingView
48from ._nodes import NodeKey, NodeType
49from ._task_subsets import TaskSubset
50from ._tasks import TaskImportMode, TaskInitNode, TaskNode, _TaskNodeImportedData
52if TYPE_CHECKING:
53 from ..config import PipelineTaskConfig
54 from ..connections import PipelineTaskConnections
55 from ..pipeline import TaskDef
56 from ..pipelineTask import PipelineTask
59_G = TypeVar("_G", bound=networkx.DiGraph | networkx.MultiDiGraph)
62class PipelineGraph:
63 """A graph representation of a fully-configured pipeline.
65 `PipelineGraph` instances are typically constructed by calling
66 `.Pipeline.to_graph`, but in rare cases constructing and then populating an
67 empty one may be preferable.
69 Parameters
70 ----------
71 description : `str`, optional
72 String description for this pipeline.
73 universe : `lsst.daf.butler.DimensionUniverse`, optional
74 Definitions for all butler dimensions. If not provided, some
75 attributes will not be available until `resolve` is called.
76 data_id : `lsst.daf.butler.DataCoordinate` or other data ID, optional
77 Data ID that represents a constraint on all quanta generated by this
78 pipeline. This typically just holds the instrument constraint included
79 in the pipeline definition, if there was one.
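Examples
--------
A minimal sketch of typical usage; ``pipeline`` is assumed to be a
configured `.Pipeline` instance::

    graph = pipeline.to_graph()

An empty graph can also be built up by hand (``MyTask`` is a
hypothetical `PipelineTask` subclass)::

    graph = PipelineGraph(description="demo")
    graph.add_task("my_task", MyTask, MyTask.ConfigClass())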
80 """
82 ###########################################################################
83 #
84 # Simple Pipeline Graph Inspection Interface:
85 #
86 # - for inspecting graph structure, not modifying it (except to sort and
87 # resolve);
88 #
89 # - no NodeKey objects, just string dataset type name and task label keys;
90 #
91 # - graph structure is represented as a pair of mappings, with methods to
92 # find neighbors and edges of nodes.
93 #
94 ###########################################################################
96 def __init__(
97 self,
98 *,
99 description: str = "",
100 universe: DimensionUniverse | None = None,
101 data_id: DataId | None = None,
102 ) -> None:
103 self._init_from_args(
104 xgraph=None,
105 sorted_keys=None,
106 task_subsets=None,
107 description=description,
108 universe=universe,
109 data_id=data_id,
110 )
112 def __repr__(self) -> str:
113 return f"{type(self).__name__}({self.description!r}, tasks={self.tasks!s})"
115 @property
116 def description(self) -> str:
117 """String description for this pipeline."""
118 return self._description
120 @description.setter
121 def description(self, value: str) -> None:
122 # Docstring in getter.
123 self._description = value
125 @property
126 def universe(self) -> DimensionUniverse | None:
127 """Definitions for all butler dimensions."""
128 return self._universe
130 @property
131 def data_id(self) -> DataCoordinate:
132 """Data ID that represents a constraint on all quanta generated from
133 this pipeline.
135 This may not be available unless `universe` is not `None`.
136 """
137 return DataCoordinate.standardize(self._raw_data_id, universe=self.universe)
139 @property
140 def tasks(self) -> TaskMappingView:
141 """A mapping view of the tasks in the graph.
143 This mapping has `str` task label keys and `TaskNode` values. Iteration
144 is topologically and deterministically ordered if and only if `sort`
145 has been called since the last modification to the graph.
146 """
147 return self._tasks
149 @property
150 def dataset_types(self) -> DatasetTypeMappingView:
151 """A mapping view of the dataset types in the graph.
153 This mapping has `str` parent dataset type name keys, but only provides
154 access to its `DatasetTypeNode` values if `resolve` has been called
155 since the last modification involving a task that uses a dataset type.
156 See `DatasetTypeMappingView` for details.
157 """
158 return self._dataset_types
160 @property
161 def task_subsets(self) -> Mapping[str, TaskSubset]:
162 """A mapping of all labeled subsets of tasks.
164 Keys are subset labels, values are sets of task labels. See
165 `TaskSubset` for more information.
167 Use `add_task_subset` to add a new subset. The subsets themselves may
168 be modified in-place.
169 """
170 return self._task_subsets
172 @property
173 def is_sorted(self) -> bool:
174 """Whether this graph's tasks and dataset types are topologically
175 sorted with the exact same deterministic tiebreakers that `sort` would
176 apply.
178 This may perform (and then discard) a full sort if `has_been_sorted` is
179 `False`. If the goal is to obtain a sorted graph, it is better to just
180 call `sort` without guarding that with an ``if not graph.is_sorted``
181 check.
182 """
183 if self._sorted_keys is not None:
184 return True
185 return all(
186 sorted == unsorted
187 for sorted, unsorted in zip(
188 networkx.lexicographical_topological_sort(self._xgraph), self._xgraph, strict=True
189 )
190 )
192 @property
193 def has_been_sorted(self) -> bool:
194 """Whether this graph's tasks and dataset types have been
195 topologically sorted (with unspecified but deterministic tiebreakers)
196 since the last modification to the graph.
198 This may return `False` if the graph *happens* to be sorted but `sort`
199 was never called, but it is potentially much faster than `is_sorted`,
200 which may attempt (and then discard) a full sort if `has_been_sorted`
201 is `False`.
202 """
203 return self._sorted_keys is not None
205 def sort(self) -> None:
206 """Sort this graph's nodes topologically with deterministic (but
207 unspecified) tiebreakers.
209 This does nothing if the graph is already known to be sorted.
210 """
211 if self._sorted_keys is None:
212 try:
213 sorted_keys: Sequence[NodeKey] = list(networkx.lexicographical_topological_sort(self._xgraph))
214 except networkx.NetworkXUnfeasible as err: # pragma: no cover
215 # Shouldn't be possible to get here, because we check for cycles
216 # when adding tasks, but we guard against it anyway.
217 cycle = networkx.find_cycle(self._xgraph)
218 raise PipelineDataCycleError(
219 f"Cycle detected while attempting to sort graph: {cycle}."
220 ) from err
221 self._reorder(sorted_keys)
223 def copy(self) -> PipelineGraph:
224 """Return a copy of this graph that copies all mutable state."""
225 xgraph = self._xgraph.copy()
226 result = PipelineGraph.__new__(PipelineGraph)
227 result._init_from_args(
228 xgraph,
229 self._sorted_keys,
230 task_subsets={
231 k: TaskSubset(xgraph, v.label, set(v._members), v.description)
232 for k, v in self._task_subsets.items()
233 },
234 description=self._description,
235 universe=self.universe,
236 data_id=self._raw_data_id,
237 )
238 return result
240 def __copy__(self) -> PipelineGraph:
241 # Fully shallow copies are dangerous; we don't want shared mutable
242 # state to lead to broken class invariants.
243 return self.copy()
245 def __deepcopy__(self, memo: dict) -> PipelineGraph:
246 # Genuine deep copies are unnecessary, since we should only ever care
247 # that mutable state is copied.
248 return self.copy()
250 def producing_edge_of(self, dataset_type_name: str) -> WriteEdge | None:
251 """Return the `WriteEdge` that links the producing task to the named
252 dataset type.
254 Parameters
255 ----------
256 dataset_type_name : `str`
257 Dataset type name. Must not be a component.
259 Returns
260 -------
261 edge : `WriteEdge` or `None`
262 Producing edge or `None` if there isn't one in this graph.
264 Raises
265 ------
266 DuplicateOutputError
267 Raised if there are multiple tasks defined to produce this dataset
268 type. This is only possible if the graph's dataset types are not
269 resolved.
271 Notes
272 -----
273 On resolved graphs, it may be slightly more efficient to use::
275 graph.dataset_types[dataset_type_name].producing_edge
277 but this method works on graphs with unresolved dataset types as well.
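
For example, a sketch that looks up the producer of a hypothetical
dataset type named ``"calexp"``::

    write_edge = graph.producing_edge_of("calexp")
    if write_edge is not None:
        print(write_edge.task_label)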
278 """
279 producer: str | None = None
280 producing_edge: WriteEdge | None = None
281 for _, _, producing_edge in self._xgraph.in_edges(
282 NodeKey(NodeType.DATASET_TYPE, dataset_type_name), data="instance"
283 ):
284 assert producing_edge is not None, "Should only be None if we never loop."
285 if producer is not None:
286 raise DuplicateOutputError(
287 f"Dataset type {dataset_type_name!r} is produced by both {producing_edge.task_label!r} "
288 f"and {producer!r}."
289 )
# Remember this producer so a second producing edge trips the check above.
producer = producing_edge.task_label
290 return producing_edge
292 def consuming_edges_of(self, dataset_type_name: str) -> list[ReadEdge]:
293 """Return the `ReadEdge` objects that link the named dataset type to
294 the tasks that consume it.
296 Parameters
297 ----------
298 dataset_type_name : `str`
299 Dataset type name. Must not be a component.
301 Returns
302 -------
303 edges : `list` [ `ReadEdge` ]
304 Edges that connect this dataset type to the tasks that consume it.
306 Notes
307 -----
308 On resolved graphs, it may be slightly more efficient to use::
310 graph.dataset_types[dataset_type_name].consuming_edges
312 but this method works on graphs with unresolved dataset types as well.
313 """
314 return [
315 edge
316 for _, _, edge in self._xgraph.out_edges(
317 NodeKey(NodeType.DATASET_TYPE, dataset_type_name), data="instance"
318 )
319 ]
321 def producer_of(self, dataset_type_name: str) -> TaskNode | TaskInitNode | None:
322 """Return the `TaskNode` or `TaskInitNode` that writes the given
323 dataset type.
325 Parameters
326 ----------
327 dataset_type_name : `str`
328 Dataset type name. Must not be a component.
330 Returns
331 -------
332 node : `TaskNode`, `TaskInitNode`, or `None`
333 Producing node or `None` if there isn't one in this graph.
335 Raises
336 ------
337 DuplicateOutputError
338 Raised if there are multiple tasks defined to produce this dataset
339 type. This is only possible if the graph's dataset types are not
340 resolved.
341 """
342 if (producing_edge := self.producing_edge_of(dataset_type_name)) is not None:
343 return self._xgraph.nodes[producing_edge.task_key]["instance"]
344 return None
346 def consumers_of(self, dataset_type_name: str) -> list[TaskNode | TaskInitNode]:
347 """Return the `TaskNode` and/or `TaskInitNode` objects that read
348 the given dataset type.
350 Parameters
351 ----------
352 dataset_type_name : `str`
353 Dataset type name. Must not be a component.
355 Returns
356 -------
357 nodes : `list` [ `TaskNode` or `TaskInitNode` ]
358 Nodes for the tasks that consume the given dataset type.
360 Notes
361 -----
362 On resolved graphs, it may be slightly more efficient to use::
364 graph.dataset_types[dataset_type_name].consuming_edges
366 but this method works on graphs with unresolved dataset types as well.
367 """
368 return [
369 self._xgraph.nodes[consuming_edge.task_key]["instance"]
370 for consuming_edge in self.consuming_edges_of(dataset_type_name)
371 ]
373 def inputs_of(self, task_label: str, init: bool = False) -> dict[str, DatasetTypeNode | None]:
374 """Return the dataset types that are inputs to a task.
376 Parameters
377 ----------
378 task_label : `str`
379 Label for the task in the pipeline.
380 init : `bool`, optional
381 If `True`, return init-input dataset types instead of runtime
382 (including prerequisite) inputs.
384 Returns
385 -------
386 inputs : `dict` [ `str`, `DatasetTypeNode` or `None` ]
387 Dictionary with parent dataset type name keys and either
388 `DatasetTypeNode` values (if the dataset type has been resolved)
389 or `None` values.
391 Notes
392 -----
393 To get the input edges of a task or task init node (which provide
394 information about storage class overrides and components) use::
396 graph.tasks[task_label].iter_all_inputs()
398 or
400 graph.tasks[task_label].init.iter_all_inputs()
402 or the various mapping attributes of the `TaskNode` and `TaskInitNode`
403 classes.
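
For example, a sketch that inspects the inputs of a hypothetical task
labeled ``"isr"``::

    for name, node in graph.inputs_of("isr").items():
        print(name, "resolved" if node is not None else "unresolved")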
404 """
405 node: TaskNode | TaskInitNode = self.tasks[task_label] if not init else self.tasks[task_label].init
406 return {
407 edge.parent_dataset_type_name: self._xgraph.nodes[edge.dataset_type_key]["instance"]
408 for edge in node.iter_all_inputs()
409 }
411 def outputs_of(
412 self, task_label: str, init: bool = False, include_automatic_connections: bool = True
413 ) -> dict[str, DatasetTypeNode | None]:
414 """Return the dataset types that are outputs of a task.
416 Parameters
417 ----------
418 task_label : `str`
419 Label for the task in the pipeline.
420 init : `bool`, optional
421 If `True`, return init-output dataset types instead of runtime
422 outputs.
423 include_automatic_connections : `bool`, optional
424 Whether to include automatic connections such as configs, metadata,
425 and logs.
427 Returns
428 -------
429 outputs : `dict` [ `str`, `DatasetTypeNode` or `None` ]
430 Dictionary with parent dataset type name keys and either
431 `DatasetTypeNode` values (if the dataset type has been resolved)
432 or `None` values.
434 Notes
435 -----
436 To get the output edges of a task or task init node (which provide
437 information about storage class overrides and components) use::
439 graph.tasks[task_label].iter_all_outputs()
441 or
443 graph.tasks[task_label].init.iter_all_outputs()
445 or the various mapping attributes of the `TaskNode` and `TaskInitNode`
446 classes.
447 """
448 node: TaskNode | TaskInitNode = self.tasks[task_label] if not init else self.tasks[task_label].init
449 iterable = node.iter_all_outputs() if include_automatic_connections else node.outputs.values()
450 return {
451 edge.parent_dataset_type_name: self._xgraph.nodes[edge.dataset_type_key]["instance"]
452 for edge in iterable
453 }
455 def resolve(self, registry: Registry) -> None:
456 """Resolve all dimensions and dataset types and check them for
457 consistency.
459 Resolving a graph also causes it to be sorted.
461 Parameters
462 ----------
463 registry : `lsst.daf.butler.Registry`
464 Client for the data repository to resolve against.
466 Notes
467 -----
468 The `universe` attribute is set to ``registry.dimensions`` and used to
469 set all `TaskNode.dimensions` attributes. Dataset type nodes are
470 resolved by first looking for a registry definition, then using the
471 producing task's definition, then looking for consistency between all
472 consuming task definitions.
474 Raises
475 ------
476 ConnectionTypeConsistencyError
477 Raised if a prerequisite input for one task appears as a different
478 kind of connection in any other task.
479 DuplicateOutputError
480 Raised if multiple tasks have the same dataset type as an output.
481 IncompatibleDatasetTypeError
482 Raised if different tasks have different definitions of a dataset
483 type. Different but compatible storage classes are permitted.
484 MissingDatasetTypeError
485 Raised if a dataset type definition is required to exist in the
486 data repository but none was found. This should only occur for
487 dataset types that are not produced by a task in the pipeline and
488 are consumed with different storage classes or as components by
489 tasks in the pipeline.
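
For example, a sketch in which ``butler`` is assumed to be an
`lsst.daf.butler.Butler` client for the target data repository::

    graph.resolve(butler.registry)
    assert graph.has_been_sorted  # resolving also sorts the graph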
493 """
494 node_key: NodeKey
495 updates: dict[NodeKey, TaskNode | DatasetTypeNode] = {}
496 for node_key, node_state in self._xgraph.nodes.items():
497 match node_key.node_type:
498 case NodeType.TASK:
499 task_node: TaskNode = node_state["instance"]
500 new_task_node = task_node._resolved(registry.dimensions)
501 if new_task_node is not task_node:
502 updates[node_key] = new_task_node
503 case NodeType.DATASET_TYPE:
504 dataset_type_node: DatasetTypeNode | None = node_state["instance"]
505 new_dataset_type_node = DatasetTypeNode._from_edges(
506 node_key, self._xgraph, registry, previous=dataset_type_node
507 )
508 # Use of `is` here is intentional; `_from_edges` returns
509 # `previous=dataset_type_node` if it can determine that it
510 # doesn't need to change.
511 if new_dataset_type_node is not dataset_type_node:
512 updates[node_key] = new_dataset_type_node
513 try:
514 for node_key, node_value in updates.items():
515 self._xgraph.nodes[node_key]["instance"] = node_value
516 except Exception as err: # pragma: no cover
517 # There's no known way to get here, but we want to make it
518 # clear it's a big problem if we do.
519 raise PipelineGraphExceptionSafetyError(
520 "Error during dataset type resolution has left the graph in an inconsistent state."
521 ) from err
522 self.sort()
523 self._universe = registry.dimensions
525 ###########################################################################
526 #
527 # Graph Modification Interface:
528 #
529 # - methods to add, remove, and replace tasks;
530 #
531 # - methods to add and remove task subsets.
532 #
533 # These are all things that are usually done in a Pipeline before making a
534 # graph at all, but there may be cases where we want to modify the graph
535 # instead. (These are also the methods used to make a graph from a
536 # Pipeline, or make a graph from another graph.)
537 #
538 ###########################################################################
540 def add_task(
541 self,
542 label: str,
543 task_class: type[PipelineTask],
544 config: PipelineTaskConfig,
545 connections: PipelineTaskConnections | None = None,
546 ) -> TaskNode:
547 """Add a new task to the graph.
549 Parameters
550 ----------
551 label : `str`
552 Label for the task in the pipeline.
553 task_class : `type` [ `PipelineTask` ]
554 Class object for the task.
555 config : `PipelineTaskConfig`
556 Configuration for the task.
557 connections : `PipelineTaskConnections`, optional
558 Object that describes the dataset types used by the task. If not
559 provided, one will be constructed from the given configuration. If
560 provided, it is assumed that ``config`` has already been validated
561 and frozen.
563 Returns
564 -------
565 node : `TaskNode`
566 The new task node added to the graph.
568 Raises
569 ------
570 ValueError
571 Raised if configuration validation failed when constructing
572 ``connections``.
573 PipelineDataCycleError
574 Raised if the graph is cyclic after this addition.
575 RuntimeError
576 Raised if an unexpected exception (which will be chained) occurred
577 at a stage that may have left the graph in an inconsistent state.
578 Other exceptions should leave the graph unchanged.
580 Notes
581 -----
582 Checks for dataset type consistency and multiple producers do not occur
583 until `resolve` is called, since the resolution depends on both the
584 state of the data repository and all contributing tasks.
586 Adding new tasks removes any existing resolutions of the dataset types
587 they reference and marks the graph as unsorted. It is most efficient
588 to add all tasks up front and only then resolve and/or sort the graph.
589 """
590 task_node = TaskNode._from_imported_data(
591 key=NodeKey(NodeType.TASK, label),
592 init_key=NodeKey(NodeType.TASK_INIT, label),
593 data=_TaskNodeImportedData.configure(label, task_class, config, connections),
594 universe=self.universe,
595 )
596 self.add_task_nodes([task_node])
597 return task_node
599 def add_task_nodes(self, nodes: Iterable[TaskNode]) -> None:
600 """Add one or more existing task nodes to the graph.
602 Parameters
603 ----------
604 nodes : `~collections.abc.Iterable` [ `TaskNode` ]
605 Iterable of task nodes to add. If any tasks have resolved
606 dimensions, they must have the same dimension universe as the rest
607 of the graph.
609 Raises
610 ------
611 PipelineDataCycleError
612 Raised if the graph is cyclic after this addition.
614 Notes
615 -----
616 Checks for dataset type consistency and multiple producers do not occur
617 until `resolve` is called, since the resolution depends on both the
618 state of the data repository and all contributing tasks.
620 Adding new tasks removes any existing resolutions of the dataset types
621 they reference and marks the graph as unsorted. It is most efficient
622 to add all tasks up front and only then resolve and/or sort the graph.
623 """
624 node_data: list[tuple[NodeKey, dict[str, Any]]] = []
625 edge_data: list[tuple[NodeKey, NodeKey, str, dict[str, Any]]] = []
626 for task_node in nodes:
627 task_node = task_node._resolved(self._universe)
628 node_data.append(
629 (task_node.key, {"instance": task_node, "bipartite": task_node.key.node_type.bipartite})
630 )
631 node_data.append(
632 (
633 task_node.init.key,
634 {"instance": task_node.init, "bipartite": task_node.init.key.node_type.bipartite},
635 )
636 )
637 # Convert the edge objects attached to the task node to networkx.
638 for read_edge in task_node.init.iter_all_inputs():
639 self._append_graph_data_from_edge(node_data, edge_data, read_edge)
640 for write_edge in task_node.init.iter_all_outputs():
641 self._append_graph_data_from_edge(node_data, edge_data, write_edge)
642 for read_edge in task_node.iter_all_inputs():
643 self._append_graph_data_from_edge(node_data, edge_data, read_edge)
644 for write_edge in task_node.iter_all_outputs():
645 self._append_graph_data_from_edge(node_data, edge_data, write_edge)
646 # Add a special edge (with no Edge instance) that connects the
647 # TaskInitNode to the runtime TaskNode.
648 edge_data.append((task_node.init.key, task_node.key, Edge.INIT_TO_TASK_NAME, {"instance": None}))
649 if not node_data and not edge_data:
650 return
651 # Checks and preparation complete; time to start the actual
652 # modification, during which it's hard to provide strong exception
653 # safety. Start by resetting the sort ordering, if there is one.
654 self._reset()
655 try:
656 self._xgraph.add_nodes_from(node_data)
657 self._xgraph.add_edges_from(edge_data)
658 if not networkx.algorithms.dag.is_directed_acyclic_graph(self._xgraph):
659 cycle = networkx.find_cycle(self._xgraph)
660 raise PipelineDataCycleError(f"Cycle detected while adding tasks: {cycle}.")
661 except Exception:
662 # First try to roll back our changes.
663 try:
664 self._xgraph.remove_edges_from(edge_data)
665 self._xgraph.remove_nodes_from(key for key, _ in node_data)
666 except Exception as err: # pragma: no cover
667 # There's no known way to get here, but we want to make it
668 # clear it's a big problem if we do.
669 raise PipelineGraphExceptionSafetyError(
670 "Error while attempting to revert PipelineGraph modification has left the graph in "
671 "an inconsistent state."
672 ) from err
673 # Successfully rolled back; raise the original exception.
674 raise
676 def reconfigure_tasks(
677 self,
678 *args: tuple[str, PipelineTaskConfig],
679 check_edges_unchanged: bool = False,
680 assume_edges_unchanged: bool = False,
681 **kwargs: PipelineTaskConfig,
682 ) -> None:
683 """Update the configuration for one or more tasks.
685 Parameters
686 ----------
687 *args : `tuple` [ `str`, `.PipelineTaskConfig` ]
688 Positional arguments are each a 2-tuple of task label and new
689 config object. Note that the same arguments may also be passed as
690 ``**kwargs``, which is usually more readable, but task labels in
691 ``*args`` are not required to be valid Python identifiers.
692 check_edges_unchanged : `bool`, optional
693 If `True`, require the edges (connections) of the modified tasks to
694 remain unchanged after the configuration updates, and verify that
695 this is the case.
696 assume_edges_unchanged : `bool`, optional
697 If `True`, the caller declares that the edges (connections) of the
698 modified tasks will remain unchanged after the configuration
699 updates, and that it is unnecessary to check this.
700 **kwargs : `.PipelineTaskConfig`
701 New config objects or overrides to apply to copies of the current
702 config objects, with task labels as the keywords.
704 Raises
705 ------
706 ValueError
707 Raised if ``assume_edges_unchanged`` and ``check_edges_unchanged``
708 are both `True`, or if the same task appears twice.
709 EdgesChangedError
710 Raised if ``check_edges_unchanged=True`` and the edges of a task do
711 change.
713 Notes
714 -----
715 If reconfiguring a task causes its edges to change, any dataset type
716 nodes connected to that task (not just those whose edges have changed!)
717 will be unresolved.
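
For example, a sketch that supplies a fresh config object for a
hypothetical task labeled ``"isr"`` (``doSomething`` is a hypothetical
config field)::

    new_config = type(graph.tasks["isr"].config)()
    new_config.doSomething = False
    graph.reconfigure_tasks(isr=new_config)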
718 """
719 new_configs: dict[str, PipelineTaskConfig] = {}
720 for task_label, config_update in itertools.chain(args, kwargs.items()):
721 if new_configs.setdefault(task_label, config_update) is not config_update:
722 raise ValueError(f"Config for {task_label!r} provided more than once.")
723 updates = {
724 task_label: self.tasks[task_label]._reconfigured(config, rebuild=not assume_edges_unchanged)
725 for task_label, config in new_configs.items()
726 }
727 self._replace_task_nodes(
728 updates,
729 check_edges_unchanged=check_edges_unchanged,
730 assume_edges_unchanged=assume_edges_unchanged,
731 message_header=(
732 "Unexpected change in edges for task {task_label!r} from original config (A) to "
733 "new configs (B):"
734 ),
735 )
737 def remove_tasks(
738 self, labels: Iterable[str], drop_from_subsets: bool = True
739 ) -> list[tuple[TaskNode, set[str]]]:
740 """Remove one or more tasks from the graph.
742 Parameters
743 ----------
744 labels : `~collections.abc.Iterable` [ `str` ]
745 Iterable of the labels of the tasks to remove.
746 drop_from_subsets : `bool`, optional
747 If `True`, drop each removed task from any subset in which it
748 currently appears. If `False`, raise `PipelineGraphError` if any
749 such subsets exist.
751 Returns
752 -------
753 nodes_and_subsets : `list` [ `tuple` [ `TaskNode`, `set` [ `str` ] ] ]
754 List of nodes removed and the labels of task subsets that
755 referenced them.
757 Raises
758 ------
759 PipelineGraphError
760 Raised if ``drop_from_subsets`` is `False` and the task is still
761 part of one or more subsets.
763 Notes
764 -----
765 Removing a task will cause dataset nodes with no other referencing
766 tasks to be removed. Any other dataset type nodes referenced by a
767 removed task will be reset to an "unresolved" state.
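
For example, a sketch that removes a hypothetical task labeled
``"isr"`` and reports the subsets that referenced it::

    for task_node, subset_labels in graph.remove_tasks(["isr"]):
        print(f"removed {task_node.label!r} from subsets {subset_labels}")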
768 """
769 task_nodes_and_subsets = []
770 dataset_types: set[NodeKey] = set()
771 nodes_to_remove = set()
772 for label in labels:
773 task_node: TaskNode = self._xgraph.nodes[NodeKey(NodeType.TASK, label)]["instance"]
774 # Find task subsets that reference this task.
775 referencing_subsets = {
776 subset_label
777 for subset_label, task_subset in self.task_subsets.items()
778 if label in task_subset
779 }
780 if not drop_from_subsets and referencing_subsets:
781 raise PipelineGraphError(
782 f"Task {label!r} is still referenced by subset(s) {referencing_subsets}."
783 )
784 task_nodes_and_subsets.append((task_node, referencing_subsets))
785 # Find dataset types referenced by this task.
786 dataset_types.update(self._xgraph.predecessors(task_node.key))
787 dataset_types.update(self._xgraph.successors(task_node.key))
788 dataset_types.update(self._xgraph.predecessors(task_node.init.key))
789 dataset_types.update(self._xgraph.successors(task_node.init.key))
790 # Since there's an edge between the task and its init node, we'll
791 # have added those two nodes here, too, and we don't want that.
792 dataset_types.remove(task_node.init.key)
793 dataset_types.remove(task_node.key)
794 # Mark the task node and its init node for removal from the graph.
795 nodes_to_remove.add(task_node.key)
796 nodes_to_remove.add(task_node.init.key)
797 # Process the referenced datasets to see which ones are orphaned and
798 # need to be removed vs. just unresolved.
799 nodes_to_unresolve = []
800 for dataset_type_key in dataset_types:
801 related_tasks = set()
802 related_tasks.update(self._xgraph.predecessors(dataset_type_key))
803 related_tasks.update(self._xgraph.successors(dataset_type_key))
804 related_tasks.difference_update(nodes_to_remove)
805 if not related_tasks:
806 nodes_to_remove.add(dataset_type_key)
807 else:
808 nodes_to_unresolve.append(dataset_type_key)
809 # Checks and preparation complete; time to start the actual
810 # modification, during which it's hard to provide strong exception
811 # safety. Start by resetting the sort ordering.
812 self._reset()
813 try:
814 for dataset_type_key in nodes_to_unresolve:
815 self._xgraph.nodes[dataset_type_key]["instance"] = None
816 for task_node, referencing_subsets in task_nodes_and_subsets:
817 for subset_label in referencing_subsets:
818 self._task_subsets[subset_label].remove(task_node.label)
819 self._xgraph.remove_nodes_from(nodes_to_remove)
820 except Exception as err: # pragma: no cover
821 # There's no known way to get here, but we want to make it
822 # clear it's a big problem if we do.
823 raise PipelineGraphExceptionSafetyError(
824 "Error during task removal has left the graph in an inconsistent state."
825 ) from err
826 return task_nodes_and_subsets
828 def add_task_subset(self, subset_label: str, task_labels: Iterable[str], description: str = "") -> None:
829 """Add a label for a set of tasks that are already in the pipeline.
831 Parameters
832 ----------
833 subset_label : `str`
834 Label for this set of tasks.
835 task_labels : `~collections.abc.Iterable` [ `str` ]
836 Labels of the tasks to include in the set. All must already be
837 included in the graph.
838 description : `str`, optional
839 String description to associate with this label.
840 """
841 subset = TaskSubset(self._xgraph, subset_label, set(task_labels), description)
842 self._task_subsets[subset_label] = subset
844 def remove_task_subset(self, subset_label: str) -> None:
845 """Remove a labeled set of tasks."""
846 del self._task_subsets[subset_label]
848 ###########################################################################
849 #
850 # NetworkX Export Interface:
851 #
852 # - methods to export the PipelineGraph's content (or various subsets
853 # thereof) as NetworkX objects.
854 #
855 # These are particularly useful when writing tools to visualize the graph,
856 # while providing options for which aspects of the graph (tasks, dataset
857 # types, or both) to include, since all exported graphs have similar
858 # attributes regardless of their structure.
859 #
860 ###########################################################################
862 def make_xgraph(self) -> networkx.MultiDiGraph:
863 """Export a networkx representation of the full pipeline graph,
864 including both init and runtime edges.
866 Returns
867 -------
868 xgraph : `networkx.MultiDiGraph`
869 Directed acyclic graph with parallel edges.
871 Notes
872 -----
873 The returned graph uses `NodeKey` instances for nodes. Parallel edges
874 represent the same dataset type appearing in multiple connections for
875 the same task, and are hence rare. The connection name is used as the
876 edge key to disambiguate those parallel edges.
878 Almost all edges connect dataset type nodes to task or task init nodes
879 or vice versa, but there is also a special edge that connects each task
880 init node to its runtime node. The existence of these edges makes the
881 graph not quite bipartite, though its init-only and runtime-only
882 subgraphs are bipartite.
884 See `TaskNode`, `TaskInitNode`, `DatasetTypeNode`, `ReadEdge`, and
885 `WriteEdge` for the descriptive node and edge attributes added.
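
For example, a sketch that feeds the exported graph to a standard
networkx algorithm::

    import networkx

    xgraph = graph.make_xgraph()
    for generation in networkx.topological_generations(xgraph):
        print(list(generation))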
886 """
887 return self._transform_xgraph_state(self._xgraph.copy(), skip_edges=False)
889 def make_bipartite_xgraph(self, init: bool = False) -> networkx.MultiDiGraph:
890 """Return a bipartite networkx representation of just the runtime or
891 init-time pipeline graph.
893 Parameters
894 ----------
895 init : `bool`, optional
896 If `True` (`False` is default) return the graph of task
897 initialization nodes and init input/output dataset types, instead
898 of the graph of runtime task nodes and regular
899 input/output/prerequisite dataset types.
901 Returns
902 -------
903 xgraph : `networkx.MultiDiGraph`
904 Directed acyclic graph with parallel edges.
906 Notes
907 -----
908 The returned graph uses `NodeKey` instances for nodes. Parallel edges
909 represent the same dataset type appearing in multiple connections for
910 the same task, and are hence rare. The connection name is used as the
911 edge key to disambiguate those parallel edges.
913 This graph is bipartite because each dataset type node only has edges
914 that connect it to a task [init] node, and vice versa.
916 See `TaskNode`, `TaskInitNode`, `DatasetTypeNode`, `ReadEdge`, and
917 `WriteEdge` for the descriptive node and edge attributes added.
918 """
919 return self._transform_xgraph_state(
920 self._make_bipartite_xgraph_internal(init).copy(), skip_edges=False
921 )
923 def make_task_xgraph(self, init: bool = False) -> networkx.DiGraph:
924 """Return a networkx representation of just the tasks in the pipeline.
926 Parameters
927 ----------
928 init : `bool`, optional
929 If `True` (`False` is default) return the graph of task
930 initialization nodes, instead of the graph of runtime task nodes.
932 Returns
933 -------
934 xgraph : `networkx.DiGraph`
935 Directed acyclic graph with no parallel edges.
937 Notes
938 -----
939 The returned graph uses `NodeKey` instances for nodes. The dataset
940 types that link these tasks are not represented at all; edges have no
941 attributes, and there are no parallel edges.
943 See `TaskNode` and `TaskInitNode` for the descriptive node and
944 attributes added.
945 """
946 bipartite_xgraph = self._make_bipartite_xgraph_internal(init)
947 task_keys = [
948 key
949 for key, bipartite in bipartite_xgraph.nodes(data="bipartite")
950 if bipartite == NodeType.TASK.bipartite
951 ]
952 return self._transform_xgraph_state(
953 networkx.algorithms.bipartite.projected_graph(networkx.DiGraph(bipartite_xgraph), task_keys),
954 skip_edges=True,
955 )
957 def make_dataset_type_xgraph(self, init: bool = False) -> networkx.DiGraph:
958 """Return a networkx representation of just the dataset types in the
959 pipeline.
961 Parameters
962 ----------
963 init : `bool`, optional
964 If `True` (`False` is default) return the graph of init input and
965 output dataset types, instead of the graph of runtime (input,
966 output, prerequisite input) dataset types.
968 Returns
969 -------
970 xgraph : `networkx.DiGraph`
971 Directed acyclic graph with no parallel edges.
973 Notes
974 -----
975 The returned graph uses `NodeKey` instances for nodes. The tasks that
976 link these tasks are not represented at all; edges have no attributes,
977 and there are no parallel edges.
979 See `DatasetTypeNode` for the descriptive node and attributes added.
980 """
981 bipartite_xgraph = self._make_bipartite_xgraph_internal(init)
982 dataset_type_keys = [
983 key
984 for key, bipartite in bipartite_xgraph.nodes(data="bipartite")
985 if bipartite == NodeType.DATASET_TYPE.bipartite
986 ]
987 return self._transform_xgraph_state(
988 networkx.algorithms.bipartite.projected_graph(
989 networkx.DiGraph(bipartite_xgraph), dataset_type_keys
990 ),
991 skip_edges=True,
992 )
994 ###########################################################################
995 #
996 # Serialization Interface.
997 #
998 # Serialization of PipelineGraphs is currently experimental and may not be
999 # retained in the future. All serialization methods are
1000 # underscore-prefixed to ensure nobody mistakes them for a stable interface
1001 # (let alone a stable file format).
1002 #
1003 ###########################################################################
1005 @classmethod
1006 def _read_stream(
1007 cls, stream: BinaryIO, import_mode: TaskImportMode = TaskImportMode.REQUIRE_CONSISTENT_EDGES
1008 ) -> PipelineGraph:
1009 """Read a serialized `PipelineGraph` from a file-like object.
1011 Parameters
1012 ----------
1013 stream : `BinaryIO`
1014 File-like object opened for binary reading, containing
1015 gzip-compressed JSON.
1016 import_mode : `TaskImportMode`, optional
1017 Whether to import tasks, and how to reconcile any differences
1018 between the imported task's connections and those that were
1019 persisted with the graph. Default is to check that they are the
1020 same.
1022 Returns
1023 -------
1024 graph : `PipelineGraph`
1025 Deserialized pipeline graph.
1027 Raises
1028 ------
1029 PipelineGraphReadError
1030 Raised if the serialized `PipelineGraph` is not self-consistent.
1031 EdgesChangedError
1032 Raised if ``import_mode`` is
1033 `TaskImportMode.REQUIRE_CONSISTENT_EDGES` and the edges of a task
1034 did change after import and reconfiguration.
1036 Notes
1037 -----
1038 `PipelineGraph` serialization is currently experimental and may be
1039 removed or significantly changed in the future, with no deprecation
1040 period.
1041 """
1042 from .io import SerializedPipelineGraph
1044 with gzip.open(stream, "rb") as uncompressed_stream:
1045 data = json.load(uncompressed_stream)
1046 serialized_graph = SerializedPipelineGraph.parse_obj(data)
1047 return serialized_graph.deserialize(import_mode)
1049 @classmethod
1050 def _read_uri(
1051 cls,
1052 uri: ResourcePathExpression,
1053 import_mode: TaskImportMode = TaskImportMode.REQUIRE_CONSISTENT_EDGES,
1054 ) -> PipelineGraph:
1055 """Read a serialized `PipelineGraph` from a file at a URI.
1057 Parameters
1058 ----------
1059 uri : convertible to `lsst.resources.ResourcePath`
1060 URI to a gzip-compressed JSON file containing a serialized pipeline
1061 graph.
1062 import_mode : `TaskImportMode`, optional
1063 Whether to import tasks, and how to reconcile any differences
1064 between the imported task's connections and those that were
1065 persisted with the graph. Default is to check that they are the
1066 same.
1068 Returns
1069 -------
1070 graph : `PipelineGraph`
1071 Deserialized pipeline graph.
1073 Raises
1074 ------
1075 PipelineGraphReadError
1076 Raised if the serialized `PipelineGraph` is not self-consistent.
1077 EdgesChangedError
1078 Raised if ``import_mode`` is
1079 `TaskImportMode.REQUIRE_CONSISTENT_EDGES` and the edges of a task
1080 did change after import and reconfiguration.
1082 Notes
1083 -----
1084 `PipelineGraph` serialization is currently experimental and may be
1085 removed or significantly changed in the future, with no deprecation
1086 period.
1087 """
1088 uri = ResourcePath(uri)
1089 with uri.open("rb") as stream:
1090 return cls._read_stream(cast(BinaryIO, stream), import_mode=import_mode)
1092 def _write_stream(self, stream: BinaryIO) -> None:
1093 """Write the pipeline to a file-like object.
1095 Parameters
1096 ----------
1097 stream : `BinaryIO`
1098 File-like object opened for binary writing.
1100 Notes
1101 -----
1102 `PipelineGraph` serialization is currently experimental and may be
1103 removed or significantly changed in the future, with no deprecation
1104 period.
1106 The file format is gzipped JSON, and is intended to be human-readable,
1107 but it should not be considered a stable public interface for outside
1108 code, which should always use `PipelineGraph` methods (or at least the
1109 `io.SerializedPipelineGraph` class) to read these files.
1110 """
1111 from .io import SerializedPipelineGraph
1113 with gzip.open(stream, mode="wb") as compressed_stream:
1114 compressed_stream.write(
1115 SerializedPipelineGraph.serialize(self).json(exclude_defaults=True).encode("utf-8")
1116 )
1118 def _write_uri(self, uri: ResourcePathExpression) -> None:
1119 """Write the pipeline to a file given a URI.
1121 Parameters
1122 ----------
1123 uri : convertible to `lsst.resources.ResourcePath`
1124 URI to write to. May have a ``.json.gz`` extension or no extension (which
1125 will cause a ``.json.gz`` extension to be added).
1127 Notes
1128 -----
1129 `PipelineGraph` serialization is currently experimental and may be
1130 removed or significantly changed in the future, with no deprecation
1131 period.
1133 The file format is gzipped JSON, and is intended to be human-readable,
1134 but it should not be considered a stable public interface for outside
1135 code, which should always use `PipelineGraph` methods (or at least the
1136 `io.SerializedPipelineGraph` class) to read these files.
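
For example, a round-trip sketch (the path is hypothetical, and these
underscore-prefixed methods are experimental)::

    graph._write_uri("/tmp/pipeline_graph.json.gz")
    same_graph = PipelineGraph._read_uri("/tmp/pipeline_graph.json.gz")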
1137 """
1138 uri = ResourcePath(uri)
1139 extension = uri.getExtension()
1140 if not extension:
1141 uri = uri.updatedExtension(".json.gz")
1142 elif extension != ".json.gz":
1143 raise ValueError("Expanded pipeline files should always have a .json.gz extension.")
1144 with uri.open(mode="wb") as stream:
1145 self._write_stream(cast(BinaryIO, stream))
1147 def _import_and_configure(
1148 self, import_mode: TaskImportMode = TaskImportMode.REQUIRE_CONSISTENT_EDGES
1149 ) -> None:
1150 """Import the `PipelineTask` classes referenced by all task nodes and
1151 update those nodes accordingly.
1153 Parameters
1154 ----------
1155 import_mode : `TaskImportMode`, optional
1156 Whether to import tasks, and how to reconcile any differences
1157 between the imported task's connections and those that were
1158 persisted with the graph. Default is to check that they are the
1159 same. This method does nothing if this is
1160 `TaskImportMode.DO_NOT_IMPORT`.
1162 Raises
1163 ------
1164 EdgesChangedError
1165 Raised if ``import_mode`` is
1166 `TaskImportMode.REQUIRE_CONSISTENT_EDGES` and the edges of a task
1167 did change after import and reconfiguration.
1169 Notes
1170 -----
1171 This method shouldn't need to be called unless the graph was
1172 deserialized without importing and configuring immediately, which is
1173 not the default behavior (but it can greatly speed up deserialization).
1174 If all tasks have already been imported this does nothing.
1176 Importing and configuring a task can change its
1177 `~TaskNode.task_class_name` or `~TaskNode.get_config_str` output,
1178 usually because the software used to read a serialized graph is newer
1179 than the software used to write it (e.g. a new config option has been
1180 added, or the task was moved to a new module with a forwarding alias
1181 left behind). These changes are allowed by
1182 `TaskImportMode.REQUIRE_CONSISTENT_EDGES`.
1184 If importing and configuring a task causes its edges to change, any
1185 dataset type nodes linked to those edges will be reset to the
1186 unresolved state.
1187 """
1188 if import_mode is TaskImportMode.DO_NOT_IMPORT:
1189 return
1190 rebuild = (
1191 import_mode is TaskImportMode.REQUIRE_CONSISTENT_EDGES
1192 or import_mode is TaskImportMode.OVERRIDE_EDGES
1193 )
1194 updates: dict[str, TaskNode] = {}
1195 node_key: NodeKey
1196 for node_key, node_state in self._xgraph.nodes.items():
1197 if node_key.node_type is NodeType.TASK:
1198 task_node: TaskNode = node_state["instance"]
1199 new_task_node = task_node._imported_and_configured(rebuild)
1200 if new_task_node is not task_node:
1201 updates[task_node.label] = new_task_node
1202 self._replace_task_nodes(
1203 updates,
1204 check_edges_unchanged=(import_mode is TaskImportMode.REQUIRE_CONSISTENT_EDGES),
1205 assume_edges_unchanged=(import_mode is TaskImportMode.ASSUME_CONSISTENT_EDGES),
1206 message_header=(
1207 "In task with label {task_label!r}, persisted edges (A) "
1208 "differ from imported and configured edges (B):"
1209 ),
1210 )
1212 ###########################################################################
1213 #
1214 # Advanced PipelineGraph Inspection Interface:
1215 #
1216 # - methods to iterate over all nodes and edges, utilizing NodeKeys;
1217 #
1218 # - methods to find overall inputs and group nodes by their dimensions,
1219 # which are important operations for QuantumGraph generation.
1220 #
1221 ###########################################################################
1223 def iter_edges(self, init: bool = False) -> Iterator[Edge]:
1224 """Iterate over edges in the graph.
1226 Parameters
1227 ----------
1228 init : `bool`, optional
1229 If `True` (`False` is default) iterate over the edges between task
1230 initialization node and init input/output dataset types, instead of
1231 the runtime task nodes and regular input/output/prerequisite
1232 dataset types.
1234 Returns
1235 -------
1236 edges : `~collections.abc.Iterator` [ `Edge` ]
1237 A lazy iterator over `Edge` (`WriteEdge` or `ReadEdge`) instances.
1239 Notes
1240 -----
1241 This method always returns *either* init edges or runtime edges, never
1242 both. The full (internal) graph that contains both also includes a
1243 special edge that connects each task init node to its runtime node;
1244 that is also never returned by this method, since it is never a part of
1245 the init-only or runtime-only subgraphs.
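
For example, a sketch that prints the task label and dataset type name
for each runtime edge::

    for edge in graph.iter_edges():
        print(edge.task_label, edge.parent_dataset_type_name)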
1246 """
1247 edge: Edge
1248 for _, _, edge in self._xgraph.edges(data="instance"):
1249 if edge is not None and edge.is_init == init:
1250 yield edge
1252 def iter_nodes(
1253 self,
1254 ) -> Iterator[
1255 tuple[Literal[NodeType.TASK_INIT], str, TaskInitNode]
1256 | tuple[Literal[NodeType.TASK], str, TaskNode]
1257 | tuple[Literal[NodeType.DATASET_TYPE], str, DatasetTypeNode | None]
1258 ]:
1259 """Iterate over nodes in the graph.
1261 Returns
1262 -------
1263 nodes : `~collections.abc.Iterator` [ `tuple` ]
1264 A lazy iterator over all of the nodes in the graph. Each yielded
1265 element is a tuple of:
1267 - the node type enum value (`NodeType`);
1268 - the string name for the node (task label or parent dataset type
1269 name);
1270 - the node value (`TaskNode`, `TaskInitNode`, `DatasetTypeNode`,
1271 or `None` for dataset type nodes that have not been resolved).
1272 """
1273 key: NodeKey
1274 if self._sorted_keys is not None:
1275 for key in self._sorted_keys:
1276 yield key.node_type, key.name, self._xgraph.nodes[key]["instance"] # type: ignore
1277 else:
1278 for key, node in self._xgraph.nodes(data="instance"):
1279 yield key.node_type, key.name, node # type: ignore
1281 def iter_overall_inputs(self) -> Iterator[tuple[str, DatasetTypeNode | None]]:
1282 """Iterate over all of the dataset types that are consumed but not
1283 produced by the graph.
1285 Returns
1286 -------
1287 dataset_types : `~collections.abc.Iterator` [ `tuple` ]
1288 A lazy iterator over the overall-input dataset types (including
1289 overall init inputs and prerequisites). Each yielded element is a
1290 tuple of:
1292 - the parent dataset type name;
1293 - the resolved `DatasetTypeNode`, or `None` if the dataset type has
1294 not been resolved.
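
For example, a sketch that collects the names of all overall inputs::

    overall_input_names = [name for name, _ in graph.iter_overall_inputs()]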
1295 """
1296 for generation in networkx.algorithms.dag.topological_generations(self._xgraph):
1297 key: NodeKey
1298 for key in generation:
1299 # While we expect all tasks to have at least one input and
1300 # hence never appear in the first topological generation, that
1301 # is not true of task init nodes.
1302 if key.node_type is NodeType.DATASET_TYPE:
1303 yield key.name, self._xgraph.nodes[key]["instance"]
1304 return
1306 def group_by_dimensions(
1307 self, prerequisites: bool = False
1308 ) -> dict[DimensionGraph, tuple[dict[str, TaskNode], dict[str, DatasetTypeNode]]]:
1309 """Group this graph's tasks and dataset types by their dimensions.
1311 Parameters
1312 ----------
1313 prerequisites : `bool`, optional
1314 If `True`, include prerequisite dataset types as well as regular
1315 input and output datasets (including intermediates).
1317 Returns
1318 -------
1319 groups : `dict` [ `DimensionGraph`, `tuple` ]
1320 A dictionary of groups keyed by `DimensionGraph`, in which each
1321 value is a tuple of:
1323 - a `dict` of `TaskNode` instances, keyed by task label;
1324 - a `dict` of `DatasetTypeNode` instances, keyed by
1325 dataset type name;
1327 where both hold only the tasks and dataset types that have those dimensions.
1329 Notes
1330 -----
1331 Init inputs and outputs are always included, but always have empty
1332 dimensions and are hence all grouped together.
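
For example, a sketch over a fully resolved graph::

    for dimensions, (tasks, dataset_types) in graph.group_by_dimensions().items():
        print(dimensions, sorted(tasks), sorted(dataset_types))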
1333 """
1334 result: dict[DimensionGraph, tuple[dict[str, TaskNode], dict[str, DatasetTypeNode]]] = {}
1335 next_new_value: tuple[dict[str, TaskNode], dict[str, DatasetTypeNode]] = ({}, {})
1336 for task_label, task_node in self.tasks.items():
1337 if task_node.dimensions is None:
1338 raise UnresolvedGraphError(f"Task with label {task_label!r} has not been resolved.")
1339 if (group := result.setdefault(task_node.dimensions, next_new_value)) is next_new_value:
1340 next_new_value = ({}, {}) # make new dicts for next time
1341 group[0][task_node.label] = task_node
1342 for dataset_type_name, dataset_type_node in self.dataset_types.items():
1343 if dataset_type_node is None:
1344 raise UnresolvedGraphError(f"Dataset type {dataset_type_name!r} has not been resolved.")
1345 if not dataset_type_node.is_prerequisite or prerequisites:
1346 if (
1347 group := result.setdefault(dataset_type_node.dataset_type.dimensions, next_new_value)
1348 ) is next_new_value:
1349 next_new_value = ({}, {}) # make new dicts for next time
1350 group[1][dataset_type_node.name] = dataset_type_node
1351 return result
1353 ###########################################################################
1354 #
1355 # Class- and Package-Private Methods.
1356 #
1357 ###########################################################################
1359 def _iter_task_defs(self) -> Iterator[TaskDef]:
1360 """Iterate over this pipeline as a sequence of `TaskDef` instances.
1362 Notes
1363 -----
1364 This is a package-private method intended to aid in the transition to a
1365 codebase more fully integrated with the `PipelineGraph` class, in which
1366 both `TaskDef` and `PipelineDatasetTypes` are expected to go away, and
1367 much of the functionality on the `Pipeline` class will be moved to
1368 `PipelineGraph` as well.
1370 Raises
1371 ------
1372 TaskNotImportedError
1373 Raised if `TaskNode.is_imported` is `False` for any task.
1374 """
1375 from ..pipeline import TaskDef
1377 for node in self._tasks.values():
1378 yield TaskDef(
1379 config=node.config,
1380 taskClass=node.task_class,
1381 label=node.label,
1382 connections=node._get_imported_data().connections,
1383 )
1385 def _init_from_args(
1386 self,
1387 xgraph: networkx.MultiDiGraph | None,
1388 sorted_keys: Sequence[NodeKey] | None,
1389 task_subsets: dict[str, TaskSubset] | None,
1390 description: str,
1391 universe: DimensionUniverse | None,
1392 data_id: DataId | None,
1393 ) -> None:
1394 """Initialize the graph with possibly-nontrivial arguments.
1396 Parameters
1397 ----------
1398 xgraph : `networkx.MultiDiGraph` or `None`
1399 The backing networkx graph, or `None` to create an empty one.
1400 This graph has `NodeKey` instances for nodes and the same structure
1401 as the graph exported by `make_xgraph`, but its nodes and edges
1402 have a single ``instance`` attribute that holds a `TaskNode`,
1403 `TaskInitNode`, `DatasetTypeNode` (or `None`), `ReadEdge`, or
1404 `WriteEdge` instance.
1405 sorted_keys : `Sequence` [ `NodeKey` ] or `None`
1406 Topologically sorted sequence of node keys, or `None` if the graph
1407 is not sorted.
1408 task_subsets : `dict` [ `str`, `TaskSubset` ]
1409 Labeled subsets of tasks. Values must be constructed with
1410 ``xgraph`` as their parent graph.
1411 description : `str`
1412 String description for this pipeline.
1413 universe : `lsst.daf.butler.DimensionUniverse` or `None`
1414 Definitions of all dimensions.
1415 data_id : `lsst.daf.butler.DataCoordinate` or other data ID mapping.
1416 Data ID that represents a constraint on all quanta generated from
1417 this pipeline.
1419 Notes
1420 -----
1421 Only empty `PipelineGraph` instances should be constructed directly by
1422 users, which sets the signature of ``__init__`` itself, but methods on
1423 `PipelineGraph` and its helper classes need to be able to create them
1424 with state. Those methods can call this after calling ``__new__``
1425 manually, skipping ``__init__``.
1426 """
1427 self._xgraph = xgraph if xgraph is not None else networkx.MultiDiGraph()
1428 self._sorted_keys: Sequence[NodeKey] | None = None
1429 self._task_subsets = task_subsets if task_subsets is not None else {}
1430 self._description = description
1431 self._tasks = TaskMappingView(self._xgraph)
1432 self._dataset_types = DatasetTypeMappingView(self._xgraph)
1433 self._raw_data_id: dict[str, Any]
1434 if isinstance(data_id, DataCoordinate):
1435 if universe is None:
1436 universe = data_id.universe
1437 else:
1438 assert universe is data_id.universe, "data_id.universe and given universe differ"
1439 self._raw_data_id = data_id.byName()
1440 elif data_id is None:
1441 self._raw_data_id = {}
1442 else:
1443 self._raw_data_id = dict(data_id)
1444 self._universe = universe
1445 if sorted_keys is not None:
1446 self._reorder(sorted_keys)
1448 def _make_bipartite_xgraph_internal(self, init: bool) -> networkx.MultiDiGraph:
1449 """Make a bipartite init-only or runtime-only internal subgraph.
1451 See `make_bipartite_xgraph` for parameters and return values.
1453 Notes
1454 -----
1455 This method returns a view of the `PipelineGraph` object's internal
1456 backing graph, and hence should only be called in methods that copy the
1457 result either explicitly or by running a copying algorithm before
1458 returning it to the user.
1459 """
1460 return self._xgraph.edge_subgraph([edge.key for edge in self.iter_edges(init)])
1462 def _transform_xgraph_state(self, xgraph: _G, skip_edges: bool) -> _G:
1463 """Transform networkx graph attributes in-place from the internal
1464 "instance" attributes to the documented exported attributes.
1466 Parameters
1467 ----------
1468 xgraph : `networkx.DiGraph` or `networkx.MultiDiGraph`
1469 Graph whose state should be transformed.
1470 skip_edges : `bool`
1471 If `True`, do not transform edge state.
1473 Returns
1474 -------
1475 xgraph : `networkx.DiGraph` or `networkx.MultiDiGraph`
1476 The same object passed in, after modification.
1478 Notes
1479 -----
1480 This should be called after making a copy of the internal graph but
1481 before any projection down to just task or dataset type nodes, since
1482 it assumes stateful edges.
1483 """
1484 state: dict[str, Any]
1485 for state in xgraph.nodes.values():
1486 node_value: TaskInitNode | TaskNode | DatasetTypeNode | None = state.pop("instance")
1487 if node_value is not None:
1488 state.update(node_value._to_xgraph_state())
1489 if not skip_edges:
1490 for _, _, state in xgraph.edges(data=True):
1491 edge: Edge | None = state.pop("instance", None)
1492 if edge is not None:
1493 state.update(edge._to_xgraph_state())
1494 return xgraph
1496 def _replace_task_nodes(
1497 self,
1498 updates: Mapping[str, TaskNode],
1499 check_edges_unchanged: bool,
1500 assume_edges_unchanged: bool,
1501 message_header: str,
1502 ) -> None:
1503 """Replace task nodes and update edges and dataset type nodes
1504 accordingly.
1506 Parameters
1507 ----------
1508 updates : `Mapping` [ `str`, `TaskNode` ]
1509 New task nodes with task label keys. All keys must be task labels
1510 that are already present in the graph.
1511 check_edges_unchanged : `bool`, optional
1512 If `True`, require the edges (connections) of the modified tasks to
1513 remain unchanged after importing and configuring each task, and
1514 verify that this is the case.
1515 assume_edges_unchanged : `bool`, optional
1516 If `True`, the caller declares that the edges (connections) of the
1517 modified tasks will remain unchanged importing and configuring each
1518 task, and that it is unnecessary to check this.
1519 message_header : `str`
1520 Template for `str.format` with a single ``task_label`` placeholder
1521 to use as the first line in `EdgesChangedError` messages that show
1522 the differences between new task edges and old task edges. Should
1523 include the fact that the rest of the message will refer to the old
1524 task as "A" and the new task as "B", and end with a colon.
1526 Raises
1527 ------
1528 ValueError
1529 Raised if ``assume_edges_unchanged`` and ``check_edges_unchanged``
1530 are both `True`, or if a full config is provided for a task after
1531 another full config or an override has already been provided.
1532 EdgesChangedError
1533 Raised if ``check_edges_unchanged=True`` and the edges of a task do
1534 change.
1535 """
1536 deep: dict[str, TaskNode] = {}
1537 shallow: dict[str, TaskNode] = {}
1538 if assume_edges_unchanged:
1539 if check_edges_unchanged:
1540 raise ValueError("Cannot simultaneously assume and check that edges have not changed.")
1541 shallow.update(updates)
1542 else:
1543 for task_label, new_task_node in updates.items():
1544 old_task_node = self.tasks[task_label]
1545 messages = old_task_node.diff_edges(new_task_node)
1546 if messages:
1547 if check_edges_unchanged:
1548 messages.insert(0, message_header.format(task_label=task_label))
1549 raise EdgesChangedError("\n".join(messages))
1550 else:
1551 deep[task_label] = new_task_node
1552 else:
1553 shallow[task_label] = new_task_node
1554 try:
1555 if deep:
1556 removed = self.remove_tasks(deep.keys(), drop_from_subsets=True)
1557 self.add_task_nodes(deep.values())
1558 for replaced_task_node, referencing_subsets in removed:
1559 for subset_label in referencing_subsets:
1560 self._task_subsets[subset_label].add(replaced_task_node.label)
1561 for task_node in shallow.values():
1562 self._xgraph.nodes[task_node.key]["instance"] = task_node
1563 self._xgraph.nodes[task_node.init.key]["instance"] = task_node.init
1564 except PipelineGraphExceptionSafetyError: # pragma: no cover
1565 raise
1566 except Exception as err: # pragma: no cover
1567 # There's no known way to get here, but we want to make it clear
1568 # it's a big problem if we do.
1569 raise PipelineGraphExceptionSafetyError(
1570 "Error while replacing tasks has left the graph in an inconsistent state."
1571 ) from err
1573 def _append_graph_data_from_edge(
1574 self,
1575 node_data: list[tuple[NodeKey, dict[str, Any]]],
1576 edge_data: list[tuple[NodeKey, NodeKey, str, dict[str, Any]]],
1577 edge: Edge,
1578 ) -> None:
1579 """Append networkx state dictionaries for an edge and the corresponding
1580 dataset type node.
1582 Parameters
1583 ----------
1584 node_data : `list`
1585 List of node keys and state dictionaries. A node is appended if
1586 one does not already exist for this dataset type.
1587 edge_data : `list`
1588 List of node key pairs, connection names, and state dictionaries
1589 for edges.
1590 edge : `Edge`
1591 New edge being processed.
1592 """
1593 if (existing_dataset_type_state := self._xgraph.nodes.get(edge.dataset_type_key)) is not None:
1594 existing_dataset_type_state["instance"] = None
1595 else:
1596 node_data.append(
1597 (
1598 edge.dataset_type_key,
1599 {
1600 "instance": None,
1601 "bipartite": NodeType.DATASET_TYPE.bipartite,
1602 },
1603 )
1604 )
1605 edge_data.append(
1606 edge.nodes
1607 + (
1608 edge.connection_name,
1609 {"instance": edge},
1610 )
1611 )
1613 def _reorder(self, sorted_keys: Sequence[NodeKey]) -> None:
1614 """Set the order of all views of this graph from the given
1615 topologically sorted sequence of node keys.
1616 """
1617 self._sorted_keys = sorted_keys
1618 self._tasks._reorder(sorted_keys)
1619 self._dataset_types._reorder(sorted_keys)
1621 def _reset(self) -> None:
1622 """Reset all views of this graph following a modification that
1623 might invalidate them.
1624 """
1625 self._sorted_keys = None
1626 self._tasks._reset()
1627 self._dataset_types._reset()
1629 _xgraph: networkx.MultiDiGraph
1630 _sorted_keys: Sequence[NodeKey] | None
1631 _task_subsets: dict[str, TaskSubset]
1632 _description: str
1633 _tasks: TaskMappingView
1634 _dataset_types: DatasetTypeMappingView
1635 _raw_data_id: dict[str, Any]
1636 _universe: DimensionUniverse | None