Coverage for python/lsst/pipe/base/pipeline_graph/_pipeline_graph.py: 19% of 373 statements (coverage.py v7.3.1, created at 2023-09-13 09:52 +0000)
1# This file is part of pipe_base.
2#
3# Developed for the LSST Data Management System.
4# This product includes software developed by the LSST Project
5# (http://www.lsst.org).
6# See the COPYRIGHT file at the top-level directory of this distribution
7# for details of code ownership.
8#
9# This software is dual licensed under the GNU General Public License and also
10# under a 3-clause BSD license. Recipients may choose which of these licenses
11# to use; please see the files gpl-3.0.txt and/or bsd_license.txt,
12# respectively. If you choose the GPL option then the following text applies
13# (but note that there is still no warranty even if you opt for BSD instead):
14#
15# This program is free software: you can redistribute it and/or modify
16# it under the terms of the GNU General Public License as published by
17# the Free Software Foundation, either version 3 of the License, or
18# (at your option) any later version.
19#
20# This program is distributed in the hope that it will be useful,
21# but WITHOUT ANY WARRANTY; without even the implied warranty of
22# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
23# GNU General Public License for more details.
24#
25# You should have received a copy of the GNU General Public License
26# along with this program. If not, see <http://www.gnu.org/licenses/>.
27from __future__ import annotations
29__all__ = ("PipelineGraph",)
31import gzip
32import itertools
33import json
34from collections.abc import Iterable, Iterator, Mapping, Sequence
35from typing import TYPE_CHECKING, Any, BinaryIO, Literal, TypeVar, cast
37import networkx
38import networkx.algorithms.bipartite
39import networkx.algorithms.dag
40from lsst.daf.butler import DataCoordinate, DataId, DimensionGraph, DimensionUniverse, Registry
41from lsst.resources import ResourcePath, ResourcePathExpression
43from ._dataset_types import DatasetTypeNode
44from ._edges import Edge, ReadEdge, WriteEdge
45from ._exceptions import (
46 DuplicateOutputError,
47 EdgesChangedError,
48 PipelineDataCycleError,
49 PipelineGraphError,
50 PipelineGraphExceptionSafetyError,
51 UnresolvedGraphError,
52)
53from ._mapping_views import DatasetTypeMappingView, TaskMappingView
54from ._nodes import NodeKey, NodeType
55from ._task_subsets import TaskSubset
56from ._tasks import TaskImportMode, TaskInitNode, TaskNode, _TaskNodeImportedData
58if TYPE_CHECKING:
59 from ..config import PipelineTaskConfig
60 from ..connections import PipelineTaskConnections
61 from ..pipeline import TaskDef
62 from ..pipelineTask import PipelineTask
65_G = TypeVar("_G", bound=networkx.DiGraph | networkx.MultiDiGraph)
68class PipelineGraph:
69 """A graph representation of fully-configured pipeline.
71 `PipelineGraph` instances are typically constructed by calling
72 `.Pipeline.to_graph`, but in rare cases constructing and then populating an
73 empty one may be preferable.
75 Parameters
76 ----------
77 description : `str`, optional
78 String description for this pipeline.
79 universe : `lsst.daf.butler.DimensionUniverse`, optional
80 Definitions for all butler dimensions. If not provided, some
81 attributes will not be available until `resolve` is called.
82 data_id : `lsst.daf.butler.DataCoordinate` or other data ID, optional
83 Data ID that represents a constraint on all quanta generated by this
84 pipeline. This typically just holds the instrument constraint included
85 in the pipeline definition, if there was one.
86 """
88 ###########################################################################
89 #
90 # Simple Pipeline Graph Inspection Interface:
91 #
92 # - for inspecting graph structure, not modifying it (except to sort and
93 # resolve);
94 #
95 # - no NodeKey objects, just string dataset type name and task label keys;
96 #
97 # - graph structure is represented as a pair of mappings, with methods to
98 # find neighbors and edges of nodes.
99 #
100 ###########################################################################
102 def __init__(
103 self,
104 *,
105 description: str = "",
106 universe: DimensionUniverse | None = None,
107 data_id: DataId | None = None,
108 ) -> None:
109 self._init_from_args(
110 xgraph=None,
111 sorted_keys=None,
112 task_subsets=None,
113 description=description,
114 universe=universe,
115 data_id=data_id,
116 )
118 def __repr__(self) -> str:
119 return f"{type(self).__name__}({self.description!r}, tasks={self.tasks!s})"
121 @property
122 def description(self) -> str:
123 """String description for this pipeline."""
124 return self._description
126 @description.setter
127 def description(self, value: str) -> None:
128 # Docstring in getter.
129 self._description = value
131 @property
132 def universe(self) -> DimensionUniverse | None:
133 """Definitions for all butler dimensions."""
134 return self._universe
136 @property
137 def data_id(self) -> DataCoordinate:
138 """Data ID that represents a constraint on all quanta generated from
139 this pipeline.
141 This may not be available unless `universe` is not `None`.
142 """
143 return DataCoordinate.standardize(self._raw_data_id, universe=self.universe)
145 @property
146 def tasks(self) -> TaskMappingView:
147 """A mapping view of the tasks in the graph.
149 This mapping has `str` task label keys and `TaskNode` values. Iteration
150 is topologically and deterministically ordered if and only if `sort`
151 has been called since the last modification to the graph.
152 """
153 return self._tasks
155 @property
156 def dataset_types(self) -> DatasetTypeMappingView:
157 """A mapping view of the dataset types in the graph.
159 This mapping has `str` parent dataset type name keys, but only provides
160 access to its `DatasetTypeNode` values if `resolve` has been called
161 since the last modification involving a task that uses a dataset type.
162 See `DatasetTypeMappingView` for details.
163 """
164 return self._dataset_types
166 @property
167 def task_subsets(self) -> Mapping[str, TaskSubset]:
168 """A mapping of all labeled subsets of tasks.
170 Keys are subset labels, values are sets of task labels. See
171 `TaskSubset` for more information.
173 Use `add_task_subset` to add a new subset. The subsets themselves may
174 be modified in-place.
175 """
176 return self._task_subsets
178 @property
179 def is_sorted(self) -> bool:
180 """Whether this graph's tasks and dataset types are topologically
181 sorted with the exact same deterministic tiebreakers that `sort` would
182 apply.
184 This may perform (and then discard) a full sort if `has_been_sorted` is
185 `False`. If the goal is to obtain a sorted graph, it is better to just
186 call `sort` without guarding that with an ``if not graph.is_sorted``
187 check.
188 """
189 if self._sorted_keys is not None:
190 return True
191 return all(
192 sorted == unsorted
193 for sorted, unsorted in zip(
194 networkx.lexicographical_topological_sort(self._xgraph), self._xgraph, strict=True
195 )
196 )
198 @property
199 def has_been_sorted(self) -> bool:
200 """Whether this graph's tasks and dataset types have been
201 topologically sorted (with unspecified but deterministic tiebreakers)
202 since the last modification to the graph.
204 This may return `False` if the graph *happens* to be sorted but `sort`
205 was never called, but it is potentially much faster than `is_sorted`,
206 which may attempt (and then discard) a full sort if `has_been_sorted`
207 is `False`.
208 """
209 return self._sorted_keys is not None
211 def sort(self) -> None:
212 """Sort this graph's nodes topologically with deterministic (but
213 unspecified) tiebreakers.
215 This does nothing if the graph is already known to be sorted.
216 """
217 if self._sorted_keys is None:
218 try:
219 sorted_keys: Sequence[NodeKey] = list(networkx.lexicographical_topological_sort(self._xgraph))
220 except networkx.NetworkXUnfeasible as err: # pragma: no cover
221 # Shouldn't be possible to get here, because we check for cycles
222 # when adding tasks, but we guard against it anyway.
223 cycle = networkx.find_cycle(self._xgraph)
224 raise PipelineDataCycleError(
225 f"Cycle detected while attempting to sort graph: {cycle}."
226 ) from err
227 self._reorder(sorted_keys)
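# --- Editorial usage sketch (not part of the original module). `sort` is
# idempotent and cheap when the graph is already known-sorted, so callers can
# invoke it unconditionally rather than guarding with `is_sorted`, which may
# perform (and discard) a full sort. Assumes `graph` is any `PipelineGraph`.
def sorted_task_labels(graph: PipelineGraph) -> list[str]:
    graph.sort()  # no-op if the graph is already known to be sorted
    return list(graph.tasks)  # iteration is now topologically ordered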
229 def copy(self) -> PipelineGraph:
230 """Return a copy of this graph that copies all mutable state."""
231 xgraph = self._xgraph.copy()
232 result = PipelineGraph.__new__(PipelineGraph)
233 result._init_from_args(
234 xgraph,
235 self._sorted_keys,
236 task_subsets={
237 k: TaskSubset(xgraph, v.label, set(v._members), v.description)
238 for k, v in self._task_subsets.items()
239 },
240 description=self._description,
241 universe=self.universe,
242 data_id=self._raw_data_id,
243 )
244 return result
246 def __copy__(self) -> PipelineGraph:
247 # Fully shallow copies are dangerous; we don't want shared mutable
248 # state to lead to broken class invariants.
249 return self.copy()
251 def __deepcopy__(self, memo: dict) -> PipelineGraph:
252 # Genuine deep copies are unnecessary, since we should only ever care
253 # that mutable state is copied.
254 return self.copy()
256 def producing_edge_of(self, dataset_type_name: str) -> WriteEdge | None:
257 """Return the `WriteEdge` that links the producing task to the named
258 dataset type.
260 Parameters
261 ----------
262 dataset_type_name : `str`
263 Dataset type name. Must not be a component.
265 Returns
266 -------
267 edge : `WriteEdge` or `None`
268 Producing edge or `None` if there isn't one in this graph.
270 Raises
271 ------
272 DuplicateOutputError
273 Raised if there are multiple tasks defined to produce this dataset
274 type. This is only possible if the graph's dataset types are not
275 resolved.
277 Notes
278 -----
279 On resolved graphs, it may be slightly more efficient to use::
281 graph.dataset_types[dataset_type_name].producing_edge
283 but this method works on graphs with unresolved dataset types as well.
284 """
285 producer: str | None = None
286 producing_edge: WriteEdge | None = None
287 for _, _, producing_edge in self._xgraph.in_edges(
288 NodeKey(NodeType.DATASET_TYPE, dataset_type_name), data="instance"
289 ):
290 assert producing_edge is not None, "Should only be None if we never loop."
291 if producer is not None:
292 raise DuplicateOutputError(
293 f"Dataset type {dataset_type_name!r} is produced by both {producing_edge.task_label!r} "
294 f"and {producer!r}."
295 )
296 producer = producing_edge.task_label
297 return producing_edge
298 def consuming_edges_of(self, dataset_type_name: str) -> list[ReadEdge]:
299 """Return the `ReadEdge` objects that link the named dataset type to
300 the tasks that consume it.
302 Parameters
303 ----------
304 dataset_type_name : `str`
305 Dataset type name. Must not be a component.
307 Returns
308 -------
309 edges : `list` [ `ReadEdge` ]
310 Edges that connect this dataset type to the tasks that consume it.
312 Notes
313 -----
314 On resolved graphs, it may be slightly more efficient to use::
316 graph.dataset_types[dataset_type_name].consuming_edges
318 but this method works on graphs with unresolved dataset types as well.
319 """
320 return [
321 edge
322 for _, _, edge in self._xgraph.out_edges(
323 NodeKey(NodeType.DATASET_TYPE, dataset_type_name), data="instance"
324 )
325 ]
327 def producer_of(self, dataset_type_name: str) -> TaskNode | TaskInitNode | None:
328 """Return the `TaskNode` or `TaskInitNode` that writes the given
329 dataset type.
331 Parameters
332 ----------
333 dataset_type_name : `str`
334 Dataset type name. Must not be a component.
336 Returns
337 -------
338 node : `TaskNode`, `TaskInitNode`, or `None`
339 Producing node or `None` if there isn't one in this graph.
341 Raises
342 ------
343 DuplicateOutputError
344 Raised if there are multiple tasks defined to produce this dataset
345 type. This is only possible if the graph's dataset types are not
346 resolved.
347 """
348 if (producing_edge := self.producing_edge_of(dataset_type_name)) is not None:
349 return self._xgraph.nodes[producing_edge.task_key]["instance"]
350 return None
352 def consumers_of(self, dataset_type_name: str) -> list[TaskNode | TaskInitNode]:
353 """Return the `TaskNode` and/or `TaskInitNode` objects that read
354 the given dataset type.
356 Parameters
357 ----------
358 dataset_type_name : `str`
359 Dataset type name. Must not be a component.
361 Returns
362 -------
363 nodes : `list` [ `TaskNode` or `TaskInitNode` ]
364 Task nodes that consume this dataset type.
366 Notes
367 -----
368 On resolved graphs, it may be slightly more efficient to use::
370 graph.dataset_types[dataset_type_name].consuming_edges
372 but this method works on graphs with unresolved dataset types as well.
373 """
374 return [
375 self._xgraph.nodes[consuming_edge.task_key]["instance"]
376 for consuming_edge in self.consuming_edges_of(dataset_type_name)
377 ]
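# --- Editorial usage sketch (not part of the original module): tracing a
# dataset type through the simple inspection API. The dataset type name is a
# caller-supplied placeholder; this works on unresolved graphs as well.
def describe_dataset_type(graph: PipelineGraph, name: str) -> str:
    producer = graph.producer_of(name)
    producer_label = producer.label if producer is not None else "<overall input>"
    consumer_labels = ", ".join(node.label for node in graph.consumers_of(name))
    return f"{name}: produced by {producer_label}; consumed by {consumer_labels or '<none>'}"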
379 def inputs_of(self, task_label: str, init: bool = False) -> dict[str, DatasetTypeNode | None]:
380 """Return the dataset types that are inputs to a task.
382 Parameters
383 ----------
384 task_label : `str`
385 Label for the task in the pipeline.
386 init : `bool`, optional
387 If `True`, return init-input dataset types instead of runtime
388 (including prerequisite) inputs.
390 Returns
391 -------
392 inputs : `dict` [ `str`, `DatasetTypeNode` or `None` ]
393 Dictionary with parent dataset type name keys and either
394 `DatasetTypeNode` values (if the dataset type has been resolved)
395 or `None` values.
397 Notes
398 -----
399 To get the input edges of a task or task init node (which provide
401 information about storage class overrides and components) use::
402 graph.tasks[task_label].iter_all_inputs()
404 or
406 graph.tasks[task_label].init.iter_all_inputs()
408 or the various mapping attributes of the `TaskNode` and `TaskInitNode`
409 class.
410 """
411 node: TaskNode | TaskInitNode = self.tasks[task_label] if not init else self.tasks[task_label].init
412 return {
413 edge.parent_dataset_type_name: self._xgraph.nodes[edge.dataset_type_key]["instance"]
414 for edge in node.iter_all_inputs()
415 }
417 def outputs_of(
418 self, task_label: str, init: bool = False, include_automatic_connections: bool = True
419 ) -> dict[str, DatasetTypeNode | None]:
420 """Return the dataset types that are outputs of a task.
422 Parameters
423 ----------
424 task_label : `str`
425 Label for the task in the pipeline.
426 init : `bool`, optional
427 If `True`, return init-output dataset types instead of runtime
428 outputs.
429 include_automatic_connections : `bool`, optional
430 Whether to include automatic connections such as configs, metadata,
431 and logs.
433 Returns
434 -------
435 outputs : `dict` [ `str`, `DatasetTypeNode` or `None` ]
436 Dictionary with parent dataset type name keys and either
437 `DatasetTypeNode` values (if the dataset type has been resolved)
438 or `None` values.
440 Notes
441 -----
442 To get the output edges of a task or task init node (which provide
443 information about storage class overrides and components) use::
445 graph.tasks[task_label].iter_all_outputs()
447 or
449 graph.tasks[task_label].init.iter_all_outputs()
451 or the various mapping attributes of the `TaskNode` and `TaskInitNode`
452 class.
453 """
454 node: TaskNode | TaskInitNode = self.tasks[task_label] if not init else self.tasks[task_label].init
455 iterable = node.iter_all_outputs() if include_automatic_connections else node.outputs.values()
456 return {
457 edge.parent_dataset_type_name: self._xgraph.nodes[edge.dataset_type_key]["instance"]
458 for edge in iterable
459 }
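# --- Editorial usage sketch (not part of the original module): summarizing a
# task's connections by name via `inputs_of`/`outputs_of`. The dict values are
# `None` until `resolve` has been called, so only the keys are used here.
def summarize_task_io(graph: PipelineGraph, task_label: str) -> dict[str, list[str]]:
    return {
        "inputs": sorted(graph.inputs_of(task_label)),
        "outputs": sorted(graph.outputs_of(task_label)),
        "init_inputs": sorted(graph.inputs_of(task_label, init=True)),
        "init_outputs": sorted(graph.outputs_of(task_label, init=True)),
    }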
461 def resolve(self, registry: Registry) -> None:
462 """Resolve all dimensions and dataset types and check them for
463 consistency.
465 Resolving a graph also causes it to be sorted.
467 Parameters
468 ----------
469 registry : `lsst.daf.butler.Registry`
470 Client for the data repository to resolve against.
472 Notes
473 -----
474 The `universe` attribute is set to ``registry.dimensions`` and used to
475 set all `TaskNode.dimensions` attributes. Dataset type nodes are
476 resolved by first looking for a registry definition, then using the
477 producing task's definition, then looking for consistency between all
478 consuming task definitions.
480 Raises
481 ------
482 ConnectionTypeConsistencyError
483 Raised if a prerequisite input for one task appears as a different
484 kind of connection in any other task.
485 DuplicateOutputError
486 Raised if multiple tasks have the same dataset type as an output.
487 IncompatibleDatasetTypeError
488 Raised if different tasks have different definitions of a dataset
489 type. Different but compatible storage classes are permitted.
490 MissingDatasetTypeError
491 Raised if a dataset type definition is required to exist in the
492 data repository but none was found. This should only occur for
493 dataset types that are not produced by a task in the pipeline and
494 are consumed with different storage classes or as components by
495 tasks in the pipeline.
499 """
500 node_key: NodeKey
501 updates: dict[NodeKey, TaskNode | DatasetTypeNode] = {}
502 for node_key, node_state in self._xgraph.nodes.items():
503 match node_key.node_type:
504 case NodeType.TASK:
505 task_node: TaskNode = node_state["instance"]
506 new_task_node = task_node._resolved(registry.dimensions)
507 if new_task_node is not task_node:
508 updates[node_key] = new_task_node
509 case NodeType.DATASET_TYPE:
510 dataset_type_node: DatasetTypeNode | None = node_state["instance"]
511 new_dataset_type_node = DatasetTypeNode._from_edges(
512 node_key, self._xgraph, registry, previous=dataset_type_node
513 )
514 # Usage of ``is`` here is intentional; `_from_edges` returns
515 # `previous=dataset_type_node` if it can determine that it
516 # doesn't need to change.
517 if new_dataset_type_node is not dataset_type_node:
518 updates[node_key] = new_dataset_type_node
519 try:
520 for node_key, node_value in updates.items():
521 self._xgraph.nodes[node_key]["instance"] = node_value
522 except Exception as err: # pragma: no cover
523 # There's no known way to get here, but we want to make it
524 # clear it's a big problem if we do.
525 raise PipelineGraphExceptionSafetyError(
526 "Error during dataset type resolution has left the graph in an inconsistent state."
527 ) from err
528 self.sort()
529 self._universe = registry.dimensions
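# --- Editorial usage sketch (not part of the original module): resolving
# against a data repository. Assumes a repository path and the real
# `lsst.daf.butler.Butler` client, whose `registry` attribute provides the
# `Registry` that `resolve` expects.
def resolve_with_butler(graph: PipelineGraph, repo: str) -> None:
    from lsst.daf.butler import Butler

    butler = Butler(repo)  # `repo` is a hypothetical repository path
    graph.resolve(butler.registry)  # resolving also sorts the graph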
531 ###########################################################################
532 #
533 # Graph Modification Interface:
534 #
535 # - methods to add, remove, and replace tasks;
536 #
537 # - methods to add and remove task subsets.
538 #
539 # These are all things that are usually done in a Pipeline before making a
540 # graph at all, but there may be cases where we want to modify the graph
541 # instead. (These are also the methods used to make a graph from a
542 # Pipeline, or make a graph from another graph.)
543 #
544 ###########################################################################
546 def add_task(
547 self,
548 label: str,
549 task_class: type[PipelineTask],
550 config: PipelineTaskConfig,
551 connections: PipelineTaskConnections | None = None,
552 ) -> TaskNode:
553 """Add a new task to the graph.
555 Parameters
556 ----------
557 label : `str`
558 Label for the task in the pipeline.
559 task_class : `type` [ `PipelineTask` ]
560 Class object for the task.
561 config : `PipelineTaskConfig`
562 Configuration for the task.
563 connections : `PipelineTaskConnections`, optional
564 Object that describes the dataset types used by the task. If not
565 provided, one will be constructed from the given configuration. If
566 provided, it is assumed that ``config`` has already been validated
567 and frozen.
569 Returns
570 -------
571 node : `TaskNode`
572 The new task node added to the graph.
574 Raises
575 ------
576 ValueError
577 Raised if configuration validation failed when constructing
578 ``connections``.
579 PipelineDataCycleError
580 Raised if the graph is cyclic after this addition.
581 RuntimeError
582 Raised if an unexpected exception (which will be chained) occurred
583 at a stage that may have left the graph in an inconsistent state.
584 Other exceptions should leave the graph unchanged.
586 Notes
587 -----
588 Checks for dataset type consistency and multiple producers do not occur
589 until `resolve` is called, since the resolution depends on both the
590 state of the data repository and all contributing tasks.
592 Adding new tasks removes any existing resolutions of all dataset types
593 it references and marks the graph as unsorted. It is most efficient
594 to add all tasks up front and only then resolve and/or sort the graph.
595 """
596 task_node = TaskNode._from_imported_data(
597 key=NodeKey(NodeType.TASK, label),
598 init_key=NodeKey(NodeType.TASK_INIT, label),
599 data=_TaskNodeImportedData.configure(label, task_class, config, connections),
600 universe=self.universe,
601 )
602 self.add_task_nodes([task_node])
603 return task_node
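# --- Editorial usage sketch (not part of the original module): populating an
# empty graph by hand, which the class docstring notes is the rare path
# (`Pipeline.to_graph` is typical). The caller supplies any `PipelineTask`
# subclass; its `ConfigClass` attribute provides a default configuration.
def build_single_task_graph(task_class: type[PipelineTask], label: str) -> PipelineGraph:
    graph = PipelineGraph(description="hand-built example")
    graph.add_task(label, task_class, task_class.ConfigClass())
    return graph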
605 def add_task_nodes(self, nodes: Iterable[TaskNode], parent: PipelineGraph | None = None) -> None:
606 """Add one or more existing task nodes to the graph.
608 Parameters
609 ----------
610 nodes : `~collections.abc.Iterable` [ `TaskNode` ]
611 Iterable of task nodes to add. If any tasks have resolved
612 dimensions, they must have the same dimension universe as the rest
613 of the graph.
614 parent : `PipelineGraph`, optional
615 If provided, another `PipelineGraph` from which these nodes were
616 obtained. Any dataset type nodes already present in ``parent``
617 that are referenced by the given tasks will be used in this graph
618 if they are not already present, preserving any dataset type
619 resolutions present in the parent graph. Adding nodes from a
620 parent graph after the graph has its own nodes (e.g. from
621 `add_task`) or nodes from a third graph may result in invalid
622 dataset type resolutions. It is safest to only use this argument
623 when populating an empty graph for the first time.
625 Raises
626 ------
627 PipelineDataCycleError
628 Raised if the graph is cyclic after this addition.
630 Notes
631 -----
632 Checks for dataset type consistency and multiple producers do not occur
633 until `resolve` is called, since the resolution depends on both the
634 state of the data repository and all contributing tasks.
636 Adding new tasks removes any existing resolutions of all dataset types
637 it references (unless ``parent is not None``) and marks the graph as
638 unsorted. It is most efficient to add all tasks up front and only then
639 resolve and/or sort the graph.
640 """
641 node_data: list[tuple[NodeKey, dict[str, Any]]] = []
642 edge_data: list[tuple[NodeKey, NodeKey, str, dict[str, Any]]] = []
643 for task_node in nodes:
644 task_node = task_node._resolved(self._universe)
645 node_data.append(
646 (task_node.key, {"instance": task_node, "bipartite": task_node.key.node_type.bipartite})
647 )
648 node_data.append(
649 (
650 task_node.init.key,
651 {"instance": task_node.init, "bipartite": task_node.init.key.node_type.bipartite},
652 )
653 )
654 # Convert the edge objects attached to the task node to networkx.
655 for read_edge in task_node.init.iter_all_inputs():
656 self._append_graph_data_from_edge(node_data, edge_data, read_edge, parent=parent)
657 for write_edge in task_node.init.iter_all_outputs():
658 self._append_graph_data_from_edge(node_data, edge_data, write_edge, parent=parent)
659 for read_edge in task_node.iter_all_inputs():
660 self._append_graph_data_from_edge(node_data, edge_data, read_edge, parent=parent)
661 for write_edge in task_node.iter_all_outputs():
662 self._append_graph_data_from_edge(node_data, edge_data, write_edge, parent=parent)
663 # Add a special edge (with no Edge instance) that connects the
664 # TaskInitNode to the runtime TaskNode.
665 edge_data.append((task_node.init.key, task_node.key, Edge.INIT_TO_TASK_NAME, {"instance": None}))
666 if not node_data and not edge_data:
667 return
668 # Checks and preparation complete; time to start the actual
669 # modification, during which it's hard to provide strong exception
670 # safety. Start by resetting the sort ordering, if there is one.
671 self._reset()
672 try:
673 self._xgraph.add_nodes_from(node_data)
674 self._xgraph.add_edges_from(edge_data)
675 if not networkx.algorithms.dag.is_directed_acyclic_graph(self._xgraph):
676 cycle = networkx.find_cycle(self._xgraph)
677 raise PipelineDataCycleError(f"Cycle detected while adding tasks: {cycle}.")
678 except Exception:
679 # First try to roll back our changes.
680 try:
681 self._xgraph.remove_edges_from(edge_data)
682 self._xgraph.remove_nodes_from(key for key, _ in node_data)
683 except Exception as err: # pragma: no cover
684 # There's no known way to get here, but we want to make it
685 # clear it's a big problem if we do.
686 raise PipelineGraphExceptionSafetyError(
687 "Error while attempting to revert PipelineGraph modification has left the graph in "
688 "an inconsistent state."
689 ) from err
690 # Successfully rolled back; raise the original exception.
691 raise
693 def reconfigure_tasks(
694 self,
695 *args: tuple[str, PipelineTaskConfig],
696 check_edges_unchanged: bool = False,
697 assume_edges_unchanged: bool = False,
698 **kwargs: PipelineTaskConfig,
699 ) -> None:
700 """Update the configuration for one or more tasks.
702 Parameters
703 ----------
704 *args : `tuple` [ `str`, `.PipelineTaskConfig` ]
705 Positional arguments are each a 2-tuple of task label and new
706 config object. Note that the same arguments may also be passed as
707 ``**kwargs``, which is usually more readable, but task labels in
708 ``*args`` are not required to be valid Python identifiers.
709 check_edges_unchanged : `bool`, optional
710 If `True`, require the edges (connections) of the modified tasks to
711 remain unchanged after the configuration updates, and verify that
712 this is the case.
713 assume_edges_unchanged : `bool`, optional
714 If `True`, the caller declares that the edges (connections) of the
715 modified tasks will remain unchanged after the configuration
716 updates, and that it is unnecessary to check this.
717 **kwargs : `.PipelineTaskConfig`
718 New config objects or overrides to apply to copies of the current
719 config objects, with task labels as the keywords.
721 Raises
722 ------
723 ValueError
724 Raised if ``assume_edges_unchanged`` and ``check_edges_unchanged``
725 are both `True`, or if the same task appears twice.
726 EdgesChangedError
727 Raised if ``check_edges_unchanged=True`` and the edges of a task do
728 change.
730 Notes
731 -----
732 If reconfiguring a task causes its edges to change, any dataset type
733 nodes connected to that task (not just those whose edges have changed!)
734 will be unresolved.
735 """
736 new_configs: dict[str, PipelineTaskConfig] = {}
737 for task_label, config_update in itertools.chain(args, kwargs.items()):
738 if new_configs.setdefault(task_label, config_update) is not config_update:
739 raise ValueError(f"Config for {task_label!r} provided more than once.")
740 updates = {
741 task_label: self.tasks[task_label]._reconfigured(config, rebuild=not assume_edges_unchanged)
742 for task_label, config in new_configs.items()
743 }
744 self._replace_task_nodes(
745 updates,
746 check_edges_unchanged=check_edges_unchanged,
747 assume_edges_unchanged=assume_edges_unchanged,
748 message_header=(
749 "Unexpected change in edges for task {task_label!r} from original config (A) to "
750 "new configs (B):"
751 ),
752 )
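# --- Editorial usage sketch (not part of the original module): replacing one
# task's configuration with a fresh default-constructed config of the same
# type. Real callers would set options on `new_config` first; the keyword form
# shown requires `label` to be a valid Python identifier.
def reset_task_config(graph: PipelineGraph, label: str) -> None:
    new_config = type(graph.tasks[label].config)()
    graph.reconfigure_tasks(**{label: new_config})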
754 def remove_tasks(
755 self, labels: Iterable[str], drop_from_subsets: bool = True
756 ) -> list[tuple[TaskNode, set[str]]]:
757 """Remove one or more tasks from the graph.
759 Parameters
760 ----------
761 labels : `~collections.abc.Iterable` [ `str` ]
762 Iterable of the labels of the tasks to remove.
763 drop_from_subsets : `bool`, optional
764 If `True`, drop each removed task from any subset in which it
765 currently appears. If `False`, raise `PipelineGraphError` if any
766 such subsets exist.
768 Returns
769 -------
770 nodes_and_subsets : `list` [ `tuple` [ `TaskNode`, `set` [ `str` ] ] ]
771 List of nodes removed and the labels of task subsets that
772 referenced them.
774 Raises
775 ------
776 PipelineGraphError
777 Raised if ``drop_from_subsets`` is `False` and the task is still
778 part of one or more subsets.
780 Notes
781 -----
782 Removing a task will cause dataset nodes with no other referencing
783 tasks to be removed. Any other dataset type nodes referenced by a
784 removed task will be reset to an "unresolved" state.
785 """
786 task_nodes_and_subsets = []
787 dataset_types: set[NodeKey] = set()
788 nodes_to_remove = set()
789 for label in labels:
790 task_node: TaskNode = self._xgraph.nodes[NodeKey(NodeType.TASK, label)]["instance"]
791 # Find task subsets that reference this task.
792 referencing_subsets = {
793 subset_label
794 for subset_label, task_subset in self.task_subsets.items()
795 if label in task_subset
796 }
797 if not drop_from_subsets and referencing_subsets:
798 raise PipelineGraphError(
799 f"Task {label!r} is still referenced by subset(s) {referencing_subsets}."
800 )
801 task_nodes_and_subsets.append((task_node, referencing_subsets))
802 # Find dataset types referenced by this task.
803 dataset_types.update(self._xgraph.predecessors(task_node.key))
804 dataset_types.update(self._xgraph.successors(task_node.key))
805 dataset_types.update(self._xgraph.predecessors(task_node.init.key))
806 dataset_types.update(self._xgraph.successors(task_node.init.key))
807 # Since there's an edge between the task and its init node, we'll
808 # have added those two nodes here, too, and we don't want that.
809 dataset_types.remove(task_node.init.key)
810 dataset_types.remove(task_node.key)
811 # Mark the task node and its init node for removal from the graph.
812 nodes_to_remove.add(task_node.key)
813 nodes_to_remove.add(task_node.init.key)
814 # Process the referenced datasets to see which ones are orphaned and
815 # need to be removed vs. just unresolved.
816 nodes_to_unresolve = []
817 for dataset_type_key in dataset_types:
818 related_tasks = set()
819 related_tasks.update(self._xgraph.predecessors(dataset_type_key))
820 related_tasks.update(self._xgraph.successors(dataset_type_key))
821 related_tasks.difference_update(nodes_to_remove)
822 if not related_tasks:
823 nodes_to_remove.add(dataset_type_key)
824 else:
825 nodes_to_unresolve.append(dataset_type_key)
826 # Checks and preparation complete; time to start the actual
827 # modification, during which it's hard to provide strong exception
828 # safety. Start by resetting the sort ordering.
829 self._reset()
830 try:
831 for dataset_type_key in nodes_to_unresolve:
832 self._xgraph.nodes[dataset_type_key]["instance"] = None
833 for task_node, referencing_subsets in task_nodes_and_subsets:
834 for subset_label in referencing_subsets:
835 self._task_subsets[subset_label].remove(task_node.label)
836 self._xgraph.remove_nodes_from(nodes_to_remove)
837 except Exception as err: # pragma: no cover
838 # There's no known way to get here, but we want to make it
839 # clear it's a big problem if we do.
840 raise PipelineGraphExceptionSafetyError(
841 "Error during task removal has left the graph in an inconsistent state."
842 ) from err
843 return task_nodes_and_subsets
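# --- Editorial usage sketch (not part of the original module): removing tasks
# and reporting which labeled subsets referenced them.
def remove_and_report(graph: PipelineGraph, labels: Iterable[str]) -> None:
    for task_node, subset_labels in graph.remove_tasks(labels, drop_from_subsets=True):
        print(f"Removed {task_node.label!r}; was in subsets {sorted(subset_labels)}.")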
845 def add_task_subset(self, subset_label: str, task_labels: Iterable[str], description: str = "") -> None:
846 """Add a label for a set of tasks that are already in the pipeline.
848 Parameters
849 ----------
850 subset_label : `str`
851 Label for this set of tasks.
852 task_labels : `~collections.abc.Iterable` [ `str` ]
853 Labels of the tasks to include in the set. All must already be
854 included in the graph.
855 description : `str`, optional
856 String description to associate with this label.
857 """
858 subset = TaskSubset(self._xgraph, subset_label, set(task_labels), description)
859 self._task_subsets[subset_label] = subset
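# --- Editorial usage sketch (not part of the original module): labeling a
# subset of tasks already in the graph. The subset and task labels here are
# hypothetical placeholders.
def label_early_steps(graph: PipelineGraph) -> None:
    graph.add_task_subset("step1", ["isr", "characterizeImage"], "early single-frame steps")
    assert "isr" in graph.task_subsets["step1"]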
861 def remove_task_subset(self, subset_label: str) -> None:
862 """Remove a labeled set of tasks."""
863 del self._task_subsets[subset_label]
865 ###########################################################################
866 #
867 # NetworkX Export Interface:
868 #
869 # - methods to export the PipelineGraph's content (or various subsets
870 # thereof) as NetworkX objects.
871 #
872 # These are particularly useful when writing tools to visualize the graph,
873 # while providing options for which aspects of the graph (tasks, dataset
874 # types, or both) to include, since all exported graphs have similar
875 # attributes regardless of their structure.
876 #
877 ###########################################################################
879 def make_xgraph(self) -> networkx.MultiDiGraph:
880 """Export a networkx representation of the full pipeline graph,
881 including both init and runtime edges.
883 Returns
884 -------
885 xgraph : `networkx.MultiDiGraph`
886 Directed acyclic graph with parallel edges.
888 Notes
889 -----
890 The returned graph uses `NodeKey` instances for nodes. Parallel edges
891 represent the same dataset type appearing in multiple connections for
892 the same task, and are hence rare. The connection name is used as the
893 edge key to disambiguate those parallel edges.
895 Almost all edges connect dataset type nodes to task or task init nodes
896 or vice versa, but there is also a special edge that connects each task
897 init node to its runtime node. The existence of these edges makes the
898 graph not quite bipartite, though its init-only and runtime-only
899 subgraphs are bipartite.
901 See `TaskNode`, `TaskInitNode`, `DatasetTypeNode`, `ReadEdge`, and
902 `WriteEdge` for the descriptive node and edge attributes added.
903 """
904 return self._transform_xgraph_state(self._xgraph.copy(), skip_edges=False)
906 def make_bipartite_xgraph(self, init: bool = False) -> networkx.MultiDiGraph:
907 """Return a bipartite networkx representation of just the runtime or
908 init-time pipeline graph.
910 Parameters
911 ----------
912 init : `bool`, optional
913 If `True` (`False` is default) return the graph of task
914 initialization nodes and init input/output dataset types, instead
915 of the graph of runtime task nodes and regular
916 input/output/prerequisite dataset types.
918 Returns
919 -------
920 xgraph : `networkx.MultiDiGraph`
921 Directed acyclic graph with parallel edges.
923 Notes
924 -----
925 The returned graph uses `NodeKey` instances for nodes. Parallel edges
926 represent the same dataset type appearing in multiple connections for
927 the same task, and are hence rare. The connection name is used as the
928 edge key to disambiguate those parallel edges.
930 This graph is bipartite because each dataset type node only has edges
931 that connect it to a task [init] node, and vice versa.
933 See `TaskNode`, `TaskInitNode`, `DatasetTypeNode`, `ReadEdge`, and
934 `WriteEdge` for the descriptive node and edge attributes added.
935 """
936 return self._transform_xgraph_state(
937 self._make_bipartite_xgraph_internal(init).copy(), skip_edges=False
938 )
940 def make_task_xgraph(self, init: bool = False) -> networkx.DiGraph:
941 """Return a networkx representation of just the tasks in the pipeline.
943 Parameters
944 ----------
945 init : `bool`, optional
946 If `True` (`False` is default) return the graph of task
947 initialization nodes, instead of the graph of runtime task nodes.
949 Returns
950 -------
951 xgraph : `networkx.DiGraph`
952 Directed acyclic graph with no parallel edges.
954 Notes
955 -----
956 The returned graph uses `NodeKey` instances for nodes. The dataset
957 types that link these tasks are not represented at all; edges have no
958 attributes, and there are no parallel edges.
960 See `TaskNode` and `TaskInitNode` for the descriptive node
961 attributes added.
962 """
963 bipartite_xgraph = self._make_bipartite_xgraph_internal(init)
964 task_keys = [
965 key
966 for key, bipartite in bipartite_xgraph.nodes(data="bipartite")
967 if bipartite == NodeType.TASK.bipartite
968 ]
969 return self._transform_xgraph_state(
970 networkx.algorithms.bipartite.projected_graph(networkx.DiGraph(bipartite_xgraph), task_keys),
971 skip_edges=True,
972 )
974 def make_dataset_type_xgraph(self, init: bool = False) -> networkx.DiGraph:
975 """Return a networkx representation of just the dataset types in the
976 pipeline.
978 Parameters
979 ----------
980 init : `bool`, optional
981 If `True` (`False` is default) return the graph of init input and
982 output dataset types, instead of the graph of runtime (input,
983 output, prerequisite input) dataset types.
985 Returns
986 -------
987 xgraph : `networkx.DiGraph`
988 Directed acyclic graph with no parallel edges.
990 Notes
991 -----
992 The returned graph uses `NodeKey` instances for nodes. The tasks that
993 link these dataset types are not represented at all; edges have no attributes,
994 and there are no parallel edges.
996 See `DatasetTypeNode` for the descriptive node attributes added.
997 """
998 bipartite_xgraph = self._make_bipartite_xgraph_internal(init)
999 dataset_type_keys = [
1000 key
1001 for key, bipartite in bipartite_xgraph.nodes(data="bipartite")
1002 if bipartite == NodeType.DATASET_TYPE.bipartite
1003 ]
1004 return self._transform_xgraph_state(
1005 networkx.algorithms.bipartite.projected_graph(
1006 networkx.DiGraph(bipartite_xgraph), dataset_type_keys
1007 ),
1008 skip_edges=True,
1009 )
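# --- Editorial usage sketch (not part of the original module): consuming the
# exported networkx graphs. Nodes are `NodeKey` instances, whose `name` field
# holds the task label or parent dataset type name.
def print_task_order(graph: PipelineGraph) -> None:
    task_xgraph = graph.make_task_xgraph()
    for key in networkx.lexicographical_topological_sort(task_xgraph):
        print(key.name)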
1011 ###########################################################################
1012 #
1013 # Serialization Interface.
1014 #
1015 # Serialization of PipelineGraphs is currently experimental and may not be
1016 # retained in the future. All serialization methods are
1017 # underscore-prefixed to ensure nobody mistakes them for a stable interface
1018 # (let alone a stable file format).
1019 #
1020 ###########################################################################
1022 @classmethod
1023 def _read_stream(
1024 cls, stream: BinaryIO, import_mode: TaskImportMode = TaskImportMode.REQUIRE_CONSISTENT_EDGES
1025 ) -> PipelineGraph:
1026 """Read a serialized `PipelineGraph` from a file-like object.
1028 Parameters
1029 ----------
1030 stream : `BinaryIO`
1031 File-like object opened for binary reading, containing
1032 gzip-compressed JSON.
1033 import_mode : `TaskImportMode`, optional
1034 Whether to import tasks, and how to reconcile any differences
1035 between the imported task's connections and those that were
1036 persisted with the graph. Default is to check that they are the
1037 same.
1039 Returns
1040 -------
1041 graph : `PipelineGraph`
1042 Deserialized pipeline graph.
1044 Raises
1045 ------
1046 PipelineGraphReadError
1047 Raised if the serialized `PipelineGraph` is not self-consistent.
1048 EdgesChangedError
1049 Raised if ``import_mode`` is
1050 `TaskImportMode.REQUIRE_CONSISTENT_EDGES` and the edges of a task
1051 did change after import and reconfiguration.
1053 Notes
1054 -----
1055 `PipelineGraph` serialization is currently experimental and may be
1056 removed or significantly changed in the future, with no deprecation
1057 period.
1058 """
1059 from .io import SerializedPipelineGraph
1061 with gzip.open(stream, "rb") as uncompressed_stream:
1062 data = json.load(uncompressed_stream)
1063 serialized_graph = SerializedPipelineGraph.parse_obj(data)
1064 return serialized_graph.deserialize(import_mode)
1066 @classmethod
1067 def _read_uri(
1068 cls,
1069 uri: ResourcePathExpression,
1070 import_mode: TaskImportMode = TaskImportMode.REQUIRE_CONSISTENT_EDGES,
1071 ) -> PipelineGraph:
1072 """Read a serialized `PipelineGraph` from a file at a URI.
1074 Parameters
1075 ----------
1076 uri : convertible to `lsst.resources.ResourcePath`
1077 URI to a gzip-compressed JSON file containing a serialized pipeline
1078 graph.
1079 import_mode : `TaskImportMode`, optional
1080 Whether to import tasks, and how to reconcile any differences
1081 between the imported task's connections and those that were
1082 persisted with the graph. Default is to check that they are the
1083 same.
1085 Returns
1086 -------
1087 graph : `PipelineGraph`
1088 Deserialized pipeline graph.
1090 Raises
1091 ------
1092 PipelineGraphReadError
1093 Raised if the serialized `PipelineGraph` is not self-consistent.
1094 EdgesChangedError
1095 Raised if ``import_mode`` is
1096 `TaskImportMode.REQUIRE_CONSISTENT_EDGES` and the edges of a task
1097 did change after import and reconfiguration.
1099 Notes
1100 -----
1101 `PipelineGraph` serialization is currently experimental and may be
1102 removed or significantly changed in the future, with no deprecation
1103 period.
1104 """
1105 uri = ResourcePath(uri)
1106 with uri.open("rb") as stream:
1107 return cls._read_stream(cast(BinaryIO, stream), import_mode=import_mode)
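# --- Editorial usage sketch (not part of the original module): round-tripping
# a graph through the experimental, underscore-prefixed serialization methods.
# The destination path is a hypothetical placeholder, and the format may
# change without deprecation.
def roundtrip(graph: PipelineGraph, uri: str = "/tmp/pipeline_graph.json.gz") -> PipelineGraph:
    graph._write_uri(uri)
    return PipelineGraph._read_uri(uri)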
1109 def _write_stream(self, stream: BinaryIO) -> None:
1110 """Write the pipeline to a file-like object.
1112 Parameters
1113 ----------
1114 stream : `BinaryIO`
1115 File-like object opened for binary writing.
1117 Notes
1118 -----
1119 `PipelineGraph` serialization is currently experimental and may be
1120 removed or significantly changed in the future, with no deprecation
1121 period.
1123 The file format is gzipped JSON, and is intended to be human-readable,
1124 but it should not be considered a stable public interface for outside
1125 code, which should always use `PipelineGraph` methods (or at least the
1126 `io.SerializedPipelineGraph` class) to read these files.
1127 """
1128 from .io import SerializedPipelineGraph
1130 with gzip.open(stream, mode="wb") as compressed_stream:
1131 compressed_stream.write(
1132 SerializedPipelineGraph.serialize(self).json(exclude_defaults=True).encode("utf-8")
1133 )
1135 def _write_uri(self, uri: ResourcePathExpression) -> None:
1136 """Write the pipeline to a file given a URI.
1138 Parameters
1139 ----------
1140 uri : convertible to `lsst.resources.ResourcePath`
1141 URI to write to. May have ``.json.gz`` or no extension (which
1142 will cause a ``.json.gz`` extension to be added).
1144 Notes
1145 -----
1146 `PipelineGraph` serialization is currently experimental and may be
1147 removed or significantly changed in the future, with no deprecation
1148 period.
1150 The file format is gzipped JSON, and is intended to be human-readable,
1151 but it should not be considered a stable public interface for outside
1152 code, which should always use `PipelineGraph` methods (or at least the
1153 `io.SerializedPipelineGraph` class) to read these files.
1154 """
1155 uri = ResourcePath(uri)
1156 extension = uri.getExtension()
1157 if not extension:
1158 uri = uri.updatedExtension(".json.gz")
1159 elif extension != ".json.gz":
1160 raise ValueError("Expanded pipeline files should always have a .json.gz extension.")
1161 with uri.open(mode="wb") as stream:
1162 self._write_stream(cast(BinaryIO, stream))
1164 def _import_and_configure(
1165 self, import_mode: TaskImportMode = TaskImportMode.REQUIRE_CONSISTENT_EDGES
1166 ) -> None:
1167 """Import the `PipelineTask` classes referenced by all task nodes and
1168 update those nodes accordingly.
1170 Parameters
1171 ----------
1172 import_mode : `TaskImportMode`, optional
1173 Whether to import tasks, and how to reconcile any differences
1174 between the imported task's connections and those that were
1175 persisted with the graph. Default is to check that they are the
1176 same. This method does nothing if this is
1177 `TaskImportMode.DO_NOT_IMPORT`.
1179 Raises
1180 ------
1181 EdgesChangedError
1182 Raised if ``import_mode`` is
1183 `TaskImportMode.REQUIRE_CONSISTENT_EDGES` and the edges of a task
1184 did change after import and reconfiguration.
1186 Notes
1187 -----
1188 This method shouldn't need to be called unless the graph was
1189 deserialized without importing and configuring immediately, which is
1190 not the default behavior (but it can greatly speed up deserialization).
1191 If all tasks have already been imported this does nothing.
1193 Importing and configuring a task can change its
1194 `~TaskNode.task_class_name` or `~TaskNode.get_config_str` output,
1195 usually because the software used to read a serialized graph is newer
1196 than the software used to write it (e.g. a new config option has been
1197 added, or the task was moved to a new module with a forwarding alias
1198 left behind). These changes are allowed by
1199 `TaskImportMode.REQUIRE_CONSISTENT_EDGES`.
1201 If importing and configuring a task causes its edges to change, any
1202 dataset type nodes linked to those edges will be reset to the
1203 unresolved state.
1204 """
1205 if import_mode is TaskImportMode.DO_NOT_IMPORT:
1206 return
1207 rebuild = (
1208 import_mode is TaskImportMode.REQUIRE_CONSISTENT_EDGES
1209 or import_mode is TaskImportMode.OVERRIDE_EDGES
1210 )
1211 updates: dict[str, TaskNode] = {}
1212 node_key: NodeKey
1213 for node_key, node_state in self._xgraph.nodes.items():
1214 if node_key.node_type is NodeType.TASK:
1215 task_node: TaskNode = node_state["instance"]
1216 new_task_node = task_node._imported_and_configured(rebuild)
1217 if new_task_node is not task_node:
1218 updates[task_node.label] = new_task_node
1219 self._replace_task_nodes(
1220 updates,
1221 check_edges_unchanged=(import_mode is TaskImportMode.REQUIRE_CONSISTENT_EDGES),
1222 assume_edges_unchanged=(import_mode is TaskImportMode.ASSUME_CONSISTENT_EDGES),
1223 message_header=(
1224 "In task with label {task_label!r}, persisted edges (A)"
1225 "differ from imported and configured edges (B):"
1226 ),
1227 )
1229 ###########################################################################
1230 #
1231 # Advanced PipelineGraph Inspection Interface:
1232 #
1233 # - methods to iterate over all nodes and edges, utilizing NodeKeys;
1234 #
1235 # - methods to find overall inputs and group nodes by their dimensions,
1236 # which are important operations for QuantumGraph generation.
1237 #
1238 ###########################################################################
1240 def iter_edges(self, init: bool = False) -> Iterator[Edge]:
1241 """Iterate over edges in the graph.
1243 Parameters
1244 ----------
1245 init : `bool`, optional
1246 If `True` (`False` is default) iterate over the edges between task
1247 initialization node and init input/output dataset types, instead of
1248 the runtime task nodes and regular input/output/prerequisite
1249 dataset types.
1251 Returns
1252 -------
1253 edges : `~collections.abc.Iterator` [ `Edge` ]
1254 A lazy iterator over `Edge` (`WriteEdge` or `ReadEdge`) instances.
1256 Notes
1257 -----
1258 This method always returns *either* init edges or runtime edges, never
1259 both. The full (internal) graph that contains both also includes a
1260 special edge that connects each task init node to its runtime node;
1261 that is also never returned by this method, since it is never a part of
1262 the init-only or runtime-only subgraphs.
1263 """
1264 edge: Edge
1265 for _, _, edge in self._xgraph.edges(data="instance"):
1266 if edge is not None and edge.is_init == init:
1267 yield edge
1269 def iter_nodes(
1270 self,
1271 ) -> Iterator[
1272 tuple[Literal[NodeType.TASK_INIT], str, TaskInitNode]
1273 | tuple[Literal[NodeType.TASK], str, TaskNode]
1274 | tuple[Literal[NodeType.DATASET_TYPE], str, DatasetTypeNode | None]
1275 ]:
1276 """Iterate over nodes in the graph.
1278 Returns
1279 -------
1280 nodes : `~collections.abc.Iterator` [ `tuple` ]
1281 A lazy iterator over all of the nodes in the graph. Each yielded
1282 element is a tuple of:
1284 - the node type enum value (`NodeType`);
1285 - the string name for the node (task label or parent dataset type
1286 name);
1287 - the node value (`TaskNode`, `TaskInitNode`, `DatasetTypeNode`,
1288 or `None` for dataset type nodes that have not been resolved).
1289 """
1290 key: NodeKey
1291 if self._sorted_keys is not None:
1292 for key in self._sorted_keys:
1293 yield key.node_type, key.name, self._xgraph.nodes[key]["instance"] # type: ignore
1294 else:
1295 for key, node in self._xgraph.nodes(data="instance"):
1296 yield key.node_type, key.name, node # type: ignore
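# --- Editorial usage sketch (not part of the original module): walking every
# node with the advanced interface; unresolved dataset types appear as `None`.
def count_unresolved(graph: PipelineGraph) -> int:
    return sum(
        1
        for node_type, _, value in graph.iter_nodes()
        if node_type is NodeType.DATASET_TYPE and value is None
    )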
1298 def iter_overall_inputs(self) -> Iterator[tuple[str, DatasetTypeNode | None]]:
1299 """Iterate over all of the dataset types that are consumed but not
1300 produced by the graph.
1302 Returns
1303 -------
1304 dataset_types : `~collections.abc.Iterator` [ `tuple` ]
1305 A lazy iterator over the overall-input dataset types (including
1306 overall init inputs and prerequisites). Each yielded element is a
1307 tuple of:
1309 - the parent dataset type name;
1310 - the resolved `DatasetTypeNode`, or `None` if the dataset type has
1311 not been resolved.
1312 """
1313 for generation in networkx.algorithms.dag.topological_generations(self._xgraph):
1314 key: NodeKey
1315 for key in generation:
1316 # While we expect all tasks to have at least one input and
1317 # hence never appear in the first topological generation, that
1318 # is not true of task init nodes.
1319 if key.node_type is NodeType.DATASET_TYPE:
1320 yield key.name, self._xgraph.nodes[key]["instance"]
1321 return
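# --- Editorial usage sketch (not part of the original module): collecting the
# overall-input dataset type names, e.g. to check what must already exist in a
# data repository before the pipeline can run.
def overall_input_names(graph: PipelineGraph) -> list[str]:
    return [name for name, _ in graph.iter_overall_inputs()]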
1323 def group_by_dimensions(
1324 self, prerequisites: bool = False
1325 ) -> dict[DimensionGraph, tuple[dict[str, TaskNode], dict[str, DatasetTypeNode]]]:
1326 """Group this graph's tasks and dataset types by their dimensions.
1328 Parameters
1329 ----------
1330 prerequisites : `bool`, optional
1331 If `True`, include prerequisite dataset types as well as regular
1332 input and output datasets (including intermediates).
1334 Returns
1335 -------
1336 groups : `dict` [ `DimensionGraph`, `tuple` ]
1337 A dictionary of groups keyed by `DimensionGraph`, in which each
1338 value is a tuple of:
1340 - a `dict` of `TaskNode` instances, keyed by task label
1341 - a `dict` of `DatasetTypeNode` instances, keyed by
1342 dataset type name,
1344 that have those dimensions.
1346 Notes
1347 -----
1348 Init inputs and outputs are always included, but always have empty
1349 dimensions and hence are all grouped together.
1350 """
1351 result: dict[DimensionGraph, tuple[dict[str, TaskNode], dict[str, DatasetTypeNode]]] = {}
1352 next_new_value: tuple[dict[str, TaskNode], dict[str, DatasetTypeNode]] = ({}, {})
1353 for task_label, task_node in self.tasks.items():
1354 if task_node.dimensions is None:
1355 raise UnresolvedGraphError(f"Task with label {task_label!r} has not been resolved.")
1356 if (group := result.setdefault(task_node.dimensions, next_new_value)) is next_new_value:
1357 next_new_value = ({}, {}) # make new dicts for next time
1358 group[0][task_node.label] = task_node
1359 for dataset_type_name, dataset_type_node in self.dataset_types.items():
1360 if dataset_type_node is None:
1361 raise UnresolvedGraphError(f"Dataset type {dataset_type_name!r} has not been resolved.")
1362 if not dataset_type_node.is_prerequisite or prerequisites:
1363 if (
1364 group := result.setdefault(dataset_type_node.dataset_type.dimensions, next_new_value)
1365 ) is next_new_value:
1366 next_new_value = ({}, {}) # make new dicts for next time
1367 group[1][dataset_type_node.name] = dataset_type_node
1368 return result
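# --- Editorial usage sketch (not part of the original module): a census of a
# resolved graph, counting tasks and dataset types per dimension set. Raises
# `UnresolvedGraphError` (as documented above) if the graph is unresolved.
def dimension_census(graph: PipelineGraph) -> None:
    for dimensions, (tasks, dataset_types) in graph.group_by_dimensions().items():
        print(f"{dimensions}: {len(tasks)} task(s), {len(dataset_types)} dataset type(s)")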
1370 def split_independent(self) -> Iterable[PipelineGraph]:
1371 """Iterate over independent subgraphs that together comprise this
1372 pipeline graph.
1374 Returns
1375 -------
1376 subgraphs : `Iterable` [ `PipelineGraph` ]
1377 An iterable over component subgraphs that could be run
1378 independently (they have only overall inputs in common). May be a
1379 lazy iterator.
1381 Notes
1382 -----
1383 All resolved dataset type nodes will be preserved.
1385 If there is only one component, ``self`` may be returned as the only
1386 element in the iterable.
1388 If `has_been_sorted`, all subgraphs will be sorted as well.
1389 """
1390 # Having an overall input in common isn't enough to make subgraphs
1391 # dependent on each other, so we want to look for connected component
1392 # subgraphs of the task-only projected graph.
1393 bipartite_xgraph = self._make_bipartite_xgraph_internal(init=False)
1394 task_keys = {
1395 key
1396 for key, bipartite in bipartite_xgraph.nodes(data="bipartite")
1397 if bipartite == NodeType.TASK.bipartite
1398 }
1399 task_xgraph = networkx.algorithms.bipartite.projected_graph(
1400 networkx.DiGraph(bipartite_xgraph), task_keys
1401 )
1402 # "Weakly" connected means connected in only one direction, which is
1403 # the only kind of "connected" a DAG can ever be.
1404 for component_task_keys in networkx.algorithms.weakly_connected_components(task_xgraph):
1405 if component_task_keys == task_keys:
1406 yield self
1407 return
1408 else:
1409 component_subgraph = PipelineGraph(universe=self._universe)
1410 component_subgraph.add_task_nodes(
1411 [self._xgraph.nodes[key]["instance"] for key in component_task_keys], parent=self
1412 )
1413 if self.has_been_sorted:
1414 component_subgraph.sort()
1415 yield component_subgraph
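# --- Editorial usage sketch (not part of the original module): enumerating the
# independently runnable components of a pipeline graph.
def describe_components(graph: PipelineGraph) -> None:
    for i, component in enumerate(graph.split_independent()):
        print(f"Component {i}: tasks {list(component.tasks)}")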
1417 ###########################################################################
1418 #
1419 # Class- and Package-Private Methods.
1420 #
1421 ###########################################################################
1423 def _iter_task_defs(self) -> Iterator[TaskDef]:
1424 """Iterate over this pipeline as a sequence of `TaskDef` instances.
1426 Notes
1427 -----
1428 This is a package-private method intended to aid in the transition to a
1429 codebase more fully integrated with the `PipelineGraph` class, in which
1430 both `TaskDef` and `PipelineDatasetTypes` are expected to go away, and
1431 much of the functionality on the `Pipeline` class will be moved to
1432 `PipelineGraph` as well.
1434 Raises
1435 ------
1436 TaskNotImportedError
1437 Raised if `TaskNode.is_imported` is `False` for any task.
1438 """
1439 from ..pipeline import TaskDef
1441 for node in self._tasks.values():
1442 yield TaskDef(
1443 config=node.config,
1444 taskClass=node.task_class,
1445 label=node.label,
1446 connections=node._get_imported_data().connections,
1447 )
1449 def _init_from_args(
1450 self,
1451 xgraph: networkx.MultiDiGraph | None,
1452 sorted_keys: Sequence[NodeKey] | None,
1453 task_subsets: dict[str, TaskSubset] | None,
1454 description: str,
1455 universe: DimensionUniverse | None,
1456 data_id: DataId | None,
1457 ) -> None:
1458 """Initialize the graph with possibly-nontrivial arguments.
1460 Parameters
1461 ----------
1462 xgraph : `networkx.MultiDiGraph` or `None`
1463 The backing networkx graph, or `None` to create an empty one.
1464 This graph has `NodeKey` instances for nodes and the same structure
1465 as the graph exported by `make_xgraph`, but its nodes and edges
1466 have a single ``instance`` attribute that holds a `TaskNode`,
1467 `TaskInitNode`, `DatasetTypeNode` (or `None`), `ReadEdge`, or
1468 `WriteEdge` instance.
1469 sorted_keys : `Sequence` [ `NodeKey` ] or `None`
1470 Topologically sorted sequence of node keys, or `None` if the graph
1471 is not sorted.
1472 task_subsets : `dict` [ `str`, `TaskSubset` ] or `None`
1473 Labeled subsets of tasks. Values must be constructed with
1474 ``xgraph`` as their parent graph.
1475 description : `str`
1476 String description for this pipeline.
1477 universe : `lsst.daf.butler.DimensionUniverse` or `None`
1478 Definitions of all dimensions.
1479 data_id : `lsst.daf.butler.DataCoordinate` or other data ID mapping.
1480 Data ID that represents a constraint on all quanta generated from
1481 this pipeline.
1483 Notes
1484 -----
1485 Only empty `PipelineGraph` instances should be constructed directly by
1486 users, which sets the signature of ``__init__`` itself, but methods on
1487 `PipelineGraph` and its helper classes need to be able to create them
1488 with state. Those methods can call this after calling ``__new__``
1489 manually, skipping ``__init__``.
1490 """
1491 self._xgraph = xgraph if xgraph is not None else networkx.MultiDiGraph()
1492 self._sorted_keys: Sequence[NodeKey] | None = None
1493 self._task_subsets = task_subsets if task_subsets is not None else {}
1494 self._description = description
1495 self._tasks = TaskMappingView(self._xgraph)
1496 self._dataset_types = DatasetTypeMappingView(self._xgraph)
1497 self._raw_data_id: dict[str, Any]
1498 if isinstance(data_id, DataCoordinate):
1499 if universe is None:
1500 universe = data_id.universe
1501 else:
1502 assert universe is data_id.universe, "data_id.universe and given universe differ"
1503 self._raw_data_id = data_id.byName()
1504 elif data_id is None:
1505 self._raw_data_id = {}
1506 else:
1507 self._raw_data_id = dict(data_id)
1508 self._universe = universe
1509 if sorted_keys is not None:
1510 self._reorder(sorted_keys)
    def _make_bipartite_xgraph_internal(self, init: bool) -> networkx.MultiDiGraph:
        """Make a bipartite init-only or runtime-only internal subgraph.

        See `make_bipartite_xgraph` for parameters and return values.

        Notes
        -----
        This method returns a view of the `PipelineGraph` object's internal
        backing graph, and hence should only be called in methods that copy
        the result either explicitly or by running a copying algorithm before
        returning it to the user.
        """
        return self._xgraph.edge_subgraph([edge.key for edge in self.iter_edges(init)])
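    # Illustrative sketch (not part of this module): a public wrapper
    # consistent with the Notes above copies the edge-subgraph view before
    # exposing it.
    #
    #     view = self._make_bipartite_xgraph_internal(init)
    #     exported = self._transform_xgraph_state(view.copy(), skip_edges=False)
    #
    # ``view`` aliases the internal backing graph and must never escape;
    # ``exported`` is an independent copy whose node and edge attributes have
    # been expanded into the documented exported form.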
    def _transform_xgraph_state(self, xgraph: _G, skip_edges: bool) -> _G:
        """Transform networkx graph attributes in-place from the internal
        "instance" attributes to the documented exported attributes.

        Parameters
        ----------
        xgraph : `networkx.DiGraph` or `networkx.MultiDiGraph`
            Graph whose state should be transformed.
        skip_edges : `bool`
            If `True`, do not transform edge state.

        Returns
        -------
        xgraph : `networkx.DiGraph` or `networkx.MultiDiGraph`
            The same object passed in, after modification.

        Notes
        -----
        This should be called after making a copy of the internal graph but
        before any projection down to just task or dataset type nodes, since
        it assumes stateful edges.
        """
        state: dict[str, Any]
        for state in xgraph.nodes.values():
            node_value: TaskInitNode | TaskNode | DatasetTypeNode | None = state.pop("instance")
            if node_value is not None:
                state.update(node_value._to_xgraph_state())
        if not skip_edges:
            for _, _, state in xgraph.edges(data=True):
                edge: Edge | None = state.pop("instance", None)
                if edge is not None:
                    state.update(edge._to_xgraph_state())
        return xgraph
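    # Illustrative sketch (not part of this module): the ordering the Notes
    # above require: copy, then transform while edges still carry state, then
    # project down to one side of the bipartite graph.  ``NodeType.TASK`` is
    # assumed here to provide the task-side ``bipartite`` value, mirroring
    # the dataset-type side set in `_append_graph_data_from_edge` below.
    #
    #     xgraph = self._make_bipartite_xgraph_internal(init=False).copy()
    #     xgraph = self._transform_xgraph_state(xgraph, skip_edges=False)
    #     task_keys = [
    #         key
    #         for key, side in xgraph.nodes(data="bipartite")
    #         if side == NodeType.TASK.bipartite
    #     ]
    #     task_only = networkx.algorithms.bipartite.projected_graph(
    #         networkx.DiGraph(xgraph), task_keys
    #     )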
    def _replace_task_nodes(
        self,
        updates: Mapping[str, TaskNode],
        check_edges_unchanged: bool,
        assume_edges_unchanged: bool,
        message_header: str,
    ) -> None:
        """Replace task nodes and update edges and dataset type nodes
        accordingly.

        Parameters
        ----------
        updates : `Mapping` [ `str`, `TaskNode` ]
            New task nodes with task label keys.  All keys must be task
            labels that are already present in the graph.
        check_edges_unchanged : `bool`, optional
            If `True`, require the edges (connections) of the modified tasks
            to remain unchanged after importing and configuring each task,
            and verify that this is the case.
        assume_edges_unchanged : `bool`, optional
            If `True`, the caller declares that the edges (connections) of
            the modified tasks will remain unchanged after importing and
            configuring each task, and that it is unnecessary to check this.
        message_header : `str`
            Template for `str.format` with a single ``task_label``
            placeholder to use as the first line in `EdgesChangedError`
            messages that show the differences between new task edges and
            old task edges.  Should include the fact that the rest of the
            message will refer to the old task as "A" and the new task as
            "B", and end with a colon.

        Raises
        ------
        ValueError
            Raised if ``assume_edges_unchanged`` and ``check_edges_unchanged``
            are both `True`, or if a full config is provided for a task after
            another full config or an override has already been provided.
        EdgesChangedError
            Raised if ``check_edges_unchanged=True`` and the edges of a task
            do change.
        """
        deep: dict[str, TaskNode] = {}
        shallow: dict[str, TaskNode] = {}
        if assume_edges_unchanged:
            if check_edges_unchanged:
                raise ValueError("Cannot simultaneously assume and check that edges have not changed.")
            shallow.update(updates)
        else:
            for task_label, new_task_node in updates.items():
                old_task_node = self.tasks[task_label]
                messages = old_task_node.diff_edges(new_task_node)
                if messages:
                    if check_edges_unchanged:
                        messages.insert(0, message_header.format(task_label=task_label))
                        raise EdgesChangedError("\n".join(messages))
                    else:
                        deep[task_label] = new_task_node
                else:
                    shallow[task_label] = new_task_node
        try:
            if deep:
                removed = self.remove_tasks(deep.keys(), drop_from_subsets=True)
                self.add_task_nodes(deep.values())
                for replaced_task_node, referencing_subsets in removed:
                    for subset_label in referencing_subsets:
                        self._task_subsets[subset_label].add(replaced_task_node.label)
            for task_node in shallow.values():
                self._xgraph.nodes[task_node.key]["instance"] = task_node
                self._xgraph.nodes[task_node.init.key]["instance"] = task_node.init
        except PipelineGraphExceptionSafetyError:  # pragma: no cover
            raise
        except Exception as err:  # pragma: no cover
            # There's no known way to get here, but we want to make it clear
            # it's a big problem if we do.
            raise PipelineGraphExceptionSafetyError(
                "Error while replacing tasks has left the graph in an inconsistent state."
            ) from err
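    # Illustrative sketch (not part of this module): a caller reconfiguring
    # a single task.  ``new_node`` is a hypothetical `TaskNode` rebuilt from
    # an edited config; the header string follows the contract documented
    # above (single ``task_label`` placeholder, "A"/"B" framing, trailing
    # colon).
    #
    #     self._replace_task_nodes(
    #         {"isr": new_node},
    #         check_edges_unchanged=True,
    #         assume_edges_unchanged=False,
    #         message_header=(
    #             "Edges for task {task_label!r} changed "
    #             "(A=old task, B=new task):"
    #         ),
    #     )
    #
    # With ``check_edges_unchanged=True`` any edge difference raises
    # `EdgesChangedError`; otherwise changed tasks take the deep
    # (remove-and-re-add) path while unchanged ones are swapped in place.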
    def _append_graph_data_from_edge(
        self,
        node_data: list[tuple[NodeKey, dict[str, Any]]],
        edge_data: list[tuple[NodeKey, NodeKey, str, dict[str, Any]]],
        edge: Edge,
        parent: PipelineGraph | None,
    ) -> None:
        """Append networkx state dictionaries for an edge and the
        corresponding dataset type node.

        Parameters
        ----------
        node_data : `list`
            List of node keys and state dictionaries.  A node is appended if
            one does not already exist for this dataset type.
        edge_data : `list`
            List of node key pairs, connection names, and state dictionaries
            for edges.
        edge : `Edge`
            New edge being processed.
        parent : `PipelineGraph` or `None`
            Another pipeline graph whose dataset type nodes should be used
            when present.
        """
        new_dataset_type_node = None
        if parent is not None:
            new_dataset_type_node = parent._xgraph.nodes[edge.dataset_type_key].get("instance")
        if (existing_dataset_type_state := self._xgraph.nodes.get(edge.dataset_type_key)) is not None:
            existing_dataset_type_state["instance"] = new_dataset_type_node
        else:
            node_data.append(
                (
                    edge.dataset_type_key,
                    {
                        "instance": new_dataset_type_node,
                        "bipartite": NodeType.DATASET_TYPE.bipartite,
                    },
                )
            )
        edge_data.append(
            edge.nodes
            + (
                edge.connection_name,
                {"instance": edge},
            )
        )
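    # Illustrative sketch (not part of this module): how a caller might
    # accumulate and apply the state this helper builds up.  The iterable of
    # edges is hypothetical; the point is that
    # `networkx.MultiDiGraph.add_edges_from` accepts the
    # ``(u, v, key, attrs)`` 4-tuples appended to ``edge_data``.
    #
    #     node_data: list[tuple[NodeKey, dict[str, Any]]] = []
    #     edge_data: list[tuple[NodeKey, NodeKey, str, dict[str, Any]]] = []
    #     for edge in task_edges:  # hypothetical iterable of Edge instances
    #         self._append_graph_data_from_edge(node_data, edge_data, edge, parent=None)
    #     self._xgraph.add_nodes_from(node_data)
    #     self._xgraph.add_edges_from(edge_data)
    #     self._reset()  # structure changed; invalidate cached views/order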
    def _reorder(self, sorted_keys: Sequence[NodeKey]) -> None:
        """Set the order of all views of this graph from the given
        topologically sorted sequence of node keys (task labels and dataset
        type names).
        """
        self._sorted_keys = sorted_keys
        self._tasks._reorder(sorted_keys)
        self._dataset_types._reorder(sorted_keys)
    def _reset(self) -> None:
        """Reset all views of this graph following a modification that might
        invalidate them.
        """
        self._sorted_keys = None
        self._tasks._reset()
        self._dataset_types._reset()
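    # Illustrative sketch (not part of this module): the invariant these two
    # helpers maintain.  Any structural mutation calls `_reset` so a stale
    # order is never exposed, and a sort recomputes and caches the order via
    # `_reorder`.  A plain topological sort is shown for illustration only;
    # the real sorting method may use a different deterministic ordering.
    #
    #     self._xgraph.add_edges_from(edge_data)  # structural change...
    #     self._reset()                           # ...invalidates the views
    #
    #     order = list(networkx.algorithms.dag.topological_sort(self._xgraph))
    #     self._reorder(order)                    # views now iterate in order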
    _xgraph: networkx.MultiDiGraph
    _sorted_keys: Sequence[NodeKey] | None
    _task_subsets: dict[str, TaskSubset]
    _description: str
    _tasks: TaskMappingView
    _dataset_types: DatasetTypeMappingView
    _raw_data_id: dict[str, Any]
    _universe: DimensionUniverse | None