Coverage for python/lsst/pipe/base/pipeline_graph/_pipeline_graph.py: 19%

373 statements  

coverage.py v7.3.0, created at 2023-08-31 09:39 +0000

1# This file is part of pipe_base. 

2# 

3# Developed for the LSST Data Management System. 

4# This product includes software developed by the LSST Project 

5# (http://www.lsst.org). 

6# See the COPYRIGHT file at the top-level directory of this distribution 

7# for details of code ownership. 

8# 

9# This program is free software: you can redistribute it and/or modify 

10# it under the terms of the GNU General Public License as published by 

11# the Free Software Foundation, either version 3 of the License, or 

12# (at your option) any later version. 

13# 

14# This program is distributed in the hope that it will be useful, 

15# but WITHOUT ANY WARRANTY; without even the implied warranty of 

16# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the 

17# GNU General Public License for more details. 

18# 

19# You should have received a copy of the GNU General Public License 

20# along with this program. If not, see <http://www.gnu.org/licenses/>. 

21from __future__ import annotations 

22 

23__all__ = ("PipelineGraph",) 

24 

25import gzip 

26import itertools 

27import json 

28from collections.abc import Iterable, Iterator, Mapping, Sequence 

29from typing import TYPE_CHECKING, Any, BinaryIO, Literal, TypeVar, cast 

30 

31import networkx 

32import networkx.algorithms.bipartite 

33import networkx.algorithms.dag 

34from lsst.daf.butler import DataCoordinate, DataId, DimensionGraph, DimensionUniverse, Registry 

35from lsst.resources import ResourcePath, ResourcePathExpression 

36 

37from ._dataset_types import DatasetTypeNode 

38from ._edges import Edge, ReadEdge, WriteEdge 

39from ._exceptions import ( 

40 DuplicateOutputError, 

41 EdgesChangedError, 

42 PipelineDataCycleError, 

43 PipelineGraphError, 

44 PipelineGraphExceptionSafetyError, 

45 UnresolvedGraphError, 

46) 

47from ._mapping_views import DatasetTypeMappingView, TaskMappingView 

48from ._nodes import NodeKey, NodeType 

49from ._task_subsets import TaskSubset 

50from ._tasks import TaskImportMode, TaskInitNode, TaskNode, _TaskNodeImportedData 

51 

52if TYPE_CHECKING: 

53 from ..config import PipelineTaskConfig 

54 from ..connections import PipelineTaskConnections 

55 from ..pipeline import TaskDef 

56 from ..pipelineTask import PipelineTask 

57 

58 

59_G = TypeVar("_G", bound=networkx.DiGraph | networkx.MultiDiGraph) 

60 

61 

62class PipelineGraph: 

63 """A graph representation of a fully-configured pipeline.

64 

65 `PipelineGraph` instances are typically constructed by calling 

66 `.Pipeline.to_graph`, but in rare cases constructing and then populating an 

67 empty one may be preferable. 

68 

69 Parameters 

70 ---------- 

71 description : `str`, optional 

72 String description for this pipeline. 

73 universe : `lsst.daf.butler.DimensionUniverse`, optional 

74 Definitions for all butler dimensions. If not provided, some 

75 attributes will not be available until `resolve` is called. 

76 data_id : `lsst.daf.butler.DataCoordinate` or other data ID, optional 

77 Data ID that represents a constraint on all quanta generated by this 

78 pipeline. This typically just holds the instrument constraint included 

79 in the pipeline definition, if there was one. 

80 """ 

81 

82 ########################################################################### 

83 # 

84 # Simple Pipeline Graph Inspection Interface: 

85 # 

86 # - for inspecting graph structure, not modifying it (except to sort and

87 # resolve); 

88 # 

89 # - no NodeKey objects, just string dataset type name and task label keys; 

90 # 

91 # - graph structure is represented as a pair of mappings, with methods to 

92 # find neighbors and edges of nodes. 

93 # 

94 ########################################################################### 

95 

96 def __init__( 

97 self, 

98 *, 

99 description: str = "", 

100 universe: DimensionUniverse | None = None, 

101 data_id: DataId | None = None, 

102 ) -> None: 

103 self._init_from_args( 

104 xgraph=None, 

105 sorted_keys=None, 

106 task_subsets=None, 

107 description=description, 

108 universe=universe, 

109 data_id=data_id, 

110 ) 

111 

112 def __repr__(self) -> str: 

113 return f"{type(self).__name__}({self.description!r}, tasks={self.tasks!s})" 

114 

115 @property 

116 def description(self) -> str: 

117 """String description for this pipeline.""" 

118 return self._description 

119 

120 @description.setter 

121 def description(self, value: str) -> None: 

122 # Docstring in setter. 

123 self._description = value 

124 

125 @property 

126 def universe(self) -> DimensionUniverse | None: 

127 """Definitions for all butler dimensions.""" 

128 return self._universe 

129 

130 @property 

131 def data_id(self) -> DataCoordinate: 

132 """Data ID that represents a constraint on all quanta generated from 

133 this pipeline. 

134 

135 This may not be available if `universe` is `None`.

136 """ 

137 return DataCoordinate.standardize(self._raw_data_id, universe=self.universe) 

138 

139 @property 

140 def tasks(self) -> TaskMappingView: 

141 """A mapping view of the tasks in the graph. 

142 

143 This mapping has `str` task label keys and `TaskNode` values. Iteration 

144 is topologically and deterministically ordered if and only if `sort` 

145 has been called since the last modification to the graph. 

146 """ 

147 return self._tasks 

148 

149 @property 

150 def dataset_types(self) -> DatasetTypeMappingView: 

151 """A mapping view of the dataset types in the graph. 

152 

153 This mapping has `str` parent dataset type name keys, but only provides 

154 access to its `DatasetTypeNode` values if `resolve` has been called 

155 since the last modification involving a task that uses a dataset type. 

156 See `DatasetTypeMappingView` for details. 

157 """ 

158 return self._dataset_types 

159 

160 @property 

161 def task_subsets(self) -> Mapping[str, TaskSubset]: 

162 """A mapping of all labeled subsets of tasks. 

163 

164 Keys are subset labels, values are sets of task labels. See 

165 `TaskSubset` for more information. 

166 

167 Use `add_task_subset` to add a new subset. The subsets themselves may 

168 be modified in-place. 

169 """ 

170 return self._task_subsets 

171 

172 @property 

173 def is_sorted(self) -> bool: 

174 """Whether this graph's tasks and dataset types are topologically 

175 sorted with the exact same deterministic tiebreakers that `sort` would 

176 apply. 

177 

178 This may perform (and then discard) a full sort if `has_been_sorted` is 

179 `False`. If the goal is to obtain a sorted graph, it is better to just 

180 call `sort` without guarding that with an ``if not graph.is_sorted`` 

181 check. 

182 """ 

183 if self._sorted_keys is not None: 

184 return True 

185 return all( 

186 sorted == unsorted 

187 for sorted, unsorted in zip( 

188 networkx.lexicographical_topological_sort(self._xgraph), self._xgraph, strict=True 

189 ) 

190 ) 

191 

192 @property 

193 def has_been_sorted(self) -> bool: 

194 """Whether this graph's tasks and dataset types have been 

195 topologically sorted (with unspecified but deterministic tiebreakers) 

196 since the last modification to the graph. 

197 

198 This may return `False` if the graph *happens* to be sorted but `sort` 

199 was never called, but it is potentially much faster than `is_sorted`, 

200 which may attempt (and then discard) a full sort if `has_been_sorted` 

201 is `False`. 

202 """ 

203 return self._sorted_keys is not None 

204 

205 def sort(self) -> None: 

206 """Sort this graph's nodes topologically with deterministic (but 

207 unspecified) tiebreakers. 

208 

209 This does nothing if the graph is already known to be sorted. 

210 """ 

211 if self._sorted_keys is None: 

212 try: 

213 sorted_keys: Sequence[NodeKey] = list(networkx.lexicographical_topological_sort(self._xgraph)) 

214 except networkx.NetworkXUnfeasible as err: # pragma: no cover 

215 # Shouldn't be possible to get here, because we check for cycles

216 # when adding tasks, but we guard against it anyway. 

217 cycle = networkx.find_cycle(self._xgraph) 

218 raise PipelineDataCycleError( 

219 f"Cycle detected while attempting to sort graph: {cycle}." 

220 ) from err 

221 self._reorder(sorted_keys) 

222 
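# A minimal usage sketch for the sorting interface above; ``graph`` is
# assumed to be an already-populated PipelineGraph (hypothetical):
#
#     graph.sort()                           # cheap no-op if already sorted
#     assert graph.has_been_sorted
#     first_label = next(iter(graph.tasks))  # iteration is now topological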

223 def copy(self) -> PipelineGraph: 

224 """Return a copy of this graph that copies all mutable state.""" 

225 xgraph = self._xgraph.copy() 

226 result = PipelineGraph.__new__(PipelineGraph) 

227 result._init_from_args( 

228 xgraph, 

229 self._sorted_keys, 

230 task_subsets={ 

231 k: TaskSubset(xgraph, v.label, set(v._members), v.description) 

232 for k, v in self._task_subsets.items() 

233 }, 

234 description=self._description, 

235 universe=self.universe, 

236 data_id=self._raw_data_id, 

237 ) 

238 return result 

239 

240 def __copy__(self) -> PipelineGraph: 

241 # Fully shallow copies are dangerous; we don't want shared mutable 

242 # state to lead to broken class invariants. 

243 return self.copy() 

244 

245 def __deepcopy__(self, memo: dict) -> PipelineGraph: 

246 # Genuine deep copies are unnecessary, since we should only ever care 

247 # that mutable state is copied. 

248 return self.copy() 

249 
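# A short sketch of the copy semantics above: copy.copy() and
# copy.deepcopy() both delegate to copy(), since only mutable state ever
# needs to be duplicated:
#
#     import copy
#     duplicate = copy.copy(graph)   # equivalent to graph.copy()
#     assert duplicate is not graph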

250 def producing_edge_of(self, dataset_type_name: str) -> WriteEdge | None: 

251 """Return the `WriteEdge` that links the producing task to the named 

252 dataset type. 

253 

254 Parameters 

255 ---------- 

256 dataset_type_name : `str` 

257 Dataset type name. Must not be a component. 

258 

259 Returns 

260 ------- 

261 edge : `WriteEdge` or `None` 

262 Producing edge or `None` if there isn't one in this graph. 

263 

264 Raises 

265 ------ 

266 DuplicateOutputError 

267 Raised if there are multiple tasks defined to produce this dataset 

268 type. This is only possible if the graph's dataset types are not 

269 resolved. 

270 

271 Notes 

272 ----- 

273 On resolved graphs, it may be slightly more efficient to use:: 

274 

275 graph.dataset_types[dataset_type_name].producing_edge 

276 

277 but this method works on graphs with unresolved dataset types as well. 

278 """ 

279 producer: str | None = None 

280 producing_edge: WriteEdge | None = None 

281 for _, _, producing_edge in self._xgraph.in_edges( 

282 NodeKey(NodeType.DATASET_TYPE, dataset_type_name), data="instance" 

283 ): 

284 assert producing_edge is not None, "Should only be None if we never loop." 

285 if producer is not None: 

286 raise DuplicateOutputError( 

287 f"Dataset type {dataset_type_name!r} is produced by both {producing_edge.task_label!r} " 

288 f"and {producer!r}." 

289 )
producer = producing_edge.task_label

290 return producing_edge 

291 

292 def consuming_edges_of(self, dataset_type_name: str) -> list[ReadEdge]: 

293 """Return the `ReadEdge` objects that link the named dataset type to 

294 the tasks that consume it. 

295 

296 Parameters 

297 ---------- 

298 dataset_type_name : `str` 

299 Dataset type name. Must not be a component. 

300 

301 Returns 

302 ------- 

303 edges : `list` [ `ReadEdge` ] 

304 Edges that connect this dataset type to the tasks that consume it. 

305 

306 Notes 

307 ----- 

308 On resolved graphs, it may be slightly more efficient to use:: 

309 

310 graph.dataset_types[dataset_type_name].consuming_edges

311 

312 but this method works on graphs with unresolved dataset types as well. 

313 """ 

314 return [ 

315 edge 

316 for _, _, edge in self._xgraph.out_edges( 

317 NodeKey(NodeType.DATASET_TYPE, dataset_type_name), data="instance" 

318 ) 

319 ] 

320 

321 def producer_of(self, dataset_type_name: str) -> TaskNode | TaskInitNode | None: 

322 """Return the `TaskNode` or `TaskInitNode` that writes the given 

323 dataset type. 

324 

325 Parameters 

326 ---------- 

327 dataset_type_name : `str` 

328 Dataset type name. Must not be a component. 

329 

330 Returns 

331 ------- 

332 edge : `TaskNode`, `TaskInitNode`, or `None` 

333 Producing node or `None` if there isn't one in this graph. 

334 

335 Raises 

336 ------ 

337 DuplicateOutputError 

338 Raised if there are multiple tasks defined to produce this dataset 

339 type. This is only possible if the graph's dataset types are not 

340 resolved. 

341 """ 

342 if (producing_edge := self.producing_edge_of(dataset_type_name)) is not None: 

343 return self._xgraph.nodes[producing_edge.task_key]["instance"] 

344 return None 

345 

346 def consumers_of(self, dataset_type_name: str) -> list[TaskNode | TaskInitNode]: 

347 """Return the `TaskNode` and/or `TaskInitNode` objects that read 

348 the given dataset type. 

349 

350 Parameters 

351 ---------- 

352 dataset_type_name : `str` 

353 Dataset type name. Must not be a component. 

354 

355 Returns 

356 ------- 

357 tasks : `list` [ `TaskNode` or `TaskInitNode` ]
358 Nodes for the tasks that consume this dataset type.

359 

360 Notes 

361 ----- 

362 On resolved graphs, it may be slightly more efficient to use:: 

363 

364 graph.dataset_types[dataset_type_name].consuming_edges

365 

366 but this method works on graphs with unresolved dataset types as well. 

367 """ 

368 return [ 

369 self._xgraph.nodes[consuming_edge.task_key]["instance"] 

370 for consuming_edge in self.consuming_edges_of(dataset_type_name) 

371 ] 

372 
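# An inspection sketch tying the four lookups above together; "calexp"
# is a hypothetical dataset type name assumed to be in the graph:
#
#     write_edge = graph.producing_edge_of("calexp")   # WriteEdge or None
#     read_edges = graph.consuming_edges_of("calexp")  # list[ReadEdge]
#     producer = graph.producer_of("calexp")           # task node or None
#     consumers = graph.consumers_of("calexp")         # list of task nodes
#     if producer is not None:
#         print(producer.label, [edge.task_label for edge in read_edges])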

373 def inputs_of(self, task_label: str, init: bool = False) -> dict[str, DatasetTypeNode | None]: 

374 """Return the dataset types that are inputs to a task. 

375 

376 Parameters 

377 ---------- 

378 task_label : `str` 

379 Label for the task in the pipeline. 

380 init : `bool`, optional 

381 If `True`, return init-input dataset types instead of runtime 

382 (including prerequisite) inputs. 

383 

384 Returns 

385 ------- 

386 inputs : `dict` [ `str`, `DatasetTypeNode` or `None` ] 

387 Dictionary with parent dataset type name keys and either

388 `DatasetTypeNode` values (if the dataset type has been resolved) 

389 or `None` values. 

390 

391 Notes 

392 ----- 

393 To get the input edges of a task or task init node (which provide 

394 information about storage class overrides and components) use::

395 

396 graph.tasks[task_label].iter_all_inputs() 

397 

398 or 

399 

400 graph.tasks[task_label].init.iter_all_inputs() 

401 

402 or the various mapping attributes of the `TaskNode` and `TaskInitNode` 

403 class. 

404 """ 

405 node: TaskNode | TaskInitNode = self.tasks[task_label] if not init else self.tasks[task_label].init 

406 return { 

407 edge.parent_dataset_type_name: self._xgraph.nodes[edge.dataset_type_key]["instance"] 

408 for edge in node.iter_all_inputs() 

409 } 

410 

411 def outputs_of( 

412 self, task_label: str, init: bool = False, include_automatic_connections: bool = True 

413 ) -> dict[str, DatasetTypeNode | None]: 

414 """Return the dataset types that are outputs of a task. 

415 

416 Parameters 

417 ---------- 

418 task_label : `str` 

419 Label for the task in the pipeline. 

420 init : `bool`, optional 

421 If `True`, return init-output dataset types instead of runtime 

422 outputs. 

423 include_automatic_connections : `bool`, optional 

424 Whether to include automatic connections such as configs, metadata, 

425 and logs. 

426 

427 Returns 

428 ------- 

429 outputs : `dict` [ `str`, `DatasetTypeNode` or `None` ] 

430 Dictionary with parent dataset type name keys and either

431 `DatasetTypeNode` values (if the dataset type has been resolved) 

432 or `None` values. 

433 

434 Notes 

435 ----- 

436 To get the output edges of a task or task init node (which provide
437 information about storage class overrides and components) use::

438 

439 graph.tasks[task_label].iter_all_outputs() 

440 

441 or 

442 

443 graph.tasks[task_label].init.iter_all_outputs() 

444 

445 or the various mapping attributes of the `TaskNode` and `TaskInitNode` 

446 class. 

447 """ 

448 node: TaskNode | TaskInitNode = self.tasks[task_label] if not init else self.tasks[task_label].init 

449 iterable = node.iter_all_outputs() if include_automatic_connections else node.outputs.values() 

450 return { 

451 edge.parent_dataset_type_name: self._xgraph.nodes[edge.dataset_type_key]["instance"] 

452 for edge in iterable 

453 } 

454 
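# A sketch of the two mapping lookups above, assuming a hypothetical
# task labeled "isr" is present in the graph:
#
#     runtime_inputs = graph.inputs_of("isr")   # {name: node-or-None}
#     init_outputs = graph.outputs_of("isr", init=True)
#     explicit_only = graph.outputs_of("isr", include_automatic_connections=False)
#     for name, node in runtime_inputs.items():
#         print(name, "resolved" if node is not None else "unresolved")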

455 def resolve(self, registry: Registry) -> None: 

456 """Resolve all dimensions and dataset types and check them for 

457 consistency. 

458 

459 Resolving a graph also causes it to be sorted. 

460 

461 Parameters 

462 ---------- 

463 registry : `lsst.daf.butler.Registry` 

464 Client for the data repository to resolve against. 

465 

466 Notes 

467 ----- 

468 The `universe` attribute is set to ``registry.dimensions`` and used to 

469 set all `TaskNode.dimensions` attributes. Dataset type nodes are 

470 resolved by first looking for a registry definition, then using the 

471 producing task's definition, then looking for consistency between all 

472 consuming task definitions. 

473 

474 Raises 

475 ------ 

476 ConnectionTypeConsistencyError 

477 Raised if a prerequisite input for one task appears as a different 

478 kind of connection in any other task. 

479 DuplicateOutputError 

480 Raised if multiple tasks have the same dataset type as an output. 

481 IncompatibleDatasetTypeError 

482 Raised if different tasks have different definitions of a dataset 

483 type. Different but compatible storage classes are permitted. 

484 MissingDatasetTypeError 

485 Raised if a dataset type definition is required to exist in the 

486 data repository but none was found. This should only occur for 

487 dataset types that are not produced by a task in the pipeline and 

488 are consumed with different storage classes or as components by 

489 tasks in the pipeline. 

490 EdgesChangedError 

491 Raised if ``check_edges_unchanged=True`` and the edges of a task do 

492 change after import and reconfiguration. 

493 """ 

494 node_key: NodeKey 

495 updates: dict[NodeKey, TaskNode | DatasetTypeNode] = {} 

496 for node_key, node_state in self._xgraph.nodes.items(): 

497 match node_key.node_type: 

498 case NodeType.TASK: 

499 task_node: TaskNode = node_state["instance"] 

500 new_task_node = task_node._resolved(registry.dimensions) 

501 if new_task_node is not task_node: 

502 updates[node_key] = new_task_node 

503 case NodeType.DATASET_TYPE: 

504 dataset_type_node: DatasetTypeNode | None = node_state["instance"] 

505 new_dataset_type_node = DatasetTypeNode._from_edges( 

506 node_key, self._xgraph, registry, previous=dataset_type_node 

507 ) 

508 # Usage of ``is`` here is intentional; `_from_edges` returns

509 # `previous=dataset_type_node` if it can determine that it 

510 # doesn't need to change. 

511 if new_dataset_type_node is not dataset_type_node: 

512 updates[node_key] = new_dataset_type_node 

513 try: 

514 for node_key, node_value in updates.items(): 

515 self._xgraph.nodes[node_key]["instance"] = node_value 

516 except Exception as err: # pragma: no cover 

517 # There's no known way to get here, but we want to make it 

518 # clear it's a big problem if we do. 

519 raise PipelineGraphExceptionSafetyError( 

520 "Error during dataset type resolution has left the graph in an inconsistent state." 

521 ) from err 

522 self.sort() 

523 self._universe = registry.dimensions 

524 
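# A minimal resolve sketch; ``butler`` is a hypothetical
# lsst.daf.butler.Butler client for an existing data repository:
#
#     graph.resolve(butler.registry)
#     assert graph.universe is butler.registry.dimensions
#     assert graph.has_been_sorted   # resolving also sorts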

525 ########################################################################### 

526 # 

527 # Graph Modification Interface: 

528 # 

529 # - methods to add, remove, and replace tasks; 

530 # 

531 # - methods to add and remove task subsets. 

532 # 

533 # These are all things that are usually done in a Pipeline before making a 

534 # graph at all, but there may be cases where we want to modify the graph 

535 # instead. (These are also the methods used to make a graph from a 

536 # Pipeline, or make a graph from another graph.) 

537 # 

538 ########################################################################### 

539 

540 def add_task( 

541 self, 

542 label: str, 

543 task_class: type[PipelineTask], 

544 config: PipelineTaskConfig, 

545 connections: PipelineTaskConnections | None = None, 

546 ) -> TaskNode: 

547 """Add a new task to the graph. 

548 

549 Parameters 

550 ---------- 

551 label : `str` 

552 Label for the task in the pipeline. 

553 task_class : `type` [ `PipelineTask` ] 

554 Class object for the task. 

555 config : `PipelineTaskConfig` 

556 Configuration for the task. 

557 connections : `PipelineTaskConnections`, optional 

558 Object that describes the dataset types used by the task. If not 

559 provided, one will be constructed from the given configuration. If 

560 provided, it is assumed that ``config`` has already been validated 

561 and frozen. 

562 

563 Returns 

564 ------- 

565 node : `TaskNode` 

566 The new task node added to the graph. 

567 

568 Raises 

569 ------ 

570 ValueError 

571 Raised if configuration validation failed when constructing 

572 ``connections``. 

573 PipelineDataCycleError 

574 Raised if the graph is cyclic after this addition. 

575 RuntimeError 

576 Raised if an unexpected exception (which will be chained) occurred 

577 at a stage that may have left the graph in an inconsistent state. 

578 Other exceptions should leave the graph unchanged. 

579 

580 Notes 

581 ----- 

582 Checks for dataset type consistency and multiple producers do not occur 

583 until `resolve` is called, since the resolution depends on both the 

584 state of the data repository and all contributing tasks. 

585 

586 Adding new tasks removes any existing resolutions of the dataset types
587 they reference and marks the graph as unsorted. It is most efficient

588 to add all tasks up front and only then resolve and/or sort the graph. 

589 """ 

590 task_node = TaskNode._from_imported_data( 

591 key=NodeKey(NodeType.TASK, label), 

592 init_key=NodeKey(NodeType.TASK_INIT, label), 

593 data=_TaskNodeImportedData.configure(label, task_class, config, connections), 

594 universe=self.universe, 

595 ) 

596 self.add_task_nodes([task_node]) 

597 return task_node 

598 
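# A population sketch, assuming a hypothetical PipelineTask subclass
# ``MyTask`` with the usual ConfigClass attribute and a hypothetical
# ``butler`` client:
#
#     graph = PipelineGraph(description="demo")
#     node = graph.add_task("my_task", MyTask, MyTask.ConfigClass())
#     assert graph.tasks["my_task"] is node
#     graph.resolve(butler.registry)   # defer until all tasks are added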

599 def add_task_nodes(self, nodes: Iterable[TaskNode], parent: PipelineGraph | None = None) -> None: 

600 """Add one or more existing task nodes to the graph. 

601 

602 Parameters 

603 ---------- 

604 nodes : `~collections.abc.Iterable` [ `TaskNode` ] 

605 Iterable of task nodes to add. If any tasks have resolved 

606 dimensions, they must have the same dimension universe as the rest 

607 of the graph. 

608 parent : `PipelineGraph`, optional 

609 If provided, another `PipelineGraph` from which these nodes were 

610 obtained. Any dataset type nodes already present in ``parent`` 

611 that are referenced by the given tasks will be used in this graph 

612 if they are not already present, preserving any dataset type 

613 resolutions present in the parent graph. Adding nodes from a 

614 parent graph after the graph has its own nodes (e.g. from 

615 `add_task`) or nodes from a third graph may result in invalid 

616 dataset type resolutions. It is safest to only use this argument 

617 when populating an empty graph for the first time. 

618 

619 Raises 

620 ------ 

621 PipelineDataCycleError 

622 Raised if the graph is cyclic after this addition. 

623 

624 Notes 

625 ----- 

626 Checks for dataset type consistency and multiple producers do not occur 

627 until `resolve` is called, since the resolution depends on both the 

628 state of the data repository and all contributing tasks. 

629 

630 Adding new tasks removes any existing resolutions of the dataset types
631 they reference (unless ``parent is not None``) and marks the graph as

632 unsorted. It is most efficient to add all tasks up front and only then 

633 resolve and/or sort the graph. 

634 """ 

635 node_data: list[tuple[NodeKey, dict[str, Any]]] = [] 

636 edge_data: list[tuple[NodeKey, NodeKey, str, dict[str, Any]]] = [] 

637 for task_node in nodes: 

638 task_node = task_node._resolved(self._universe) 

639 node_data.append( 

640 (task_node.key, {"instance": task_node, "bipartite": task_node.key.node_type.bipartite}) 

641 ) 

642 node_data.append( 

643 ( 

644 task_node.init.key, 

645 {"instance": task_node.init, "bipartite": task_node.init.key.node_type.bipartite}, 

646 ) 

647 ) 

648 # Convert the edge objects attached to the task node to networkx. 

649 for read_edge in task_node.init.iter_all_inputs(): 

650 self._append_graph_data_from_edge(node_data, edge_data, read_edge, parent=parent) 

651 for write_edge in task_node.init.iter_all_outputs(): 

652 self._append_graph_data_from_edge(node_data, edge_data, write_edge, parent=parent) 

653 for read_edge in task_node.iter_all_inputs(): 

654 self._append_graph_data_from_edge(node_data, edge_data, read_edge, parent=parent) 

655 for write_edge in task_node.iter_all_outputs(): 

656 self._append_graph_data_from_edge(node_data, edge_data, write_edge, parent=parent) 

657 # Add a special edge (with no Edge instance) that connects the 

658 # TaskInitNode to the runtime TaskNode. 

659 edge_data.append((task_node.init.key, task_node.key, Edge.INIT_TO_TASK_NAME, {"instance": None})) 

660 if not node_data and not edge_data: 

661 return 

662 # Checks and preparation complete; time to start the actual 

663 # modification, during which it's hard to provide strong exception 

664 # safety. Start by resetting the sort ordering, if there is one. 

665 self._reset() 

666 try: 

667 self._xgraph.add_nodes_from(node_data) 

668 self._xgraph.add_edges_from(edge_data) 

669 if not networkx.algorithms.dag.is_directed_acyclic_graph(self._xgraph): 

670 cycle = networkx.find_cycle(self._xgraph) 

671 raise PipelineDataCycleError(f"Cycle detected while adding tasks: {cycle}.") 

672 except Exception: 

673 # First try to roll back our changes. 

674 try: 

675 self._xgraph.remove_edges_from(edge_data) 

676 self._xgraph.remove_nodes_from(key for key, _ in node_data) 

677 except Exception as err: # pragma: no cover 

678 # There's no known way to get here, but we want to make it 

679 # clear it's a big problem if we do. 

680 raise PipelineGraphExceptionSafetyError( 

681 "Error while attempting to revert PipelineGraph modification has left the graph in " 

682 "an inconsistent state." 

683 ) from err 

684 # Successfully rolled back; raise the original exception. 

685 raise 

686 

687 def reconfigure_tasks( 

688 self, 

689 *args: tuple[str, PipelineTaskConfig], 

690 check_edges_unchanged: bool = False, 

691 assume_edges_unchanged: bool = False, 

692 **kwargs: PipelineTaskConfig, 

693 ) -> None: 

694 """Update the configuration for one or more tasks. 

695 

696 Parameters 

697 ---------- 

698 *args : `tuple` [ `str`, `.PipelineTaskConfig` ] 

699 Positional arguments are each a 2-tuple of task label and new 

700 config object. Note that the same arguments may also be passed as 

701 ``**kwargs``, which is usually more readable, but task labels in 

702 ``*args`` are not required to be valid Python identifiers. 

703 check_edges_unchanged : `bool`, optional 

704 If `True`, require the edges (connections) of the modified tasks to 

705 remain unchanged after the configuration updates, and verify that 

706 this is the case. 

707 assume_edges_unchanged : `bool`, optional 

708 If `True`, the caller declares that the edges (connections) of the 

709 modified tasks will remain unchanged after the configuration 

710 updates, and that it is unnecessary to check this. 

711 **kwargs : `.PipelineTaskConfig` 

712 New config objects or overrides to apply to copies of the current 

713 config objects, with task labels as the keywords. 

714 

715 Raises 

716 ------ 

717 ValueError 

718 Raised if ``assume_edges_unchanged`` and ``check_edges_unchanged`` 

719 are both `True`, or if the same task appears twice. 

720 EdgesChangedError 

721 Raised if ``check_edges_unchanged=True`` and the edges of a task do 

722 change. 

723 

724 Notes 

725 ----- 

726 If reconfiguring a task causes its edges to change, any dataset type 

727 nodes connected to that task (not just those whose edges have changed!) 

728 will be unresolved. 

729 """ 

730 new_configs: dict[str, PipelineTaskConfig] = {} 

731 for task_label, config_update in itertools.chain(args, kwargs.items()): 

732 if new_configs.setdefault(task_label, config_update) is not config_update: 

733 raise ValueError(f"Config for {task_label!r} provided more than once.") 

734 updates = { 

735 task_label: self.tasks[task_label]._reconfigured(config, rebuild=not assume_edges_unchanged) 

736 for task_label, config in new_configs.items() 

737 } 

738 self._replace_task_nodes( 

739 updates, 

740 check_edges_unchanged=check_edges_unchanged, 

741 assume_edges_unchanged=assume_edges_unchanged, 

742 message_header=( 

743 "Unexpected change in edges for task {task_label!r} from original config (A) to " 

744 "new configs (B):" 

745 ), 

746 ) 

747 
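# A reconfiguration sketch; ``new_config`` is a hypothetical
# PipelineTaskConfig instance for a task labeled "my_task":
#
#     graph.reconfigure_tasks(my_task=new_config)
#     # positional 2-tuples support labels that are not identifiers:
#     graph.reconfigure_tasks(("my-task", new_config), check_edges_unchanged=True)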

748 def remove_tasks( 

749 self, labels: Iterable[str], drop_from_subsets: bool = True 

750 ) -> list[tuple[TaskNode, set[str]]]: 

751 """Remove one or more tasks from the graph. 

752 

753 Parameters 

754 ---------- 

755 labels : `~collections.abc.Iterable` [ `str` ] 

756 Iterable of the labels of the tasks to remove. 

757 drop_from_subsets : `bool`, optional 

758 If `True`, drop each removed task from any subset in which it 

759 currently appears. If `False`, raise `PipelineGraphError` if any 

760 such subsets exist. 

761 

762 Returns 

763 ------- 

764 nodes_and_subsets : `list` [ `tuple` [ `TaskNode`, `set` [ `str` ] ] ] 

765 List of nodes removed and the labels of task subsets that 

766 referenced them. 

767 

768 Raises 

769 ------ 

770 PipelineGraphError 

771 Raised if ``drop_from_subsets`` is `False` and the task is still 

772 part of one or more subsets. 

773 

774 Notes 

775 ----- 

776 Removing a task will cause dataset nodes with no other referencing 

777 tasks to be removed. Any other dataset type nodes referenced by a 

778 removed task will be reset to an "unresolved" state. 

779 """ 

780 task_nodes_and_subsets = [] 

781 dataset_types: set[NodeKey] = set() 

782 nodes_to_remove = set() 

783 for label in labels: 

784 task_node: TaskNode = self._xgraph.nodes[NodeKey(NodeType.TASK, label)]["instance"] 

785 # Find task subsets that reference this task. 

786 referencing_subsets = { 

787 subset_label 

788 for subset_label, task_subset in self.task_subsets.items() 

789 if label in task_subset 

790 } 

791 if not drop_from_subsets and referencing_subsets: 

792 raise PipelineGraphError( 

793 f"Task {label!r} is still referenced by subset(s) {referencing_subsets}." 

794 ) 

795 task_nodes_and_subsets.append((task_node, referencing_subsets)) 

796 # Find dataset types referenced by this task. 

797 dataset_types.update(self._xgraph.predecessors(task_node.key)) 

798 dataset_types.update(self._xgraph.successors(task_node.key)) 

799 dataset_types.update(self._xgraph.predecessors(task_node.init.key)) 

800 dataset_types.update(self._xgraph.successors(task_node.init.key)) 

801 # Since there's an edge between the task and its init node, we'll 

802 # have added those two nodes here, too, and we don't want that. 

803 dataset_types.remove(task_node.init.key) 

804 dataset_types.remove(task_node.key) 

805 # Mark the task node and its init node for removal from the graph. 

806 nodes_to_remove.add(task_node.key) 

807 nodes_to_remove.add(task_node.init.key) 

808 # Process the referenced datasets to see which ones are orphaned and 

809 # need to be removed vs. just unresolved. 

810 nodes_to_unresolve = [] 

811 for dataset_type_key in dataset_types: 

812 related_tasks = set() 

813 related_tasks.update(self._xgraph.predecessors(dataset_type_key)) 

814 related_tasks.update(self._xgraph.successors(dataset_type_key)) 

815 related_tasks.difference_update(nodes_to_remove) 

816 if not related_tasks: 

817 nodes_to_remove.add(dataset_type_key) 

818 else: 

819 nodes_to_unresolve.append(dataset_type_key) 

820 # Checks and preparation complete; time to start the actual 

821 # modification, during which it's hard to provide strong exception 

822 # safety. Start by resetting the sort ordering. 

823 self._reset() 

824 try: 

825 for dataset_type_key in nodes_to_unresolve: 

826 self._xgraph.nodes[dataset_type_key]["instance"] = None 

827 for task_node, referencing_subsets in task_nodes_and_subsets: 

828 for subset_label in referencing_subsets: 

829 self._task_subsets[subset_label].remove(task_node.label) 

830 self._xgraph.remove_nodes_from(nodes_to_remove) 

831 except Exception as err: # pragma: no cover 

832 # There's no known way to get here, but we want to make it 

833 # clear it's a big problem if we do. 

834 raise PipelineGraphExceptionSafetyError( 

835 "Error during task removal has left the graph in an inconsistent state." 

836 ) from err 

837 return task_nodes_and_subsets 

838 

839 def add_task_subset(self, subset_label: str, task_labels: Iterable[str], description: str = "") -> None: 

840 """Add a label for a set of tasks that are already in the pipeline. 

841 

842 Parameters 

843 ---------- 

844 subset_label : `str` 

845 Label for this set of tasks. 

846 task_labels : `~collections.abc.Iterable` [ `str` ] 

847 Labels of the tasks to include in the set. All must already be 

848 included in the graph. 

849 description : `str`, optional 

850 String description to associate with this label. 

851 """ 

852 subset = TaskSubset(self._xgraph, subset_label, set(task_labels), description) 

853 self._task_subsets[subset_label] = subset 

854 

855 def remove_task_subset(self, subset_label: str) -> None: 

856 """Remove a labeled set of tasks.""" 

857 del self._task_subsets[subset_label] 

858 
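# A subset bookkeeping sketch; "isr" and "calibrate" are hypothetical
# task labels assumed to be in the graph already:
#
#     graph.add_task_subset("step1", ["isr", "calibrate"], "early steps")
#     assert "isr" in graph.task_subsets["step1"]
#     graph.remove_task_subset("step1")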

859 ########################################################################### 

860 # 

861 # NetworkX Export Interface: 

862 # 

863 # - methods to export the PipelineGraph's content (or various subsets 

864 # thereof) as NetworkX objects. 

865 # 

866 # These are particularly useful when writing tools to visualize the graph, 

867 # while providing options for which aspects of the graph (tasks, dataset 

868 # types, or both) to include, since all exported graphs have similar 

869 # attributes regardless of their structure. 

870 # 

871 ########################################################################### 

872 

873 def make_xgraph(self) -> networkx.MultiDiGraph: 

874 """Export a networkx representation of the full pipeline graph, 

875 including both init and runtime edges. 

876 

877 Returns 

878 ------- 

879 xgraph : `networkx.MultiDiGraph` 

880 Directed acyclic graph with parallel edges. 

881 

882 Notes 

883 ----- 

884 The returned graph uses `NodeKey` instances for nodes. Parallel edges 

885 represent the same dataset type appearing in multiple connections for 

886 the same task, and are hence rare. The connection name is used as the 

887 edge key to disambiguate those parallel edges. 

888 

889 Almost all edges connect dataset type nodes to task or task init nodes 

890 or vice versa, but there is also a special edge that connects each task 

891 init node to its runtime node. The existence of these edges makes the 

892 graph not quite bipartite, though its init-only and runtime-only 

893 subgraphs are bipartite. 

894 

895 See `TaskNode`, `TaskInitNode`, `DatasetTypeNode`, `ReadEdge`, and 

896 `WriteEdge` for the descriptive node and edge attributes added. 

897 """ 

898 return self._transform_xgraph_state(self._xgraph.copy(), skip_edges=False) 

899 

900 def make_bipartite_xgraph(self, init: bool = False) -> networkx.MultiDiGraph: 

901 """Return a bipartite networkx representation of just the runtime or 

902 init-time pipeline graph. 

903 

904 Parameters 

905 ---------- 

906 init : `bool`, optional 

907 If `True` (`False` is default) return the graph of task 

908 initialization nodes and init input/output dataset types, instead 

909 of the graph of runtime task nodes and regular 

910 input/output/prerequisite dataset types. 

911 

912 Returns 

913 ------- 

914 xgraph : `networkx.MultiDiGraph` 

915 Directed acyclic graph with parallel edges. 

916 

917 Notes 

918 ----- 

919 The returned graph uses `NodeKey` instances for nodes. Parallel edges 

920 represent the same dataset type appearing in multiple connections for 

921 the same task, and are hence rare. The connection name is used as the 

922 edge key to disambiguate those parallel edges. 

923 

924 This graph is bipartite because each dataset type node only has edges 

925 that connect it to a task [init] node, and vice versa. 

926 

927 See `TaskNode`, `TaskInitNode`, `DatasetTypeNode`, `ReadEdge`, and 

928 `WriteEdge` for the descriptive node and edge attributes added. 

929 """ 

930 return self._transform_xgraph_state( 

931 self._make_bipartite_xgraph_internal(init).copy(), skip_edges=False 

932 ) 

933 

934 def make_task_xgraph(self, init: bool = False) -> networkx.DiGraph: 

935 """Return a networkx representation of just the tasks in the pipeline. 

936 

937 Parameters 

938 ---------- 

939 init : `bool`, optional 

940 If `True` (`False` is default) return the graph of task 

941 initialization nodes, instead of the graph of runtime task nodes. 

942 

943 Returns 

944 ------- 

945 xgraph : `networkx.DiGraph` 

946 Directed acyclic graph with no parallel edges. 

947 

948 Notes 

949 ----- 

950 The returned graph uses `NodeKey` instances for nodes. The dataset 

951 types that link these tasks are not represented at all; edges have no 

952 attributes, and there are no parallel edges. 

953 

954 See `TaskNode` and `TaskInitNode` for the descriptive node attributes
955 added.

956 """ 

957 bipartite_xgraph = self._make_bipartite_xgraph_internal(init) 

958 task_keys = [ 

959 key 

960 for key, bipartite in bipartite_xgraph.nodes(data="bipartite") 

961 if bipartite == NodeType.TASK.bipartite 

962 ] 

963 return self._transform_xgraph_state( 

964 networkx.algorithms.bipartite.projected_graph(networkx.DiGraph(bipartite_xgraph), task_keys), 

965 skip_edges=True, 

966 ) 

967 

968 def make_dataset_type_xgraph(self, init: bool = False) -> networkx.DiGraph: 

969 """Return a networkx representation of just the dataset types in the 

970 pipeline. 

971 

972 Parameters 

973 ---------- 

974 init : `bool`, optional 

975 If `True` (`False` is default) return the graph of init input and 

976 output dataset types, instead of the graph of runtime (input, 

977 output, prerequisite input) dataset types. 

978 

979 Returns 

980 ------- 

981 xgraph : `networkx.DiGraph` 

982 Directed acyclic graph with no parallel edges. 

983 

984 Notes 

985 ----- 

986 The returned graph uses `NodeKey` instances for nodes. The tasks that 

987 link these dataset types are not represented at all; edges have no attributes,

988 and there are no parallel edges. 

989 

990 See `DatasetTypeNode` for the descriptive node attributes added.

991 """ 

992 bipartite_xgraph = self._make_bipartite_xgraph_internal(init) 

993 dataset_type_keys = [ 

994 key 

995 for key, bipartite in bipartite_xgraph.nodes(data="bipartite") 

996 if bipartite == NodeType.DATASET_TYPE.bipartite 

997 ] 

998 return self._transform_xgraph_state( 

999 networkx.algorithms.bipartite.projected_graph( 

1000 networkx.DiGraph(bipartite_xgraph), dataset_type_keys 

1001 ), 

1002 skip_edges=True, 

1003 ) 

1004 
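# An export sketch: all four methods above return plain networkx graphs
# keyed by NodeKey, so standard networkx algorithms apply directly:
#
#     import networkx
#     task_xgraph = graph.make_task_xgraph()
#     critical_path_length = networkx.dag_longest_path_length(task_xgraph)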

1005 ########################################################################### 

1006 # 

1007 # Serialization Interface. 

1008 # 

1009 # Serialization of PipelineGraphs is currently experimental and may not be 

1010 # retained in the future. All serialization methods are 

1011 # underscore-prefixed to ensure nobody mistakes them for a stable interface 

1012 # (let alone a stable file format).

1013 # 

1014 ########################################################################### 

1015 

1016 @classmethod 

1017 def _read_stream( 

1018 cls, stream: BinaryIO, import_mode: TaskImportMode = TaskImportMode.REQUIRE_CONSISTENT_EDGES 

1019 ) -> PipelineGraph: 

1020 """Read a serialized `PipelineGraph` from a file-like object. 

1021 

1022 Parameters 

1023 ---------- 

1024 stream : `BinaryIO` 

1025 File-like object opened for binary reading, containing 

1026 gzip-compressed JSON. 

1027 import_mode : `TaskImportMode`, optional 

1028 Whether to import tasks, and how to reconcile any differences 

1029 between the imported task's connections and those that were

1030 persisted with the graph. Default is to check that they are the 

1031 same. 

1032 

1033 Returns 

1034 ------- 

1035 graph : `PipelineGraph` 

1036 Deserialized pipeline graph. 

1037 

1038 Raises 

1039 ------ 

1040 PipelineGraphReadError 

1041 Raised if the serialized `PipelineGraph` is not self-consistent. 

1042 EdgesChangedError 

1043 Raised if ``import_mode`` is 

1044 `TaskImportMode.REQUIRE_CONSISTENT_EDGES` and the edges of a task

1045 did change after import and reconfiguration. 

1046 

1047 Notes 

1048 ----- 

1049 `PipelineGraph` serialization is currently experimental and may be 

1050 removed or significantly changed in the future, with no deprecation 

1051 period. 

1052 """ 

1053 from .io import SerializedPipelineGraph 

1054 

1055 with gzip.open(stream, "rb") as uncompressed_stream: 

1056 data = json.load(uncompressed_stream) 

1057 serialized_graph = SerializedPipelineGraph.parse_obj(data) 

1058 return serialized_graph.deserialize(import_mode) 

1059 

1060 @classmethod 

1061 def _read_uri( 

1062 cls, 

1063 uri: ResourcePathExpression, 

1064 import_mode: TaskImportMode = TaskImportMode.REQUIRE_CONSISTENT_EDGES, 

1065 ) -> PipelineGraph: 

1066 """Read a serialized `PipelineGraph` from a file at a URI. 

1067 

1068 Parameters 

1069 ---------- 

1070 uri : convertible to `lsst.resources.ResourcePath` 

1071 URI to a gzip-compressed JSON file containing a serialized pipeline 

1072 graph. 

1073 import_mode : `TaskImportMode`, optional 

1074 Whether to import tasks, and how to reconcile any differences 

1075 between the imported task's connections and those that were

1076 persisted with the graph. Default is to check that they are the 

1077 same. 

1078 

1079 Returns 

1080 ------- 

1081 graph : `PipelineGraph` 

1082 Deserialized pipeline graph. 

1083 

1084 Raises 

1085 ------ 

1086 PipelineGraphReadError 

1087 Raised if the serialized `PipelineGraph` is not self-consistent. 

1088 EdgesChangedError 

1089 Raised if ``import_mode`` is 

1090 `TaskImportMode.REQUIRE_CONSISTENT_EDGES` and the edges of a task

1091 did change after import and reconfiguration. 

1092 

1093 Notes 

1094 ----- 

1095 `PipelineGraph` serialization is currently experimental and may be 

1096 removed or significantly changed in the future, with no deprecation 

1097 period. 

1098 """ 

1099 uri = ResourcePath(uri) 

1100 with uri.open("rb") as stream: 

1101 return cls._read_stream(cast(BinaryIO, stream), import_mode=import_mode) 

1102 

1103 def _write_stream(self, stream: BinaryIO) -> None: 

1104 """Write the pipeline to a file-like object. 

1105 

1106 Parameters 

1107 ---------- 

1108 stream : `BinaryIO`

1109 File-like object opened for binary writing. 

1110 

1111 Notes 

1112 ----- 

1113 `PipelineGraph` serialization is currently experimental and may be 

1114 removed or significantly changed in the future, with no deprecation 

1115 period. 

1116 

1117 The file format is gzipped JSON, and is intended to be human-readable, 

1118 but it should not be considered a stable public interface for outside 

1119 code, which should always use `PipelineGraph` methods (or at least the 

1120 `io.SerializedPipelineGraph` class) to read these files. 

1121 """ 

1122 from .io import SerializedPipelineGraph 

1123 

1124 with gzip.open(stream, mode="wb") as compressed_stream: 

1125 compressed_stream.write( 

1126 SerializedPipelineGraph.serialize(self).json(exclude_defaults=True).encode("utf-8") 

1127 ) 

1128 

1129 def _write_uri(self, uri: ResourcePathExpression) -> None: 

1130 """Write the pipeline to a file given a URI. 

1131 

1132 Parameters 

1133 ---------- 

1134 uri : convertible to `lsst.resources.ResourcePath` 

1135 URI to write to. May have a ``.json.gz`` extension or no extension
1136 (which will cause a ``.json.gz`` extension to be added).

1137 

1138 Notes 

1139 ----- 

1140 `PipelineGraph` serialization is currently experimental and may be 

1141 removed or significantly changed in the future, with no deprecation 

1142 period. 

1143 

1144 The file format is gzipped JSON, and is intended to be human-readable, 

1145 but it should not be considered a stable public interface for outside 

1146 code, which should always use `PipelineGraph` methods (or at least the 

1147 `io.SerializedPipelineGraph` class) to read these files. 

1148 """ 

1149 uri = ResourcePath(uri) 

1150 extension = uri.getExtension() 

1151 if not extension: 

1152 uri = uri.updatedExtension(".json.gz") 

1153 elif extension != ".json.gz": 

1154 raise ValueError("Expanded pipeline files should always have a .json.gz extension.") 

1155 with uri.open(mode="wb") as stream: 

1156 self._write_stream(cast(BinaryIO, stream)) 

1157 
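# A round-trip sketch for the experimental, underscore-prefixed
# serialization methods above; the path is hypothetical:
#
#     graph._write_uri("/tmp/pipeline_graph.json.gz")
#     restored = PipelineGraph._read_uri("/tmp/pipeline_graph.json.gz")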

1158 def _import_and_configure( 

1159 self, import_mode: TaskImportMode = TaskImportMode.REQUIRE_CONSISTENT_EDGES 

1160 ) -> None: 

1161 """Import the `PipelineTask` classes referenced by all task nodes and 

1162 update those nodes accordingly. 

1163 

1164 Parameters 

1165 ---------- 

1166 import_mode : `TaskImportMode`, optional 

1167 Whether to import tasks, and how to reconcile any differences 

1168 between the imported task's connections and those that were

1169 persisted with the graph. Default is to check that they are the 

1170 same. This method does nothing if this is 

1171 `TaskImportMode.DO_NOT_IMPORT`. 

1172 

1173 Raises 

1174 ------ 

1175 EdgesChangedError 

1176 Raised if ``import_mode`` is 

1177 `TaskImportMode.REQUIRE_CONSISTENT_EDGES` and the edges of a task

1178 did change after import and reconfiguration. 

1179 

1180 Notes 

1181 ----- 

1182 This method shouldn't need to be called unless the graph was 

1183 deserialized without importing and configuring immediately, which is 

1184 not the default behavior (but it can greatly speed up deserialization). 

1185 If all tasks have already been imported this does nothing. 

1186 

1187 Importing and configuring a task can change its 

1188 `~TaskNode.task_class_name` or `~TaskNode.get_config_str` output,

1189 usually because the software used to read a serialized graph is newer 

1190 than the software used to write it (e.g. a new config option has been 

1191 added, or the task was moved to a new module with a forwarding alias 

1192 left behind). These changes are allowed by 

1193 `TaskImportMode.REQUIRE_CONSISTENT_EDGES`. 

1194 

1195 If importing and configuring a task causes its edges to change, any 

1196 dataset type nodes linked to those edges will be reset to the 

1197 unresolved state. 

1198 """ 

1199 if import_mode is TaskImportMode.DO_NOT_IMPORT: 

1200 return 

1201 rebuild = ( 

1202 import_mode is TaskImportMode.REQUIRE_CONSISTENT_EDGES 

1203 or import_mode is TaskImportMode.OVERRIDE_EDGES 

1204 ) 

1205 updates: dict[str, TaskNode] = {} 

1206 node_key: NodeKey 

1207 for node_key, node_state in self._xgraph.nodes.items(): 

1208 if node_key.node_type is NodeType.TASK: 

1209 task_node: TaskNode = node_state["instance"] 

1210 new_task_node = task_node._imported_and_configured(rebuild) 

1211 if new_task_node is not task_node: 

1212 updates[task_node.label] = new_task_node 

1213 self._replace_task_nodes( 

1214 updates, 

1215 check_edges_unchanged=(import_mode is TaskImportMode.REQUIRE_CONSISTENT_EDGES), 

1216 assume_edges_unchanged=(import_mode is TaskImportMode.ASSUME_CONSISTENT_EDGES), 

1217 message_header=( 

1218 "In task with label {task_label!r}, persisted edges (A)" 

1219 "differ from imported and configured edges (B):" 

1220 ), 

1221 ) 

1222 

1223 ########################################################################### 

1224 # 

1225 # Advanced PipelineGraph Inspection Interface: 

1226 # 

1227 # - methods to iterate over all nodes and edges, utilizing NodeKeys; 

1228 # 

1229 # - methods to find overall inputs and group nodes by their dimensions, 

1230 # which are important operations for QuantumGraph generation. 

1231 # 

1232 ########################################################################### 

1233 

1234 def iter_edges(self, init: bool = False) -> Iterator[Edge]: 

1235 """Iterate over edges in the graph. 

1236 

1237 Parameters 

1238 ---------- 

1239 init : `bool`, optional 

1240 If `True` (`False` is default) iterate over the edges between task 

1241 initialization node and init input/output dataset types, instead of 

1242 the runtime task nodes and regular input/output/prerequisite 

1243 dataset types. 

1244 

1245 Returns 

1246 ------- 

1247 edges : `~collections.abc.Iterator` [ `Edge` ] 

1248 A lazy iterator over `Edge` (`WriteEdge` or `ReadEdge`) instances. 

1249 

1250 Notes 

1251 ----- 

1252 This method always returns *either* init edges or runtime edges, never

1253 both. The full (internal) graph that contains both also includes a 

1254 special edge that connects each task init node to its runtime node; 

1255 that is also never returned by this method, since it is never a part of 

1256 the init-only or runtime-only subgraphs. 

1257 """ 

1258 edge: Edge 

1259 for _, _, edge in self._xgraph.edges(data="instance"): 

1260 if edge is not None and edge.is_init == init: 

1261 yield edge 

1262 

1263 def iter_nodes( 

1264 self, 

1265 ) -> Iterator[ 

1266 tuple[Literal[NodeType.TASK_INIT], str, TaskInitNode] 

1267 | tuple[Literal[NodeType.TASK], str, TaskNode]

1268 | tuple[Literal[NodeType.DATASET_TYPE], str, DatasetTypeNode | None] 

1269 ]: 

1270 """Iterate over nodes in the graph. 

1271 

1272 Returns 

1273 ------- 

1274 nodes : `~collections.abc.Iterator` [ `tuple` ] 

1275 A lazy iterator over all of the nodes in the graph. Each yielded 

1276 element is a tuple of: 

1277 

1278 - the node type enum value (`NodeType`); 

1279 - the string name for the node (task label or parent dataset type 

1280 name); 

1281 - the node value (`TaskNode`, `TaskInitNode`, `DatasetTypeNode`, 

1282 or `None` for dataset type nodes that have not been resolved). 

1283 """ 

1284 key: NodeKey 

1285 if self._sorted_keys is not None: 

1286 for key in self._sorted_keys: 

1287 yield key.node_type, key.name, self._xgraph.nodes[key]["instance"] # type: ignore 

1288 else: 

1289 for key, node in self._xgraph.nodes(data="instance"): 

1290 yield key.node_type, key.name, node # type: ignore 

1291 

1292 def iter_overall_inputs(self) -> Iterator[tuple[str, DatasetTypeNode | None]]: 

1293 """Iterate over all of the dataset types that are consumed but not 

1294 produced by the graph. 

1295 

1296 Returns 

1297 ------- 

1298 dataset_types : `~collections.abc.Iterator` [ `tuple` ] 

1299 A lazy iterator over the overall-input dataset types (including 

1300 overall init inputs and prerequisites). Each yielded element is a 

1301 tuple of: 

1302 

1303 - the parent dataset type name; 

1304 - the resolved `DatasetTypeNode`, or `None` if the dataset type has
1305 not been resolved.

1306 """ 

1307 for generation in networkx.algorithms.dag.topological_generations(self._xgraph): 

1308 key: NodeKey 

1309 for key in generation: 

1310 # While we expect all tasks to have at least one input and 

1311 # hence never appear in the first topological generation, that 

1312 # is not true of task init nodes. 

1313 if key.node_type is NodeType.DATASET_TYPE: 

1314 yield key.name, self._xgraph.nodes[key]["instance"] 

1315 return 

1316 
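# An iteration sketch for the advanced interface above:
#
#     init_edges = list(graph.iter_edges(init=True))
#     for node_type, name, node in graph.iter_nodes():
#         print(node_type, name, node)
#     for name, node in graph.iter_overall_inputs():
#         print("overall input:", name, node is not None)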

1317 def group_by_dimensions( 

1318 self, prerequisites: bool = False 

1319 ) -> dict[DimensionGraph, tuple[dict[str, TaskNode], dict[str, DatasetTypeNode]]]: 

1320 """Group this graph's tasks and dataset types by their dimensions. 

1321 

1322 Parameters 

1323 ---------- 

1324 prerequisites : `bool`, optional 

1325 If `True`, include prerequisite dataset types as well as regular 

1326 input and output datasets (including intermediates). 

1327 

1328 Returns 

1329 ------- 

1330 groups : `dict` [ `DimensionGraph`, `tuple` ] 

1331 A dictionary of groups keyed by `DimensionGraph`, in which each 

1332 value is a tuple of: 

1333 

1334 - a `dict` of `TaskNode` instances, keyed by task label 

1335 - a `dict` of `DatasetTypeNode` instances, keyed by 

1336 dataset type name. 

1337 

1338 that have those dimensions. 

1339 

1340 Notes 

1341 ----- 

1342 Init inputs and outputs are always included, but always have empty 

1343 dimensions and hence are all grouped together.

1344 """ 

1345 result: dict[DimensionGraph, tuple[dict[str, TaskNode], dict[str, DatasetTypeNode]]] = {} 

1346 next_new_value: tuple[dict[str, TaskNode], dict[str, DatasetTypeNode]] = ({}, {}) 

1347 for task_label, task_node in self.tasks.items(): 

1348 if task_node.dimensions is None: 

1349 raise UnresolvedGraphError(f"Task with label {task_label!r} has not been resolved.") 

1350 if (group := result.setdefault(task_node.dimensions, next_new_value)) is next_new_value: 

1351 next_new_value = ({}, {}) # make new lists for next time 

1352 group[0][task_node.label] = task_node 

1353 for dataset_type_name, dataset_type_node in self.dataset_types.items(): 

1354 if dataset_type_node is None: 

1355 raise UnresolvedGraphError(f"Dataset type {dataset_type_name!r} has not been resolved.") 

1356 if not dataset_type_node.is_prerequisite or prerequisites: 

1357 if ( 

1358 group := result.setdefault(dataset_type_node.dataset_type.dimensions, next_new_value) 

1359 ) is next_new_value: 

1360 next_new_value = ({}, {}) # make new lists for next time 

1361 group[1][dataset_type_node.name] = dataset_type_node 

1362 return result 

1363 
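# A grouping sketch; this requires a fully resolved graph and otherwise
# raises UnresolvedGraphError:
#
#     for dimensions, (tasks, dataset_types) in graph.group_by_dimensions().items():
#         print(dimensions, sorted(tasks), sorted(dataset_types))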

1364 def split_independent(self) -> Iterable[PipelineGraph]: 

1365 """Iterate over independent subgraphs that together comprise this 

1366 pipeline graph. 

1367 

1368 Returns 

1369 ------- 

1370 subgraphs : `Iterable` [ `PipelineGraph` ] 

1371 An iterable over component subgraphs that could be run 

1372 independently (they have only overall inputs in common). May be a 

1373 lazy iterator. 

1374 

1375 Notes 

1376 ----- 

1377 All resolved dataset type nodes will be preserved. 

1378 

1379 If there is only one component, ``self`` may be returned as the only 

1380 element in the iterable. 

1381 

1382 If `has_been_sorted`, all subgraphs will be sorted as well. 

1383 """ 

1384 # Having an overall input in common isn't enough to make subgraphs 

1385 # dependent on each other, so we want to look for connected component 

1386 # subgraphs of the task-only projected graph. 

1387 bipartite_xgraph = self._make_bipartite_xgraph_internal(init=False) 

1388 task_keys = { 

1389 key 

1390 for key, bipartite in bipartite_xgraph.nodes(data="bipartite") 

1391 if bipartite == NodeType.TASK.bipartite 

1392 } 

1393 task_xgraph = networkx.algorithms.bipartite.projected_graph( 

1394 networkx.DiGraph(bipartite_xgraph), task_keys 

1395 ) 

1396 # "Weakly" connected means connected in only one direction, which is 

1397 # the only kind of "connected" a DAG can ever be. 

1398 for component_task_keys in networkx.algorithms.weakly_connected_components(task_xgraph): 

1399 if component_task_keys == task_keys: 

1400 yield self 

1401 return 

1402 else: 

1403 component_subgraph = PipelineGraph(universe=self._universe) 

1404 component_subgraph.add_task_nodes( 

1405 [self._xgraph.nodes[key]["instance"] for key in component_task_keys], parent=self 

1406 ) 

1407 if self.has_been_sorted: 

1408 component_subgraph.sort() 

1409 yield component_subgraph 

1410 
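# A splitting sketch: each yielded component can be run independently,
# sharing only overall inputs with its siblings:
#
#     for component in graph.split_independent():
#         print(component.description, list(component.tasks))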

1411 ########################################################################### 

1412 # 

1413 # Class- and Package-Private Methods. 

1414 # 

1415 ########################################################################### 

1416 

1417 def _iter_task_defs(self) -> Iterator[TaskDef]: 

1418 """Iterate over this pipeline as a sequence of `TaskDef` instances. 

1419 

1420 Notes 

1421 ----- 

1422 This is a package-private method intended to aid in the transition to a 

1423 codebase more fully integrated with the `PipelineGraph` class, in which 

1424 both `TaskDef` and `PipelineDatasetTypes` are expected to go away, and 

1425 much of the functionality on the `Pipeline` class will be moved to 

1426 `PipelineGraph` as well. 

1427 

1428 Raises 

1429 ------ 

1430 TaskNotImportedError 

1431 Raised if `TaskNode.is_imported` is `False` for any task. 

1432 """ 

1433 from ..pipeline import TaskDef 

1434 

1435 for node in self._tasks.values(): 

1436 yield TaskDef( 

1437 config=node.config, 

1438 taskClass=node.task_class, 

1439 label=node.label, 

1440 connections=node._get_imported_data().connections, 

1441 ) 

1442 
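Illustrative only, given the package-private status noted above: materializing the iterator into a label-keyed mapping (``graph`` is an assumed `PipelineGraph` whose tasks have all been imported):

    task_defs = {task_def.label: task_def for task_def in graph._iter_task_defs()}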

1443 def _init_from_args( 

1444 self, 

1445 xgraph: networkx.MultiDiGraph | None, 

1446 sorted_keys: Sequence[NodeKey] | None, 

1447 task_subsets: dict[str, TaskSubset] | None, 

1448 description: str, 

1449 universe: DimensionUniverse | None, 

1450 data_id: DataId | None, 

1451 ) -> None: 

1452 """Initialize the graph with possibly-nontrivial arguments. 

1453 

1454 Parameters 

1455 ---------- 

1456 xgraph : `networkx.MultiDiGraph` or `None` 

1457 The backing networkx graph, or `None` to create an empty one. 

1458 This graph has `NodeKey` instances for nodes and the same structure 

1459 as the graph exported by `make_xgraph`, but its nodes and edges 

1460 have a single ``instance`` attribute that holds a `TaskNode`, 

1461 `TaskInitNode`, `DatasetTypeNode` (or `None`), `ReadEdge`, or 

1462 `WriteEdge` instance. 

1463 sorted_keys : `Sequence` [ `NodeKey` ] or `None` 

1464 Topologically sorted sequence of node keys, or `None` if the graph 

1465 is not sorted. 

1466 task_subsets : `dict` [ `str`, `TaskSubset` ] or `None` 

1467 Labeled subsets of tasks. Values must be constructed with 

1468 ``xgraph`` as their parent graph. 

1469 description : `str` 

1470 String description for this pipeline. 

1471 universe : `lsst.daf.butler.DimensionUniverse` or `None` 

1472 Definitions of all dimensions. 

1473 data_id : `lsst.daf.butler.DataCoordinate`, other data ID mapping, or `None` 

1474 Data ID that represents a constraint on all quanta generated from 

1475 this pipeline. 

1476 

1477 Notes 

1478 ----- 

1479 Only empty `PipelineGraph` instances should be constructed directly 

1480 by users, which is what fixes the signature of ``__init__``, but 

1481 methods on `PipelineGraph` and its helper classes need to be able to 

1482 create them with state. Those methods can call this method after 

1483 calling ``__new__`` manually, skipping ``__init__``. 

1484 """ 

1485 self._xgraph = xgraph if xgraph is not None else networkx.MultiDiGraph() 

1486 self._sorted_keys: Sequence[NodeKey] | None = None 

1487 self._task_subsets = task_subsets if task_subsets is not None else {} 

1488 self._description = description 

1489 self._tasks = TaskMappingView(self._xgraph) 

1490 self._dataset_types = DatasetTypeMappingView(self._xgraph) 

1491 self._raw_data_id: dict[str, Any] 

1492 if isinstance(data_id, DataCoordinate): 

1493 if universe is None: 

1494 universe = data_id.universe 

1495 else: 

1496 assert universe is data_id.universe, "data_id.universe and given universe differ" 

1497 self._raw_data_id = data_id.byName() 

1498 elif data_id is None: 

1499 self._raw_data_id = {} 

1500 else: 

1501 self._raw_data_id = dict(data_id) 

1502 self._universe = universe 

1503 if sorted_keys is not None: 

1504 self._reorder(sorted_keys) 

1505 
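A minimal sketch of the construction pattern described in the Notes above; every argument value here is a placeholder for whatever state the calling method has built:

    empty = PipelineGraph.__new__(PipelineGraph)  # deliberately skip __init__
    empty._init_from_args(
        xgraph=None,        # create a fresh empty backing graph
        sorted_keys=None,   # not sorted
        task_subsets=None,  # no labeled task subsets
        description="",
        universe=None,
        data_id=None,
    )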

1506 def _make_bipartite_xgraph_internal(self, init: bool) -> networkx.MultiDiGraph: 

1507 """Make a bipartite init-only or runtime-only internal subgraph. 

1508 

1509 See `make_bipartite_xgraph` for parameters and return values. 

1510 

1511 Notes 

1512 ----- 

1513 This method returns a view of the `PipelineGraph` object's internal 

1514 backing graph, and hence should only be called in methods that copy the 

1515 result either explicitly or by running a copying algorithm before 

1516 returning it to the user. 

1517 """ 

1518 return self._xgraph.edge_subgraph([edge.key for edge in self.iter_edges(init)]) 

1519 
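The copy-before-return pattern required by the Notes above might look like this inside a calling method (a sketch, not the actual callers; ``networkx`` is already imported at module scope):

    view = self._make_bipartite_xgraph_internal(init=False)
    # Copy the edge-subgraph view so callers cannot mutate internal state.
    exported = networkx.MultiDiGraph(view)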

1520 def _transform_xgraph_state(self, xgraph: _G, skip_edges: bool) -> _G: 

1521 """Transform networkx graph attributes in-place from the internal 

1522 "instance" attributes to the documented exported attributes. 

1523 

1524 Parameters 

1525 ---------- 

1526 xgraph : `networkx.DiGraph` or `networkx.MultiDiGraph` 

1527 Graph whose state should be transformed. 

1528 skip_edges : `bool` 

1529 If `True`, do not transform edge state. 

1530 

1531 Returns 

1532 ------- 

1533 xgraph : `networkx.DiGraph` or `networkx.MultiDiGraph` 

1534 The same object passed in, after modification. 

1535 

1536 Notes 

1537 ----- 

1538 This should be called after making a copy of the internal graph but 

1539 before any projection down to just task or dataset type nodes, since 

1540 it assumes stateful edges. 

1541 """ 

1542 state: dict[str, Any] 

1543 for state in xgraph.nodes.values(): 

1544 node_value: TaskInitNode | TaskNode | DatasetTypeNode | None = state.pop("instance") 

1545 if node_value is not None: 

1546 state.update(node_value._to_xgraph_state()) 

1547 if not skip_edges: 

1548 for _, _, state in xgraph.edges(data=True): 

1549 edge: Edge | None = state.pop("instance", None) 

1550 if edge is not None: 

1551 state.update(edge._to_xgraph_state()) 

1552 return xgraph 

1553 
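Per the Notes above, callers copy first and transform second, before any projection; a sketch inside a `PipelineGraph` method (``skip_edges=False`` is just an example value):

    copied = networkx.MultiDiGraph(self._xgraph)  # copy the internal graph
    exported = self._transform_xgraph_state(copied, skip_edges=False)
    # Only now is projection down to task-only or dataset-type-only nodes
    # safe, because edge state has already been expanded.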

1554 def _replace_task_nodes( 

1555 self, 

1556 updates: Mapping[str, TaskNode], 

1557 check_edges_unchanged: bool, 

1558 assume_edges_unchanged: bool, 

1559 message_header: str, 

1560 ) -> None: 

1561 """Replace task nodes and update edges and dataset type nodes 

1562 accordingly. 

1563 

1564 Parameters 

1565 ---------- 

1566 updates : `Mapping` [ `str`, `TaskNode` ] 

1567 New task nodes with task label keys. All keys must be task labels 

1568 that are already present in the graph. 

1569 check_edges_unchanged : `bool`, optional 

1570 If `True`, require the edges (connections) of the modified tasks to 

1571 remain unchanged after importing and configuring each task, and 

1572 verify that this is the case. 

1573 assume_edges_unchanged : `bool`, optional 

1574 If `True`, the caller declares that the edges (connections) of the 

1575 modified tasks will remain unchanged after importing and configuring 

1576 each task, and that it is unnecessary to check this. 

1577 message_header : `str` 

1578 Template for `str.format` with a single ``task_label`` placeholder 

1579 to use as the first line in `EdgesChangedError` messages that show 

1580 the differences between new task edges and old task edges. Should 

1581 include the fact that the rest of the message will refer to the old 

1582 task as "A" and the new task as "B", and end with a colon. 

1583 

1584 Raises 

1585 ------ 

1586 ValueError 

1587 Raised if ``assume_edges_unchanged`` and ``check_edges_unchanged`` 

1588 are both `True`, or if a full config is provided for a task after 

1589 another full config or an override has already been provided. 

1590 EdgesChangedError 

1591 Raised if ``check_edges_unchanged=True`` and the edges of a task do 

1592 change. 

1593 """ 

1594 deep: dict[str, TaskNode] = {} 

1595 shallow: dict[str, TaskNode] = {} 

1596 if assume_edges_unchanged: 

1597 if check_edges_unchanged: 

1598 raise ValueError("Cannot simultaneously assume and check that edges have not changed.") 

1599 shallow.update(updates) 

1600 else: 

1601 for task_label, new_task_node in updates.items(): 

1602 old_task_node = self.tasks[task_label] 

1603 messages = old_task_node.diff_edges(new_task_node) 

1604 if messages: 

1605 if check_edges_unchanged: 

1606 messages.insert(0, message_header.format(task_label=task_label)) 

1607 raise EdgesChangedError("\n".join(messages)) 

1608 else: 

1609 deep[task_label] = new_task_node 

1610 else: 

1611 shallow[task_label] = new_task_node 

1612 try: 

1613 if deep: 

1614 removed = self.remove_tasks(deep.keys(), drop_from_subsets=True) 

1615 self.add_task_nodes(deep.values()) 

1616 for replaced_task_node, referencing_subsets in removed: 

1617 for subset_label in referencing_subsets: 

1618 self._task_subsets[subset_label].add(replaced_task_node.label) 

1619 for task_node in shallow.values(): 

1620 self._xgraph.nodes[task_node.key]["instance"] = task_node 

1621 self._xgraph.nodes[task_node.init.key]["instance"] = task_node.init 

1622 except PipelineGraphExceptionSafetyError: # pragma: no cover 

1623 raise 

1624 except Exception as err: # pragma: no cover 

1625 # There's no known way to get here, but we want to make it clear 

1626 # it's a big problem if we do. 

1627 raise PipelineGraphExceptionSafetyError( 

1628 "Error while replacing tasks has left the graph in an inconsistent state." 

1629 ) from err 

1630 
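A hypothetical call that replaces one task while verifying that its connections are unchanged; the label ``"isr"`` and the node ``new_task_node`` are placeholders:

    graph._replace_task_nodes(
        {"isr": new_task_node},
        check_edges_unchanged=True,
        assume_edges_unchanged=False,
        message_header="Edges of task {task_label!r} changed (A=old, B=new):",
    )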

1631 def _append_graph_data_from_edge( 

1632 self, 

1633 node_data: list[tuple[NodeKey, dict[str, Any]]], 

1634 edge_data: list[tuple[NodeKey, NodeKey, str, dict[str, Any]]], 

1635 edge: Edge, 

1636 parent: PipelineGraph | None, 

1637 ) -> None: 

1638 """Append networkx state dictionaries for an edge and the corresponding 

1639 dataset type node. 

1640 

1641 Parameters 

1642 ---------- 

1643 node_data : `list` 

1644 List of node keys and state dictionaries. A node is appended if 

1645 one does not already exist for this dataset type. 

1646 edge_data : `list` 

1647 List of node key pairs, connection names, and state dictionaries 

1648 for edges. 

1649 edge : `Edge` 

1650 New edge being processed. 

1651 parent : `PipelineGraph` or `None` 

1652 Another pipeline graph whose dataset type nodes should be used 

1653 when present. 

1654 """ 

1655 new_dataset_type_node = None 

1656 if parent is not None: 

1657 new_dataset_type_node = parent._xgraph.nodes[edge.dataset_type_key].get("instance") 

1658 if (existing_dataset_type_state := self._xgraph.nodes.get(edge.dataset_type_key)) is not None: 

1659 existing_dataset_type_state["instance"] = new_dataset_type_node 

1660 else: 

1661 node_data.append( 

1662 ( 

1663 edge.dataset_type_key, 

1664 { 

1665 "instance": new_dataset_type_node, 

1666 "bipartite": NodeType.DATASET_TYPE.bipartite, 

1667 }, 

1668 ) 

1669 ) 

1670 edge_data.append( 

1671 edge.nodes 

1672 + ( 

1673 edge.connection_name, 

1674 {"instance": edge}, 

1675 ) 

1676 ) 

1677 

1678 def _reorder(self, sorted_keys: Sequence[NodeKey]) -> None: 

1679 """Set the order of all views of this graph from the given sorted 

1680 sequence of task labels and dataset type names. 

1681 """ 

1682 self._sorted_keys = sorted_keys 

1683 self._tasks._reorder(sorted_keys) 

1684 self._dataset_types._reorder(sorted_keys) 

1685 

1686 def _reset(self) -> None: 

1687 """Reset the all views of this graph following a modification that 

1688 might invalidate them. 

1689 """ 

1690 self._sorted_keys = None 

1691 self._tasks._reset() 

1692 self._dataset_types._reset() 

1693 

1694 _xgraph: networkx.MultiDiGraph 

1695 _sorted_keys: Sequence[NodeKey] | None 

1696 _task_subsets: dict[str, TaskSubset] 

1697 _description: str 

1698 _tasks: TaskMappingView 

1699 _dataset_types: DatasetTypeMappingView 

1700 _raw_data_id: dict[str, Any] 

1701 _universe: DimensionUniverse | None