Coverage for python/lsst/pipe/base/pipeline_graph/_pipeline_graph.py: 20%

377 statements  

coverage.py v7.4.2, created at 2024-02-21 10:57 +0000

1# This file is part of pipe_base. 

2# 

3# Developed for the LSST Data Management System. 

4# This product includes software developed by the LSST Project 

5# (http://www.lsst.org). 

6# See the COPYRIGHT file at the top-level directory of this distribution 

7# for details of code ownership. 

8# 

9# This software is dual licensed under the GNU General Public License and also 

10# under a 3-clause BSD license. Recipients may choose which of these licenses 

11# to use; please see the files gpl-3.0.txt and/or bsd_license.txt, 

12# respectively. If you choose the GPL option then the following text applies 

13# (but note that there is still no warranty even if you opt for BSD instead): 

14# 

15# This program is free software: you can redistribute it and/or modify 

16# it under the terms of the GNU General Public License as published by 

17# the Free Software Foundation, either version 3 of the License, or 

18# (at your option) any later version. 

19# 

20# This program is distributed in the hope that it will be useful, 

21# but WITHOUT ANY WARRANTY; without even the implied warranty of 

22# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the 

23# GNU General Public License for more details. 

24# 

25# You should have received a copy of the GNU General Public License 

26# along with this program. If not, see <http://www.gnu.org/licenses/>. 

27from __future__ import annotations 

28 

29__all__ = ("PipelineGraph",) 

30 

31import gzip 

32import itertools 

33import json 

34from collections.abc import Iterable, Iterator, Mapping, Sequence 

35from typing import TYPE_CHECKING, Any, BinaryIO, Literal, TypeVar, cast 

36 

37import networkx 

38import networkx.algorithms.bipartite 

39import networkx.algorithms.dag 

40from lsst.daf.butler import DataCoordinate, DataId, DimensionGroup, DimensionUniverse, Registry 

41from lsst.resources import ResourcePath, ResourcePathExpression 

42 

43from ._dataset_types import DatasetTypeNode 

44from ._edges import Edge, ReadEdge, WriteEdge 

45from ._exceptions import ( 

46 DuplicateOutputError, 

47 EdgesChangedError, 

48 PipelineDataCycleError, 

49 PipelineGraphError, 

50 PipelineGraphExceptionSafetyError, 

51 UnresolvedGraphError, 

52) 

53from ._mapping_views import DatasetTypeMappingView, TaskMappingView 

54from ._nodes import NodeKey, NodeType 

55from ._task_subsets import TaskSubset 

56from ._tasks import TaskImportMode, TaskInitNode, TaskNode, _TaskNodeImportedData 

57 

58if TYPE_CHECKING: 

59 from ..config import PipelineTaskConfig 

60 from ..connections import PipelineTaskConnections 

61 from ..pipeline import TaskDef 

62 from ..pipelineTask import PipelineTask 

63 

64 

65_G = TypeVar("_G", bound=networkx.DiGraph | networkx.MultiDiGraph) 

66 

67 

68class PipelineGraph: 

69 """A graph representation of fully-configured pipeline. 

70 

71 `PipelineGraph` instances are typically constructed by calling 

72 `.Pipeline.to_graph`, but in rare cases constructing and then populating an 

73 empty one may be preferable. 

74 

75 Parameters 

76 ---------- 

77 description : `str`, optional 

78 String description for this pipeline. 

79 universe : `lsst.daf.butler.DimensionUniverse`, optional 

80 Definitions for all butler dimensions. If not provided, some 

81 attributes will not be available until `resolve` is called. 

82 data_id : `lsst.daf.butler.DataCoordinate` or other data ID, optional 

83 Data ID that represents a constraint on all quanta generated by this 

84 pipeline. This typically just holds the instrument constraint included 

85 in the pipeline definition, if there was one. 
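
Examples

--------

Construct an empty graph and add a task to it (an illustrative sketch;

``ExampleTask`` is a hypothetical `PipelineTask` subclass)::

    graph = PipelineGraph(description="demo pipeline")

    graph.add_task("example", ExampleTask, ExampleTask.ConfigClass())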

86 """ 

87 

88 ########################################################################### 

89 # 

90 # Simple Pipeline Graph Inspection Interface: 

91 # 

92 # - for inspecting graph structure, not modifying it (except to sort and 

93 # resolve); 

94 # 

95 # - no NodeKey objects, just string dataset type name and task label keys; 

96 # 

97 # - graph structure is represented as a pair of mappings, with methods to 

98 # find neighbors and edges of nodes. 

99 # 

100 ########################################################################### 

101 

102 def __init__( 

103 self, 

104 *, 

105 description: str = "", 

106 universe: DimensionUniverse | None = None, 

107 data_id: DataId | None = None, 

108 ) -> None: 

109 self._init_from_args( 

110 xgraph=None, 

111 sorted_keys=None, 

112 task_subsets=None, 

113 description=description, 

114 universe=universe, 

115 data_id=data_id, 

116 ) 

117 

118 def __repr__(self) -> str: 

119 return f"{type(self).__name__}({self.description!r}, tasks={self.tasks!s})" 

120 

121 @property 

122 def description(self) -> str: 

123 """String description for this pipeline.""" 

124 return self._description 

125 

126 @description.setter 

127 def description(self, value: str) -> None: 

128 # Docstring in getter. 

129 self._description = value 

130 

131 @property 

132 def universe(self) -> DimensionUniverse | None: 

133 """Definitions for all butler dimensions.""" 

134 return self._universe 

135 

136 @property 

137 def data_id(self) -> DataCoordinate: 

138 """Data ID that represents a constraint on all quanta generated from 

139 this pipeline. 

140 

141 This may not be available unless `universe` is not `None`. 

142 """ 

143 return DataCoordinate.standardize(self._raw_data_id, universe=self.universe) 

144 

145 @property 

146 def tasks(self) -> TaskMappingView: 

147 """A mapping view of the tasks in the graph. 

148 

149 This mapping has `str` task label keys and `TaskNode` values. Iteration 

150 is topologically and deterministically ordered if and only if `sort` 

151 has been called since the last modification to the graph. 

152 """ 

153 return self._tasks 

154 

155 @property 

156 def dataset_types(self) -> DatasetTypeMappingView: 

157 """A mapping view of the dataset types in the graph. 

158 

159 This mapping has `str` parent dataset type name keys, but only provides 

160 access to its `DatasetTypeNode` values if `resolve` has been called 

161 since the last modification involving a task that uses a dataset type. 

162 See `DatasetTypeMappingView` for details. 

163 """ 

164 return self._dataset_types 

165 

166 @property 

167 def task_subsets(self) -> Mapping[str, TaskSubset]: 

168 """A mapping of all labeled subsets of tasks. 

169 

170 Keys are subset labels, values are sets of task labels. See 

171 `TaskSubset` for more information. 

172 

173 Use `add_task_subset` to add a new subset. The subsets themselves may 

174 be modified in-place. 

175 """ 

176 return self._task_subsets 

177 

178 @property 

179 def is_fully_resolved(self) -> bool: 

180 """Whether all of this graph's nodes are resolved.""" 

181 return self._universe is not None and all( 

182 self.dataset_types.is_resolved(k) for k in self.dataset_types 

183 ) 

184 

185 @property 

186 def is_sorted(self) -> bool: 

187 """Whether this graph's tasks and dataset types are topologically 

188 sorted with the exact same deterministic tiebreakers that `sort` would 

189 apply. 

190 

191 This may perform (and then discard) a full sort if `has_been_sorted` is 

192 `False`. If the goal is to obtain a sorted graph, it is better to just 

193 call `sort` without guarding that with an ``if not graph.is_sorted`` 

194 check. 

195 """ 

196 if self._sorted_keys is not None: 

197 return True 

198 return all( 

199 sorted == unsorted 

200 for sorted, unsorted in zip( 

201 networkx.lexicographical_topological_sort(self._xgraph), self._xgraph, strict=True 

202 ) 

203 ) 

204 

205 @property 

206 def has_been_sorted(self) -> bool: 

207 """Whether this graph's tasks and dataset types have been 

208 topologically sorted (with unspecified but deterministic tiebreakers) 

209 since the last modification to the graph. 

210 

211 This may return `False` if the graph *happens* to be sorted but `sort` 

212 was never called, but it is potentially much faster than `is_sorted`, 

213 which may attempt (and then discard) a full sort if `has_been_sorted` 

214 is `False`. 

215 """ 

216 return self._sorted_keys is not None 

217 

218 def sort(self) -> None: 

219 """Sort this graph's nodes topologically with deterministic (but 

220 unspecified) tiebreakers. 

221 

222 This does nothing if the graph is already known to be sorted. 

223 """ 

224 if self._sorted_keys is None: 

225 try: 

226 sorted_keys: Sequence[NodeKey] = list(networkx.lexicographical_topological_sort(self._xgraph)) 

227 except networkx.NetworkXUnfeasible as err: # pragma: no cover 

228 # Shouldn't be possible to get here, because we check for cycles 

229 # when adding tasks, but we guard against it anyway. 

230 cycle = networkx.find_cycle(self._xgraph) 

231 raise PipelineDataCycleError( 

232 f"Cycle detected while attempting to sort graph: {cycle}." 

233 ) from err 

234 self._reorder(sorted_keys) 

235 

236 def copy(self) -> PipelineGraph: 

237 """Return a copy of this graph that copies all mutable state.""" 

238 xgraph = self._xgraph.copy() 

239 result = PipelineGraph.__new__(PipelineGraph) 

240 result._init_from_args( 

241 xgraph, 

242 self._sorted_keys, 

243 task_subsets={ 

244 k: TaskSubset(xgraph, v.label, set(v._members), v.description) 

245 for k, v in self._task_subsets.items() 

246 }, 

247 description=self._description, 

248 universe=self.universe, 

249 data_id=self._raw_data_id, 

250 ) 

251 return result 

252 

253 def __copy__(self) -> PipelineGraph: 

254 # Fully shallow copies are dangerous; we don't want shared mutable 

255 # state to lead to broken class invariants. 

256 return self.copy() 

257 

258 def __deepcopy__(self, memo: dict) -> PipelineGraph: 

259 # Genuine deep copies are unnecessary, since we should only ever care 

260 # that mutable state is copied. 

261 return self.copy() 

262 

263 def producing_edge_of(self, dataset_type_name: str) -> WriteEdge | None: 

264 """Return the `WriteEdge` that links the producing task to the named 

265 dataset type. 

266 

267 Parameters 

268 ---------- 

269 dataset_type_name : `str` 

270 Dataset type name. Must not be a component. 

271 

272 Returns 

273 ------- 

274 edge : `WriteEdge` or `None` 

275 Producing edge or `None` if there isn't one in this graph. 

276 

277 Raises 

278 ------ 

279 DuplicateOutputError 

280 Raised if there are multiple tasks defined to produce this dataset 

281 type. This is only possible if the graph's dataset types are not 

282 resolved. 

283 

284 Notes 

285 ----- 

286 On resolved graphs, it may be slightly more efficient to use:: 

287 

288 graph.dataset_types[dataset_type_name].producing_edge 

289 

290 but this method works on graphs with unresolved dataset types as well. 

291 """ 

292 producer: str | None = None 

293 producing_edge: WriteEdge | None = None 

294 for _, _, producing_edge in self._xgraph.in_edges( 

295 NodeKey(NodeType.DATASET_TYPE, dataset_type_name), data="instance" 

296 ): 

297 assert producing_edge is not None, "Should only be None if we never loop." 

298 if producer is not None: 

299 raise DuplicateOutputError( 

300 f"Dataset type {dataset_type_name!r} is produced by both {producing_edge.task_label!r} " 

301 f"and {producer!r}." 

302 ) 

producer = producing_edge.task_label  # Record the first producer so a duplicate triggers the error above.

303 return producing_edge 

304 

305 def consuming_edges_of(self, dataset_type_name: str) -> list[ReadEdge]: 

306 """Return the `ReadEdge` objects that link the named dataset type to 

307 the tasks that consume it. 

308 

309 Parameters 

310 ---------- 

311 dataset_type_name : `str` 

312 Dataset type name. Must not be a component. 

313 

314 Returns 

315 ------- 

316 edges : `list` [ `ReadEdge` ] 

317 Edges that connect this dataset type to the tasks that consume it. 

318 

319 Notes 

320 ----- 

321 On resolved graphs, it may be slightly more efficient to use:: 

322 

323 graph.dataset_types[dataset_type_name].consuming_edges 

324 

325 but this method works on graphs with unresolved dataset types as well. 

326 """ 

327 return [ 

328 edge 

329 for _, _, edge in self._xgraph.out_edges( 

330 NodeKey(NodeType.DATASET_TYPE, dataset_type_name), data="instance" 

331 ) 

332 ] 

333 

334 def producer_of(self, dataset_type_name: str) -> TaskNode | TaskInitNode | None: 

335 """Return the `TaskNode` or `TaskInitNode` that writes the given 

336 dataset type. 

337 

338 Parameters 

339 ---------- 

340 dataset_type_name : `str` 

341 Dataset type name. Must not be a component. 

342 

343 Returns 

344 ------- 

345 node : `TaskNode`, `TaskInitNode`, or `None` 

346 Producing node or `None` if there isn't one in this graph. 

347 

348 Raises 

349 ------ 

350 DuplicateOutputError 

351 Raised if there are multiple tasks defined to produce this dataset 

352 type. This is only possible if the graph's dataset types are not 

353 resolved. 

354 """ 

355 if (producing_edge := self.producing_edge_of(dataset_type_name)) is not None: 

356 return self._xgraph.nodes[producing_edge.task_key]["instance"] 

357 return None 

358 

359 def consumers_of(self, dataset_type_name: str) -> list[TaskNode | TaskInitNode]: 

360 """Return the `TaskNode` and/or `TaskInitNode` objects that read 

361 the given dataset type. 

362 

363 Parameters 

364 ---------- 

365 dataset_type_name : `str` 

366 Dataset type name. Must not be a component. 

367 

368 Returns 

369 ------- 

370 nodes : `list` [ `TaskNode` or `TaskInitNode` ] 

371 The task and/or task init nodes that consume this dataset type. 

372 

373 Notes 

374 ----- 

375 On resolved graphs, it may be slightly more efficient to use:: 

376 

377 graph.dataset_types[dataset_type_name].consuming_edges 

378 

379 but this method works on graphs with unresolved dataset types as well. 

380 """ 

381 return [ 

382 self._xgraph.nodes[consuming_edge.task_key]["instance"] 

383 for consuming_edge in self.consuming_edges_of(dataset_type_name) 

384 ] 

385 

386 def inputs_of(self, task_label: str, init: bool = False) -> dict[str, DatasetTypeNode | None]: 

387 """Return the dataset types that are inputs to a task. 

388 

389 Parameters 

390 ---------- 

391 task_label : `str` 

392 Label for the task in the pipeline. 

393 init : `bool`, optional 

394 If `True`, return init-input dataset types instead of runtime 

395 (including prerequisite) inputs. 

396 

397 Returns 

398 ------- 

399 inputs : `dict` [ `str`, `DatasetTypeNode` or `None` ] 

400 Dictionary with parent dataset type name keys and either 

401 `DatasetTypeNode` values (if the dataset type has been resolved) 

402 or `None` values. 

403 

404 Notes 

405 ----- 

406 To get the input edges of a task or task init node (which provide 

407 information about storage class overrides and components) use:: 

408 

409 graph.tasks[task_label].iter_all_inputs() 

410 

411 or 

412 

413 graph.tasks[task_label].init.iter_all_inputs() 

414 

415 or the various mapping attributes of the `TaskNode` and `TaskInitNode` 

416 class. 

417 """ 

418 node: TaskNode | TaskInitNode = self.tasks[task_label] if not init else self.tasks[task_label].init 

419 return { 

420 edge.parent_dataset_type_name: self._xgraph.nodes[edge.dataset_type_key]["instance"] 

421 for edge in node.iter_all_inputs() 

422 } 

423 

424 def outputs_of( 

425 self, task_label: str, init: bool = False, include_automatic_connections: bool = True 

426 ) -> dict[str, DatasetTypeNode | None]: 

427 """Return the dataset types that are outputs of a task. 

428 

429 Parameters 

430 ---------- 

431 task_label : `str` 

432 Label for the task in the pipeline. 

433 init : `bool`, optional 

434 If `True`, return init-output dataset types instead of runtime 

435 outputs. 

436 include_automatic_connections : `bool`, optional 

437 Whether to include automatic connections such as configs, metadata, 

438 and logs. 

439 

440 Returns 

441 ------- 

442 outputs : `dict` [ `str`, `DatasetTypeNode` or `None` ] 

443 Dictionary with parent dataset type name keys and either 

444 `DatasetTypeNode` values (if the dataset type has been resolved) 

445 or `None` values. 

446 

447 Notes 

448 ----- 

449 To get the output edges of a task or task init node (which provide 

450 information about storage class overrides and components) use:: 

451 

452 graph.tasks[task_label].iter_all_outputs() 

453 

454 or 

455 

456 graph.tasks[task_label].init.iter_all_outputs() 

457 

458 or the various mapping attributes of the `TaskNode` and `TaskInitNode` 

459 class. 

460 """ 

461 node: TaskNode | TaskInitNode = self.tasks[task_label] if not init else self.tasks[task_label].init 

462 iterable = node.iter_all_outputs() if include_automatic_connections else node.outputs.values() 

463 return { 

464 edge.parent_dataset_type_name: self._xgraph.nodes[edge.dataset_type_key]["instance"] 

465 for edge in iterable 

466 } 

467 

468 def resolve(self, registry: Registry) -> None: 

469 """Resolve all dimensions and dataset types and check them for 

470 consistency. 

471 

472 Resolving a graph also causes it to be sorted. 

473 

474 Parameters 

475 ---------- 

476 registry : `lsst.daf.butler.Registry` 

477 Client for the data repository to resolve against. 

478 

479 Notes 

480 ----- 

481 The `universe` attribute is set to ``registry.dimensions`` and used to 

482 set all `TaskNode.dimensions` attributes. Dataset type nodes are 

483 resolved by first looking for a registry definition, then using the 

484 producing task's definition, then looking for consistency between all 

485 consuming task definitions. 

486 

487 Raises 

488 ------ 

489 ConnectionTypeConsistencyError 

490 Raised if a prerequisite input for one task appears as a different 

491 kind of connection in any other task. 

492 DuplicateOutputError 

493 Raised if multiple tasks have the same dataset type as an output. 

494 IncompatibleDatasetTypeError 

495 Raised if different tasks have different definitions of a dataset 

496 type. Different but compatible storage classes are permitted. 

497 MissingDatasetTypeError 

498 Raised if a dataset type definition is required to exist in the 

499 data repository but none was found. This should only occur for 

500 dataset types that are not produced by a task in the pipeline and 

501 are consumed with different storage classes or as components by 

502 tasks in the pipeline. 
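
Examples

--------

A minimal sketch, assuming a `lsst.daf.butler.Butler` instance

``butler`` is available::

    graph.resolve(butler.registry)

    assert graph.is_fully_resolved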

506 """ 

507 node_key: NodeKey 

508 updates: dict[NodeKey, TaskNode | DatasetTypeNode] = {} 

509 for node_key, node_state in self._xgraph.nodes.items(): 

510 match node_key.node_type: 

511 case NodeType.TASK: 

512 task_node: TaskNode = node_state["instance"] 

513 new_task_node = task_node._resolved(registry.dimensions) 

514 if new_task_node is not task_node: 

515 updates[node_key] = new_task_node 

516 case NodeType.DATASET_TYPE: 

517 dataset_type_node: DatasetTypeNode | None = node_state["instance"] 

518 new_dataset_type_node = DatasetTypeNode._from_edges( 

519 node_key, self._xgraph, registry, previous=dataset_type_node 

520 ) 

521 # Usage of `is` here is intentional; `_from_edges` returns 

522 # `previous=dataset_type_node` if it can determine that it 

523 # doesn't need to change. 

524 if new_dataset_type_node is not dataset_type_node: 

525 updates[node_key] = new_dataset_type_node 

526 try: 

527 for node_key, node_value in updates.items(): 

528 self._xgraph.nodes[node_key]["instance"] = node_value 

529 except Exception as err: # pragma: no cover 

530 # There's no known way to get here, but we want to make it 

531 # clear it's a big problem if we do. 

532 raise PipelineGraphExceptionSafetyError( 

533 "Error during dataset type resolution has left the graph in an inconsistent state." 

534 ) from err 

535 self.sort() 

536 self._universe = registry.dimensions 

537 

538 ########################################################################### 

539 # 

540 # Graph Modification Interface: 

541 # 

542 # - methods to add, remove, and replace tasks; 

543 # 

544 # - methods to add and remove task subsets. 

545 # 

546 # These are all things that are usually done in a Pipeline before making a 

547 # graph at all, but there may be cases where we want to modify the graph 

548 # instead. (These are also the methods used to make a graph from a 

549 # Pipeline, or make a graph from another graph.) 

550 # 

551 ########################################################################### 

552 

553 def add_task( 

554 self, 

555 label: str, 

556 task_class: type[PipelineTask], 

557 config: PipelineTaskConfig, 

558 connections: PipelineTaskConnections | None = None, 

559 ) -> TaskNode: 

560 """Add a new task to the graph. 

561 

562 Parameters 

563 ---------- 

564 label : `str` 

565 Label for the task in the pipeline. 

566 task_class : `type` [ `PipelineTask` ] 

567 Class object for the task. 

568 config : `PipelineTaskConfig` 

569 Configuration for the task. 

570 connections : `PipelineTaskConnections`, optional 

571 Object that describes the dataset types used by the task. If not 

572 provided, one will be constructed from the given configuration. If 

573 provided, it is assumed that ``config`` has already been validated 

574 and frozen. 

575 

576 Returns 

577 ------- 

578 node : `TaskNode` 

579 The new task node added to the graph. 

580 

581 Raises 

582 ------ 

583 ValueError 

584 Raised if configuration validation failed when constructing 

585 ``connections``. 

586 PipelineDataCycleError 

587 Raised if the graph is cyclic after this addition. 

588 RuntimeError 

589 Raised if an unexpected exception (which will be chained) occurred 

590 at a stage that may have left the graph in an inconsistent state. 

591 Other exceptions should leave the graph unchanged. 

592 

593 Notes 

594 ----- 

595 Checks for dataset type consistency and multiple producers do not occur 

596 until `resolve` is called, since the resolution depends on both the 

597 state of the data repository and all contributing tasks. 

598 

599 Adding new tasks removes any existing resolutions of all dataset types 

600 it references and marks the graph as unsorted. It is most efficient 

601 to add all tasks up front and only then resolve and/or sort the graph. 
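
Examples

--------

A minimal sketch; ``ExampleTask`` is a hypothetical `PipelineTask`

subclass::

    config = ExampleTask.ConfigClass()

    node = graph.add_task("example", ExampleTask, config)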

602 """ 

603 task_node = TaskNode._from_imported_data( 

604 key=NodeKey(NodeType.TASK, label), 

605 init_key=NodeKey(NodeType.TASK_INIT, label), 

606 data=_TaskNodeImportedData.configure(label, task_class, config, connections), 

607 universe=self.universe, 

608 ) 

609 self.add_task_nodes([task_node]) 

610 return task_node 

611 

612 def add_task_nodes(self, nodes: Iterable[TaskNode], parent: PipelineGraph | None = None) -> None: 

613 """Add one or more existing task nodes to the graph. 

614 

615 Parameters 

616 ---------- 

617 nodes : `~collections.abc.Iterable` [ `TaskNode` ] 

618 Iterable of task nodes to add. If any tasks have resolved 

619 dimensions, they must have the same dimension universe as the rest 

620 of the graph. 

621 parent : `PipelineGraph`, optional 

622 If provided, another `PipelineGraph` from which these nodes were 

623 obtained. Any dataset type nodes already present in ``parent`` 

624 that are referenced by the given tasks will be used in this graph 

625 if they are not already present, preserving any dataset type 

626 resolutions present in the parent graph. Adding nodes from a 

627 parent graph after the graph has its own nodes (e.g. from 

628 `add_task`) or nodes from a third graph may result in invalid 

629 dataset type resolutions. It is safest to only use this argument 

630 when populating an empty graph for the first time. 

631 

632 Raises 

633 ------ 

634 PipelineDataCycleError 

635 Raised if the graph is cyclic after this addition. 

636 

637 Notes 

638 ----- 

639 Checks for dataset type consistency and multiple producers do not occur 

640 until `resolve` is called, since the resolution depends on both the 

641 state of the data repository and all contributing tasks. 

642 

643 Adding new tasks removes any existing resolutions of all dataset types 

644 it references (unless ``parent is not None``) and marks the graph as 

645 unsorted. It is most efficient to add all tasks up front and only then 

646 resolve and/or sort the graph. 

647 """ 

648 node_data: list[tuple[NodeKey, dict[str, Any]]] = [] 

649 edge_data: list[tuple[NodeKey, NodeKey, str, dict[str, Any]]] = [] 

650 for task_node in nodes: 

651 task_node = task_node._resolved(self._universe) 

652 node_data.append( 

653 (task_node.key, {"instance": task_node, "bipartite": task_node.key.node_type.bipartite}) 

654 ) 

655 node_data.append( 

656 ( 

657 task_node.init.key, 

658 {"instance": task_node.init, "bipartite": task_node.init.key.node_type.bipartite}, 

659 ) 

660 ) 

661 # Convert the edge objects attached to the task node to networkx. 

662 for read_edge in task_node.init.iter_all_inputs(): 

663 self._append_graph_data_from_edge(node_data, edge_data, read_edge, parent=parent) 

664 for write_edge in task_node.init.iter_all_outputs(): 

665 self._append_graph_data_from_edge(node_data, edge_data, write_edge, parent=parent) 

666 for read_edge in task_node.iter_all_inputs(): 

667 self._append_graph_data_from_edge(node_data, edge_data, read_edge, parent=parent) 

668 for write_edge in task_node.iter_all_outputs(): 

669 self._append_graph_data_from_edge(node_data, edge_data, write_edge, parent=parent) 

670 # Add a special edge (with no Edge instance) that connects the 

671 # TaskInitNode to the runtime TaskNode. 

672 edge_data.append((task_node.init.key, task_node.key, Edge.INIT_TO_TASK_NAME, {"instance": None})) 

673 if not node_data and not edge_data: 

674 return 

675 # Checks and preparation complete; time to start the actual 

676 # modification, during which it's hard to provide strong exception 

677 # safety. Start by resetting the sort ordering, if there is one. 

678 self._reset() 

679 try: 

680 self._xgraph.add_nodes_from(node_data) 

681 self._xgraph.add_edges_from(edge_data) 

682 if not networkx.algorithms.dag.is_directed_acyclic_graph(self._xgraph): 

683 cycle = networkx.find_cycle(self._xgraph) 

684 raise PipelineDataCycleError(f"Cycle detected while adding tasks: {cycle}.") 

685 except Exception: 

686 # First try to roll back our changes. 

687 try: 

688 self._xgraph.remove_edges_from(edge_data) 

689 self._xgraph.remove_nodes_from(key for key, _ in node_data) 

690 except Exception as err: # pragma: no cover 

691 # There's no known way to get here, but we want to make it 

692 # clear it's a big problem if we do. 

693 raise PipelineGraphExceptionSafetyError( 

694 "Error while attempting to revert PipelineGraph modification has left the graph in " 

695 "an inconsistent state." 

696 ) from err 

697 # Successfully rolled back; raise the original exception. 

698 raise 

699 

700 def reconfigure_tasks( 

701 self, 

702 *args: tuple[str, PipelineTaskConfig], 

703 check_edges_unchanged: bool = False, 

704 assume_edges_unchanged: bool = False, 

705 **kwargs: PipelineTaskConfig, 

706 ) -> None: 

707 """Update the configuration for one or more tasks. 

708 

709 Parameters 

710 ---------- 

711 *args : `tuple` [ `str`, `.PipelineTaskConfig` ] 

712 Positional arguments are each a 2-tuple of task label and new 

713 config object. Note that the same arguments may also be passed as 

714 ``**kwargs``, which is usually more readable, but task labels in 

715 ``*args`` are not required to be valid Python identifiers. 

716 check_edges_unchanged : `bool`, optional 

717 If `True`, require the edges (connections) of the modified tasks to 

718 remain unchanged after the configuration updates, and verify that 

719 this is the case. 

720 assume_edges_unchanged : `bool`, optional 

721 If `True`, the caller declares that the edges (connections) of the 

722 modified tasks will remain unchanged after the configuration 

723 updates, and that it is unnecessary to check this. 

724 **kwargs : `.PipelineTaskConfig` 

725 New config objects or overrides to apply to copies of the current 

726 config objects, with task labels as the keywords. 

727 

728 Raises 

729 ------ 

730 ValueError 

731 Raised if ``assume_edges_unchanged`` and ``check_edges_unchanged`` 

732 are both `True`, or if the same task appears twice. 

733 EdgesChangedError 

734 Raised if ``check_edges_unchanged=True`` and the edges of a task do 

735 change. 

736 

737 Notes 

738 ----- 

739 If reconfiguring a task causes its edges to change, any dataset type 

740 nodes connected to that task (not just those whose edges have changed!) 

741 will be unresolved. 
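
Examples

--------

A minimal sketch; ``new_config`` is a hypothetical config object for a

task labeled ``example``::

    graph.reconfigure_tasks(example=new_config, check_edges_unchanged=True)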

742 """ 

743 new_configs: dict[str, PipelineTaskConfig] = {} 

744 for task_label, config_update in itertools.chain(args, kwargs.items()): 

745 if new_configs.setdefault(task_label, config_update) is not config_update: 

746 raise ValueError(f"Config for {task_label!r} provided more than once.") 

747 updates = { 

748 task_label: self.tasks[task_label]._reconfigured(config, rebuild=not assume_edges_unchanged) 

749 for task_label, config in new_configs.items() 

750 } 

751 self._replace_task_nodes( 

752 updates, 

753 check_edges_unchanged=check_edges_unchanged, 

754 assume_edges_unchanged=assume_edges_unchanged, 

755 message_header=( 

756 "Unexpected change in edges for task {task_label!r} from original config (A) to " 

757 "new configs (B):" 

758 ), 

759 ) 

760 

761 def remove_tasks( 

762 self, labels: Iterable[str], drop_from_subsets: bool = True 

763 ) -> list[tuple[TaskNode, set[str]]]: 

764 """Remove one or more tasks from the graph. 

765 

766 Parameters 

767 ---------- 

768 labels : `~collections.abc.Iterable` [ `str` ] 

769 Iterable of the labels of the tasks to remove. 

770 drop_from_subsets : `bool`, optional 

771 If `True`, drop each removed task from any subset in which it 

772 currently appears. If `False`, raise `PipelineGraphError` if any 

773 such subsets exist. 

774 

775 Returns 

776 ------- 

777 nodes_and_subsets : `list` [ `tuple` [ `TaskNode`, `set` [ `str` ] ] ] 

778 List of nodes removed and the labels of task subsets that 

779 referenced them. 

780 

781 Raises 

782 ------ 

783 PipelineGraphError 

784 Raised if ``drop_from_subsets`` is `False` and the task is still 

785 part of one or more subsets. 

786 

787 Notes 

788 ----- 

789 Removing a task will cause dataset nodes with no other referencing 

790 tasks to be removed. Any other dataset type nodes referenced by a 

791 removed task will be reset to an "unresolved" state. 
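
Examples

--------

An illustrative call that removes a hypothetical task labeled

``example`` and drops it from any subsets that reference it::

    for task_node, subset_labels in graph.remove_tasks(["example"]):

        print(task_node.label, subset_labels)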

792 """ 

793 task_nodes_and_subsets = [] 

794 dataset_types: set[NodeKey] = set() 

795 nodes_to_remove = set() 

796 for label in labels: 

797 task_node: TaskNode = self._xgraph.nodes[NodeKey(NodeType.TASK, label)]["instance"] 

798 # Find task subsets that reference this task. 

799 referencing_subsets = { 

800 subset_label 

801 for subset_label, task_subset in self.task_subsets.items() 

802 if label in task_subset 

803 } 

804 if not drop_from_subsets and referencing_subsets: 

805 raise PipelineGraphError( 

806 f"Task {label!r} is still referenced by subset(s) {referencing_subsets}." 

807 ) 

808 task_nodes_and_subsets.append((task_node, referencing_subsets)) 

809 # Find dataset types referenced by this task. 

810 dataset_types.update(self._xgraph.predecessors(task_node.key)) 

811 dataset_types.update(self._xgraph.successors(task_node.key)) 

812 dataset_types.update(self._xgraph.predecessors(task_node.init.key)) 

813 dataset_types.update(self._xgraph.successors(task_node.init.key)) 

814 # Since there's an edge between the task and its init node, we'll 

815 # have added those two nodes here, too, and we don't want that. 

816 dataset_types.remove(task_node.init.key) 

817 dataset_types.remove(task_node.key) 

818 # Mark the task node and its init node for removal from the graph. 

819 nodes_to_remove.add(task_node.key) 

820 nodes_to_remove.add(task_node.init.key) 

821 # Process the referenced datasets to see which ones are orphaned and 

822 # need to be removed vs. just unresolved. 

823 nodes_to_unresolve = [] 

824 for dataset_type_key in dataset_types: 

825 related_tasks = set() 

826 related_tasks.update(self._xgraph.predecessors(dataset_type_key)) 

827 related_tasks.update(self._xgraph.successors(dataset_type_key)) 

828 related_tasks.difference_update(nodes_to_remove) 

829 if not related_tasks: 

830 nodes_to_remove.add(dataset_type_key) 

831 else: 

832 nodes_to_unresolve.append(dataset_type_key) 

833 # Checks and preparation complete; time to start the actual 

834 # modification, during which it's hard to provide strong exception 

835 # safety. Start by resetting the sort ordering. 

836 self._reset() 

837 try: 

838 for dataset_type_key in nodes_to_unresolve: 

839 self._xgraph.nodes[dataset_type_key]["instance"] = None 

840 for task_node, referencing_subsets in task_nodes_and_subsets: 

841 for subset_label in referencing_subsets: 

842 self._task_subsets[subset_label].remove(task_node.label) 

843 self._xgraph.remove_nodes_from(nodes_to_remove) 

844 except Exception as err: # pragma: no cover 

845 # There's no known way to get here, but we want to make it 

846 # clear it's a big problem if we do. 

847 raise PipelineGraphExceptionSafetyError( 

848 "Error during task removal has left the graph in an inconsistent state." 

849 ) from err 

850 return task_nodes_and_subsets 

851 

852 def add_task_subset(self, subset_label: str, task_labels: Iterable[str], description: str = "") -> None: 

853 """Add a label for a set of tasks that are already in the pipeline. 

854 

855 Parameters 

856 ---------- 

857 subset_label : `str` 

858 Label for this set of tasks. 

859 task_labels : `~collections.abc.Iterable` [ `str` ] 

860 Labels of the tasks to include in the set. All must already be 

861 included in the graph. 

862 description : `str`, optional 

863 String description to associate with this label. 
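
Examples

--------

An illustrative call, assuming tasks labeled ``a`` and ``b`` are

already present in the graph::

    graph.add_task_subset("step1", ["a", "b"], "First processing step.")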

864 """ 

865 subset = TaskSubset(self._xgraph, subset_label, set(task_labels), description) 

866 self._task_subsets[subset_label] = subset 

867 

868 def remove_task_subset(self, subset_label: str) -> None: 

869 """Remove a labeled set of tasks. 

870 

871 Parameters 

872 ---------- 

873 subset_label : `str` 

874 Label for this set of tasks. 

875 """ 

876 del self._task_subsets[subset_label] 

877 

878 ########################################################################### 

879 # 

880 # NetworkX Export Interface: 

881 # 

882 # - methods to export the PipelineGraph's content (or various subsets 

883 # thereof) as NetworkX objects. 

884 # 

885 # These are particularly useful when writing tools to visualize the graph, 

886 # while providing options for which aspects of the graph (tasks, dataset 

887 # types, or both) to include, since all exported graphs have similar 

888 # attributes regardless of their structure. 

889 # 

890 ########################################################################### 

891 

892 def make_xgraph(self) -> networkx.MultiDiGraph: 

893 """Export a networkx representation of the full pipeline graph, 

894 including both init and runtime edges. 

895 

896 Returns 

897 ------- 

898 xgraph : `networkx.MultiDiGraph` 

899 Directed acyclic graph with parallel edges. 

900 

901 Notes 

902 ----- 

903 The returned graph uses `NodeKey` instances for nodes. Parallel edges 

904 represent the same dataset type appearing in multiple connections for 

905 the same task, and are hence rare. The connection name is used as the 

906 edge key to disambiguate those parallel edges. 

907 

908 Almost all edges connect dataset type nodes to task or task init nodes 

909 or vice versa, but there is also a special edge that connects each task 

910 init node to its runtime node. The existence of these edges makes the 

911 graph not quite bipartite, though its init-only and runtime-only 

912 subgraphs are bipartite. 

913 

914 See `TaskNode`, `TaskInitNode`, `DatasetTypeNode`, `ReadEdge`, and 

915 `WriteEdge` for the descriptive node and edge attributes added. 
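
Examples

--------

A sketch that exports the graph and walks its nodes in topological

order with networkx::

    xgraph = graph.make_xgraph()

    for key in networkx.topological_sort(xgraph):

        print(key.node_type, key.name)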

916 """ 

917 return self._transform_xgraph_state(self._xgraph.copy(), skip_edges=False) 

918 

919 def make_bipartite_xgraph(self, init: bool = False) -> networkx.MultiDiGraph: 

920 """Return a bipartite networkx representation of just the runtime or 

921 init-time pipeline graph. 

922 

923 Parameters 

924 ---------- 

925 init : `bool`, optional 

926 If `True` (`False` is default) return the graph of task 

927 initialization nodes and init input/output dataset types, instead 

928 of the graph of runtime task nodes and regular 

929 input/output/prerequisite dataset types. 

930 

931 Returns 

932 ------- 

933 xgraph : `networkx.MultiDiGraph` 

934 Directed acyclic graph with parallel edges. 

935 

936 Notes 

937 ----- 

938 The returned graph uses `NodeKey` instances for nodes. Parallel edges 

939 represent the same dataset type appearing in multiple connections for 

940 the same task, and are hence rare. The connection name is used as the 

941 edge key to disambiguate those parallel edges. 

942 

943 This graph is bipartite because each dataset type node only has edges 

944 that connect it to a task [init] node, and vice versa. 

945 

946 See `TaskNode`, `TaskInitNode`, `DatasetTypeNode`, `ReadEdge`, and 

947 `WriteEdge` for the descriptive node and edge attributes added. 

948 """ 

949 return self._transform_xgraph_state( 

950 self._make_bipartite_xgraph_internal(init).copy(), skip_edges=False 

951 ) 

952 

953 def make_task_xgraph(self, init: bool = False) -> networkx.DiGraph: 

954 """Return a networkx representation of just the tasks in the pipeline. 

955 

956 Parameters 

957 ---------- 

958 init : `bool`, optional 

959 If `True` (`False` is default) return the graph of task 

960 initialization nodes, instead of the graph of runtime task nodes. 

961 

962 Returns 

963 ------- 

964 xgraph : `networkx.DiGraph` 

965 Directed acyclic graph with no parallel edges. 

966 

967 Notes 

968 ----- 

969 The returned graph uses `NodeKey` instances for nodes. The dataset 

970 types that link these tasks are not represented at all; edges have no 

971 attributes, and there are no parallel edges. 

972 

973 See `TaskNode` and `TaskInitNode` for the descriptive node 

974 attributes added. 

975 """ 

976 bipartite_xgraph = self._make_bipartite_xgraph_internal(init) 

977 task_keys = [ 

978 key 

979 for key, bipartite in bipartite_xgraph.nodes(data="bipartite") 

980 if bipartite == NodeType.TASK.bipartite 

981 ] 

982 return self._transform_xgraph_state( 

983 networkx.algorithms.bipartite.projected_graph(networkx.DiGraph(bipartite_xgraph), task_keys), 

984 skip_edges=True, 

985 ) 

986 

987 def make_dataset_type_xgraph(self, init: bool = False) -> networkx.DiGraph: 

988 """Return a networkx representation of just the dataset types in the 

989 pipeline. 

990 

991 Parameters 

992 ---------- 

993 init : `bool`, optional 

994 If `True` (`False` is default) return the graph of init input and 

995 output dataset types, instead of the graph of runtime (input, 

996 output, prerequisite input) dataset types. 

997 

998 Returns 

999 ------- 

1000 xgraph : `networkx.DiGraph` 

1001 Directed acyclic graph with no parallel edges. 

1002 

1003 Notes 

1004 ----- 

1005 The returned graph uses `NodeKey` instances for nodes. The tasks that 

1006 link these dataset types are not represented at all; edges have no attributes, 

1007 and there are no parallel edges. 

1008 

1009 See `DatasetTypeNode` for the descriptive node attributes added. 

1010 """ 

1011 bipartite_xgraph = self._make_bipartite_xgraph_internal(init) 

1012 dataset_type_keys = [ 

1013 key 

1014 for key, bipartite in bipartite_xgraph.nodes(data="bipartite") 

1015 if bipartite == NodeType.DATASET_TYPE.bipartite 

1016 ] 

1017 return self._transform_xgraph_state( 

1018 networkx.algorithms.bipartite.projected_graph( 

1019 networkx.DiGraph(bipartite_xgraph), dataset_type_keys 

1020 ), 

1021 skip_edges=True, 

1022 ) 

1023 

1024 ########################################################################### 

1025 # 

1026 # Serialization Interface. 

1027 # 

1028 # Serialization of PipelineGraphs is currently experimental and may not be 

1029 # retained in the future. All serialization methods are 

1030 # underscore-prefixed to ensure nobody mistakes them for a stable interface 

1031 (let alone a stable file format). 

1032 # 

1033 ########################################################################### 

1034 

1035 @classmethod 

1036 def _read_stream( 

1037 cls, stream: BinaryIO, import_mode: TaskImportMode = TaskImportMode.REQUIRE_CONSISTENT_EDGES 

1038 ) -> PipelineGraph: 

1039 """Read a serialized `PipelineGraph` from a file-like object. 

1040 

1041 Parameters 

1042 ---------- 

1043 stream : `BinaryIO` 

1044 File-like object opened for binary reading, containing 

1045 gzip-compressed JSON. 

1046 import_mode : `TaskImportMode`, optional 

1047 Whether to import tasks, and how to reconcile any differences 

1048 between the imported task's connections and those that were 

1049 persisted with the graph. Default is to check that they are the 

1050 same. 

1051 

1052 Returns 

1053 ------- 

1054 graph : `PipelineGraph` 

1055 Deserialized pipeline graph. 

1056 

1057 Raises 

1058 ------ 

1059 PipelineGraphReadError 

1060 Raised if the serialized `PipelineGraph` is not self-consistent. 

1061 EdgesChangedError 

1062 Raised if ``import_mode`` is 

1063 `TaskImportMode.REQUIRE_CONSISTENT_EDGES` and the edges of a task 

1064 did change after import and reconfiguration. 

1065 

1066 Notes 

1067 ----- 

1068 `PipelineGraph` serialization is currently experimental and may be 

1069 removed or significantly changed in the future, with no deprecation 

1070 period. 

1071 """ 

1072 from .io import SerializedPipelineGraph 

1073 

1074 with gzip.open(stream, "rb") as uncompressed_stream: 

1075 data = json.load(uncompressed_stream) 

1076 serialized_graph = SerializedPipelineGraph.model_validate(data) 

1077 return serialized_graph.deserialize(import_mode) 

1078 

1079 @classmethod 

1080 def _read_uri( 

1081 cls, 

1082 uri: ResourcePathExpression, 

1083 import_mode: TaskImportMode = TaskImportMode.REQUIRE_CONSISTENT_EDGES, 

1084 ) -> PipelineGraph: 

1085 """Read a serialized `PipelineGraph` from a file at a URI. 

1086 

1087 Parameters 

1088 ---------- 

1089 uri : convertible to `lsst.resources.ResourcePath` 

1090 URI to a gzip-compressed JSON file containing a serialized pipeline 

1091 graph. 

1092 import_mode : `TaskImportMode`, optional 

1093 Whether to import tasks, and how to reconcile any differences 

1094 between the imported task's connections and those that were 

1095 persisted with the graph. Default is to check that they are the 

1096 same. 

1097 

1098 Returns 

1099 ------- 

1100 graph : `PipelineGraph` 

1101 Deserialized pipeline graph. 

1102 

1103 Raises 

1104 ------ 

1105 PipelineGraphReadError 

1106 Raised if the serialized `PipelineGraph` is not self-consistent. 

1107 EdgesChangedError 

1108 Raised if ``import_mode`` is 

1109 `TaskImportMode.REQUIRE_CONSISTENT_EDGES` and the edges of a task 

1110 did change after import and reconfiguration. 

1111 

1112 Notes 

1113 ----- 

1114 `PipelineGraph` serialization is currently experimental and may be 

1115 removed or significantly changed in the future, with no deprecation 

1116 period. 

1117 """ 

1118 uri = ResourcePath(uri) 

1119 with uri.open("rb") as stream: 

1120 return cls._read_stream(cast(BinaryIO, stream), import_mode=import_mode) 

1121 

1122 def _write_stream(self, stream: BinaryIO) -> None: 

1123 """Write the pipeline to a file-like object. 

1124 

1125 Parameters 

1126 ---------- 

1127 stream : `BinaryIO` 

1128 File-like object opened for binary writing. 

1129 

1130 Notes 

1131 ----- 

1132 `PipelineGraph` serialization is currently experimental and may be 

1133 removed or significantly changed in the future, with no deprecation 

1134 period. 

1135 

1136 The file format is gzipped JSON, and is intended to be human-readable, 

1137 but it should not be considered a stable public interface for outside 

1138 code, which should always use `PipelineGraph` methods (or at least the 

1139 `io.SerializedPipelineGraph` class) to read these files. 

1140 """ 

1141 from .io import SerializedPipelineGraph 

1142 

1143 with gzip.open(stream, mode="wb") as compressed_stream: 

1144 compressed_stream.write( 

1145 SerializedPipelineGraph.serialize(self).model_dump_json(exclude_defaults=True).encode("utf-8") 

1146 ) 

1147 

1148 def _write_uri(self, uri: ResourcePathExpression) -> None: 

1149 """Write the pipeline to a file given a URI. 

1150 

1151 Parameters 

1152 ---------- 

1153 uri : convertible to `lsst.resources.ResourcePath` 

1154 URI to write to. May have ``.json.gz`` or no extension (which 

1155 will cause a ``.json.gz`` extension to be added). 

1156 

1157 Notes 

1158 ----- 

1159 `PipelineGraph` serialization is currently experimental and may be 

1160 removed or significantly changed in the future, with no deprecation 

1161 period. 

1162 

1163 The file format is gzipped JSON, and is intended to be human-readable, 

1164 but it should not be considered a stable public interface for outside 

1165 code, which should always use `PipelineGraph` methods (or at least the 

1166 `io.SerializedPipelineGraph` class) to read these files. 
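
Examples

--------

A sketch of a write/read round trip (the path is illustrative)::

    graph._write_uri("/tmp/example_pipeline.json.gz")

    roundtripped = PipelineGraph._read_uri("/tmp/example_pipeline.json.gz")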

1167 """ 

1168 uri = ResourcePath(uri) 

1169 extension = uri.getExtension() 

1170 if not extension: 

1171 uri = uri.updatedExtension(".json.gz") 

1172 elif extension != ".json.gz": 

1173 raise ValueError("Expanded pipeline files should always have a .json.gz extension.") 

1174 with uri.open(mode="wb") as stream: 

1175 self._write_stream(cast(BinaryIO, stream)) 

1176 

1177 def _import_and_configure( 

1178 self, import_mode: TaskImportMode = TaskImportMode.REQUIRE_CONSISTENT_EDGES 

1179 ) -> None: 

1180 """Import the `PipelineTask` classes referenced by all task nodes and 

1181 update those nodes accordingly. 

1182 

1183 Parameters 

1184 ---------- 

1185 import_mode : `TaskImportMode`, optional 

1186 Whether to import tasks, and how to reconcile any differences 

1187 between the imported task's connections and those that were 

1188 persisted with the graph. Default is to check that they are the 

1189 same. This method does nothing if this is 

1190 `TaskImportMode.DO_NOT_IMPORT`. 

1191 

1192 Raises 

1193 ------ 

1194 EdgesChangedError 

1195 Raised if ``import_mode`` is 

1196 `TaskImportMode.REQUIRE_CONSISTENT_EDGES` and the edges of a task 

1197 did change after import and reconfiguration. 

1198 

1199 Notes 

1200 ----- 

1201 This method shouldn't need to be called unless the graph was 

1202 deserialized without importing and configuring immediately, which is 

1203 not the default behavior (but it can greatly speed up deserialization). 

1204 If all tasks have already been imported this does nothing. 

1205 

1206 Importing and configuring a task can change its 

1207 `~TaskNode.task_class_name` or `~TaskNode.get_config_str` output, 

1208 usually because the software used to read a serialized graph is newer 

1209 than the software used to write it (e.g. a new config option has been 

1210 added, or the task was moved to a new module with a forwarding alias 

1211 left behind). These changes are allowed by 

1212 `TaskImportMode.REQUIRE_CONSISTENT_EDGES`. 

1213 

1214 If importing and configuring a task causes its edges to change, any 

1215 dataset type nodes linked to those edges will be reset to the 

1216 unresolved state. 

1217 """ 

1218 if import_mode is TaskImportMode.DO_NOT_IMPORT: 

1219 return 

1220 rebuild = ( 

1221 import_mode is TaskImportMode.REQUIRE_CONSISTENT_EDGES 

1222 or import_mode is TaskImportMode.OVERRIDE_EDGES 

1223 ) 

1224 updates: dict[str, TaskNode] = {} 

1225 node_key: NodeKey 

1226 for node_key, node_state in self._xgraph.nodes.items(): 

1227 if node_key.node_type is NodeType.TASK: 

1228 task_node: TaskNode = node_state["instance"] 

1229 new_task_node = task_node._imported_and_configured(rebuild) 

1230 if new_task_node is not task_node: 

1231 updates[task_node.label] = new_task_node 

1232 self._replace_task_nodes( 

1233 updates, 

1234 check_edges_unchanged=(import_mode is TaskImportMode.REQUIRE_CONSISTENT_EDGES), 

1235 assume_edges_unchanged=(import_mode is TaskImportMode.ASSUME_CONSISTENT_EDGES), 

1236 message_header=( 

1237 "In task with label {task_label!r}, persisted edges (A)" 

1238 "differ from imported and configured edges (B):" 

1239 ), 

1240 ) 

1241 

1242 ########################################################################### 

1243 # 

1244 # Advanced PipelineGraph Inspection Interface: 

1245 # 

1246 # - methods to iterate over all nodes and edges, utilizing NodeKeys; 

1247 # 

1248 # - methods to find overall inputs and group nodes by their dimensions, 

1249 # which are important operations for QuantumGraph generation. 

1250 # 

1251 ########################################################################### 

1252 

1253 def iter_edges(self, init: bool = False) -> Iterator[Edge]: 

1254 """Iterate over edges in the graph. 

1255 

1256 Parameters 

1257 ---------- 

1258 init : `bool`, optional 

1259 If `True` (`False` is default) iterate over the edges between task 

1260 initialization node and init input/output dataset types, instead of 

1261 the runtime task nodes and regular input/output/prerequisite 

1262 dataset types. 

1263 

1264 Returns 

1265 ------- 

1266 edges : `~collections.abc.Iterator` [ `Edge` ] 

1267 A lazy iterator over `Edge` (`WriteEdge` or `ReadEdge`) instances. 

1268 

1269 Notes 

1270 ----- 

1271 This method always returns *either* init edges or runtime edges, never 

1272 both. The full (internal) graph that contains both also includes a 

1273 special edge that connects each task init node to its runtime node; 

1274 that is also never returned by this method, since it is never a part of 

1275 the init-only or runtime-only subgraphs. 
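
Examples

--------

A sketch that prints the task label and parent dataset type name for

each runtime edge::

    for edge in graph.iter_edges():

        print(edge.task_label, edge.parent_dataset_type_name)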

1276 """ 

1277 edge: Edge 

1278 for _, _, edge in self._xgraph.edges(data="instance"): 

1279 if edge is not None and edge.is_init == init: 

1280 yield edge 

1281 

1282 def iter_nodes( 

1283 self, 

1284 ) -> Iterator[ 

1285 tuple[Literal[NodeType.TASK_INIT], str, TaskInitNode] 

1286 | tuple[Literal[NodeType.TASK], str, TaskNode] 

1287 | tuple[Literal[NodeType.DATASET_TYPE], str, DatasetTypeNode | None] 

1288 ]: 

1289 """Iterate over nodes in the graph. 

1290 

1291 Returns 

1292 ------- 

1293 nodes : `~collections.abc.Iterator` [ `tuple` ] 

1294 A lazy iterator over all of the nodes in the graph. Each yielded 

1295 element is a tuple of: 

1296 

1297 - the node type enum value (`NodeType`); 

1298 - the string name for the node (task label or parent dataset type 

1299 name); 

1300 - the node value (`TaskNode`, `TaskInitNode`, `DatasetTypeNode`, 

1301 or `None` for dataset type nodes that have not been resolved). 

1302 """ 

1303 key: NodeKey 

1304 if self._sorted_keys is not None: 

1305 for key in self._sorted_keys: 

1306 yield key.node_type, key.name, self._xgraph.nodes[key]["instance"] # type: ignore 

1307 else: 

1308 for key, node in self._xgraph.nodes(data="instance"): 

1309 yield key.node_type, key.name, node # type: ignore 

1310 

1311 def iter_overall_inputs(self) -> Iterator[tuple[str, DatasetTypeNode | None]]: 

1312 """Iterate over all of the dataset types that are consumed but not 

1313 produced by the graph. 

1314 

1315 Returns 

1316 ------- 

1317 dataset_types : `~collections.abc.Iterator` [ `tuple` ] 

1318 A lazy iterator over the overall-input dataset types (including 

1319 overall init inputs and prerequisites). Each yielded element is a 

1320 tuple of: 

1321 

1322 - the parent dataset type name; 

1323 - the resolved `DatasetTypeNode`, or `None` if the dataset type has 

1324 not been resolved. 
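
Examples

--------

A sketch that collects the names of all overall-input dataset types::

    input_names = [name for name, _ in graph.iter_overall_inputs()]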

1325 """ 

1326 for generation in networkx.algorithms.dag.topological_generations(self._xgraph): 

1327 key: NodeKey 

1328 for key in generation: 

1329 # While we expect all tasks to have at least one input and 

1330 # hence never appear in the first topological generation, that 

1331 # is not true of task init nodes. 

1332 if key.node_type is NodeType.DATASET_TYPE: 

1333 yield key.name, self._xgraph.nodes[key]["instance"] 

1334 return 

1335 

1336 def group_by_dimensions( 

1337 self, prerequisites: bool = False 

1338 ) -> dict[DimensionGroup, tuple[dict[str, TaskNode], dict[str, DatasetTypeNode]]]: 

1339 """Group this graph's tasks and dataset types by their dimensions. 

1340 

1341 Parameters 

1342 ---------- 

1343 prerequisites : `bool`, optional 

1344 If `True`, include prerequisite dataset types as well as regular 

1345 input and output datasets (including intermediates). 

1346 

1347 Returns 

1348 ------- 

1349 groups : `dict` [ `DimensionGroup`, `tuple` ] 

1350 A dictionary of groups keyed by `DimensionGroup`, in which each 

1351 value is a tuple of: 

1352 

1353 - a `dict` of `TaskNode` instances, keyed by task label 

1354 - a `dict` of `DatasetTypeNode` instances, keyed by 

1355 dataset type name. 

1356 

1357 that have those dimensions. 

1358 

1359 Notes 

1360 ----- 

1361 Init inputs and outputs are always included, but always have empty 

1362 dimensions and hence are all grouped together. 
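
Examples

--------

A sketch that reports how many tasks and dataset types share each

dimension group (the graph must be fully resolved)::

    for dimensions, (tasks, dataset_types) in graph.group_by_dimensions().items():

        print(dimensions, len(tasks), len(dataset_types))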

1363 """ 

1364 result: dict[DimensionGroup, tuple[dict[str, TaskNode], dict[str, DatasetTypeNode]]] = {} 

1365 next_new_value: tuple[dict[str, TaskNode], dict[str, DatasetTypeNode]] = ({}, {}) 

1366 for task_label, task_node in self.tasks.items(): 

1367 if task_node.dimensions is None: 

1368 raise UnresolvedGraphError(f"Task with label {task_label!r} has not been resolved.") 

1369 if (group := result.setdefault(task_node.dimensions, next_new_value)) is next_new_value: 

1370 next_new_value = ({}, {}) # make new lists for next time 

1371 group[0][task_node.label] = task_node 

1372 for dataset_type_name, dataset_type_node in self.dataset_types.items(): 

1373 if dataset_type_node is None: 

1374 raise UnresolvedGraphError(f"Dataset type {dataset_type_name!r} has not been resolved.") 

1375 if not dataset_type_node.is_prerequisite or prerequisites: 

1376 if ( 

1377 group := result.setdefault( 

1378 dataset_type_node.dataset_type.dimensions.as_group(), next_new_value 

1379 ) 

1380 ) is next_new_value: 

1381 next_new_value = ({}, {}) # make new lists for next time 

1382 group[1][dataset_type_node.name] = dataset_type_node 

1383 return result 

1384 

1385 def split_independent(self) -> Iterable[PipelineGraph]: 

1386 """Iterate over independent subgraphs that together comprise this 

1387 pipeline graph. 

1388 

1389 Returns 

1390 ------- 

1391 subgraphs : `Iterable` [ `PipelineGraph` ] 

1392 An iterable over component subgraphs that could be run 

1393 independently (they have only overall inputs in common). May be a 

1394 lazy iterator. 

1395 

1396 Notes 

1397 ----- 

1398 All resolved dataset type nodes will be preserved. 

1399 

1400 If there is only one component, ``self`` may be returned as the only 

1401 element in the iterable. 

1402 

1403 If `has_been_sorted`, all subgraphs will be sorted as well. 
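
Examples

--------

A sketch that lists the task labels of each independently-runnable

component::

    for subgraph in graph.split_independent():

        print(list(subgraph.tasks))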

1404 """ 

1405 # Having an overall input in common isn't enough to make subgraphs 

1406 # dependent on each other, so we want to look for connected component 

1407 # subgraphs of the task-only projected graph. 

1408 bipartite_xgraph = self._make_bipartite_xgraph_internal(init=False) 

1409 task_keys = { 

1410 key 

1411 for key, bipartite in bipartite_xgraph.nodes(data="bipartite") 

1412 if bipartite == NodeType.TASK.bipartite 

1413 } 

1414 task_xgraph = networkx.algorithms.bipartite.projected_graph( 

1415 networkx.DiGraph(bipartite_xgraph), task_keys 

1416 ) 

1417 # "Weakly" connected means connected in only one direction, which is 

1418 # the only kind of "connected" a DAG can ever be. 

1419 for component_task_keys in networkx.algorithms.weakly_connected_components(task_xgraph): 

1420 if component_task_keys == task_keys: 

1421 yield self 

1422 return 

1423 else: 

1424 component_subgraph = PipelineGraph(universe=self._universe) 

1425 component_subgraph.add_task_nodes( 

1426 [self._xgraph.nodes[key]["instance"] for key in component_task_keys], parent=self 

1427 ) 

1428 if self.has_been_sorted: 

1429 component_subgraph.sort() 

1430 yield component_subgraph 

1431 

1432 ########################################################################### 

1433 # 

1434 # Class- and Package-Private Methods. 

1435 # 

1436 ########################################################################### 

1437 

1438 def _iter_task_defs(self) -> Iterator[TaskDef]: 

1439 """Iterate over this pipeline as a sequence of `TaskDef` instances. 

1440 

1441 Notes 

1442 ----- 

1443 This is a package-private method intended to aid in the transition to a 

1444 codebase more fully integrated with the `PipelineGraph` class, in which 

1445 both `TaskDef` and `PipelineDatasetTypes` are expected to go away, and 

1446 much of the functionality on the `Pipeline` class will be moved to 

1447 `PipelineGraph` as well. 

1448 

1449 Raises 

1450 ------ 

1451 TaskNotImportedError 

1452 Raised if `TaskNode.is_imported` is `False` for any task. 

1453 """ 

1454 from ..pipeline import TaskDef 

1455 

1456 for node in self._tasks.values(): 

1457 yield TaskDef( 

1458 config=node.config, 

1459 taskClass=node.task_class, 

1460 label=node.label, 

1461 connections=node._get_imported_data().connections, 

1462 ) 

1463 

1464 def _init_from_args( 

1465 self, 

1466 xgraph: networkx.MultiDiGraph | None, 

1467 sorted_keys: Sequence[NodeKey] | None, 

1468 task_subsets: dict[str, TaskSubset] | None, 

1469 description: str, 

1470 universe: DimensionUniverse | None, 

1471 data_id: DataId | None, 

1472 ) -> None: 

1473 """Initialize the graph with possibly-nontrivial arguments. 

1474 

1475 Parameters 

1476 ---------- 

1477 xgraph : `networkx.MultiDiGraph` or `None` 

1478 The backing networkx graph, or `None` to create an empty one. 

1479 This graph has `NodeKey` instances for nodes and the same structure 

1480 as the graph exported by `make_xgraph`, but its nodes and edges 

1481 have a single ``instance`` attribute that holds a `TaskNode`, 

1482 `TaskInitNode`, `DatasetTypeNode` (or `None`), `ReadEdge`, or 

1483 `WriteEdge` instance. 

1484 sorted_keys : `Sequence` [ `NodeKey` ] or `None` 

1485 Topologically sorted sequence of node keys, or `None` if the graph 

1486 is not sorted. 

1487 task_subsets : `dict` [ `str`, `TaskSubset` ] or `None` 

1488 Labeled subsets of tasks. Values must be constructed with 

1489 ``xgraph`` as their parent graph. 

1490 description : `str` 

1491 String description for this pipeline. 

1492 universe : `lsst.daf.butler.DimensionUniverse` or `None` 

1493 Definitions of all dimensions. 

1494 data_id : `lsst.daf.butler.DataCoordinate`, other data ID mapping, or `None`. 

1495 Data ID that represents a constraint on all quanta generated from 

1496 this pipeline. 

1497 

1498 Notes 

1499 ----- 

1500 Only empty `PipelineGraph` instances should be constructed directly by 

1501 users, and that constraint fixes the signature of ``__init__`` itself; 

1502 methods on `PipelineGraph` and its helper classes, however, need to be 

1503 able to create instances with nontrivial state. Those methods can call 

1504 this after calling ``__new__`` manually, skipping ``__init__``. 
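A sketch of that internal pattern (arguments abbreviated)::

    graph = PipelineGraph.__new__(PipelineGraph)
    graph._init_from_args(None, None, None, "", None, None)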

1505 """ 

1506 self._xgraph = xgraph if xgraph is not None else networkx.MultiDiGraph() 

1507 self._sorted_keys: Sequence[NodeKey] | None = None 

1508 self._task_subsets = task_subsets if task_subsets is not None else {} 

1509 self._description = description 

1510 self._tasks = TaskMappingView(self._xgraph) 

1511 self._dataset_types = DatasetTypeMappingView(self._xgraph) 

1512 self._raw_data_id: dict[str, Any] 

1513 if isinstance(data_id, DataCoordinate): 

1514 if universe is None: 

1515 universe = data_id.universe 

1516 else: 

1517 assert universe is data_id.universe, "data_id.universe and given universe differ" 

1518 self._raw_data_id = dict(data_id.required) 

1519 elif data_id is None: 

1520 self._raw_data_id = {} 

1521 else: 

1522 self._raw_data_id = dict(data_id) 

1523 self._universe = universe 

1524 if sorted_keys is not None: 

1525 self._reorder(sorted_keys) 

1526 

1527 def _make_bipartite_xgraph_internal(self, init: bool) -> networkx.MultiDiGraph: 

1528 """Make a bipartite init-only or runtime-only internal subgraph. 

1529 

1530 See `make_bipartite_xgraph` for parameters and return values. 

1531 

1532 Notes 

1533 ----- 

1534 This method returns a view of the `PipelineGraph` object's internal 

1535 backing graph, and hence should only be called in methods that copy the 

1536 result either explicitly or by running a copying algorithm before 

1537 returning it to the user. 
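For example, a caller might make the result safe to hand to user code
with an explicit copy (a sketch)::

    view = self._make_bipartite_xgraph_internal(init=False)
    safe_copy = networkx.MultiDiGraph(view)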

1538 """ 

1539 return self._xgraph.edge_subgraph([edge.key for edge in self.iter_edges(init)]) 

1540 

1541 def _transform_xgraph_state(self, xgraph: _G, skip_edges: bool) -> _G: 

1542 """Transform networkx graph attributes in-place from the internal 

1543 "instance" attributes to the documented exported attributes. 

1544 

1545 Parameters 

1546 ---------- 

1547 xgraph : `networkx.DiGraph` or `networkx.MultiDiGraph` 

1548 Graph whose state should be transformed. 

1549 skip_edges : `bool` 

1550 If `True`, do not transform edge state. 

1551 

1552 Returns 

1553 ------- 

1554 xgraph : `networkx.DiGraph` or `networkx.MultiDiGraph` 

1555 The same object passed in, after modification. 

1556 

1557 Notes 

1558 ----- 

1559 This should be called after making a copy of the internal graph but 

1560 before any projection down to just task or dataset type nodes, since 

1561 it assumes stateful edges. 
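A sketch of the expected call pattern, copying the internal graph
first::

    exported = self._transform_xgraph_state(self._xgraph.copy(), skip_edges=False)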

1562 """ 

1563 state: dict[str, Any] 

1564 for state in xgraph.nodes.values(): 

1565 node_value: TaskInitNode | TaskNode | DatasetTypeNode | None = state.pop("instance") 

1566 if node_value is not None: 

1567 state.update(node_value._to_xgraph_state()) 

1568 else: 

1569 # This is a dataset type node that is not resolved. 

1570 state["bipartite"] = NodeType.DATASET_TYPE.bipartite 

1571 if not skip_edges: 

1572 for _, _, state in xgraph.edges(data=True): 

1573 edge: Edge | None = state.pop("instance", None) 

1574 if edge is not None: 

1575 state.update(edge._to_xgraph_state()) 

1576 return xgraph 

1577 

1578 def _replace_task_nodes( 

1579 self, 

1580 updates: Mapping[str, TaskNode], 

1581 check_edges_unchanged: bool, 

1582 assume_edges_unchanged: bool, 

1583 message_header: str, 

1584 ) -> None: 

1585 """Replace task nodes and update edges and dataset type nodes 

1586 accordingly. 

1587 

1588 Parameters 

1589 ---------- 

1590 updates : `Mapping` [ `str`, `TaskNode` ] 

1591 New task nodes with task label keys. All keys must be task labels 

1592 that are already present in the graph. 

1593 check_edges_unchanged : `bool` 

1594 If `True`, require the edges (connections) of the modified tasks to 

1595 remain unchanged after importing and configuring each task, and 

1596 verify that this is the case. 

1597 assume_edges_unchanged : `bool` 

1598 If `True`, the caller declares that the edges (connections) of the 

1599 modified tasks will remain unchanged after importing and configuring 

1600 each task, and that it is unnecessary to check this. 

1601 message_header : `str` 

1602 Template for `str.format` with a single ``task_label`` placeholder 

1603 to use as the first line in `EdgesChangedError` messages that show 

1604 the differences between new task edges and old task edges. Should 

1605 include the fact that the rest of the message will refer to the old 

1606 task as "A" and the new task as "B", and end with a colon. 

1607 

1608 Raises 

1609 ------ 

1610 ValueError 

1611 Raised if ``assume_edges_unchanged`` and ``check_edges_unchanged`` 

1612 are both `True`, or if a full config is provided for a task after 

1613 another full config or an override has already been provided. 

1614 EdgesChangedError 

1615 Raised if ``check_edges_unchanged=True`` and the edges of a task do 

1616 change. 
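Examples
--------
An illustrative ``message_header`` value (hypothetical)::

    "Cannot reconfigure task {task_label!r} because its edges changed "
    "(the old task is 'A', the new task is 'B'):"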

1617 """ 

1618 deep: dict[str, TaskNode] = {} 

1619 shallow: dict[str, TaskNode] = {} 

1620 if assume_edges_unchanged: 

1621 if check_edges_unchanged: 

1622 raise ValueError("Cannot simultaneously assume and check that edges have not changed.") 

1623 shallow.update(updates) 

1624 else: 

1625 for task_label, new_task_node in updates.items(): 

1626 old_task_node = self.tasks[task_label] 

1627 messages = old_task_node.diff_edges(new_task_node) 

1628 if messages: 

1629 if check_edges_unchanged: 

1630 messages.insert(0, message_header.format(task_label=task_label)) 

1631 raise EdgesChangedError("\n".join(messages)) 

1632 else: 

1633 deep[task_label] = new_task_node 

1634 else: 

1635 shallow[task_label] = new_task_node 

1636 try: 

1637 if deep: 

1638 removed = self.remove_tasks(deep.keys(), drop_from_subsets=True) 

1639 self.add_task_nodes(deep.values()) 

1640 for replaced_task_node, referencing_subsets in removed: 

1641 for subset_label in referencing_subsets: 

1642 self._task_subsets[subset_label].add(replaced_task_node.label) 

1643 for task_node in shallow.values(): 

1644 self._xgraph.nodes[task_node.key]["instance"] = task_node 

1645 self._xgraph.nodes[task_node.init.key]["instance"] = task_node.init 

1646 except PipelineGraphExceptionSafetyError: # pragma: no cover 

1647 raise 

1648 except Exception as err: # pragma: no cover 

1649 # There's no known way to get here, but we want to make it clear 

1650 # it's a big problem if we do. 

1651 raise PipelineGraphExceptionSafetyError( 

1652 "Error while replacing tasks has left the graph in an inconsistent state." 

1653 ) from err 

1654 

1655 def _append_graph_data_from_edge( 

1656 self, 

1657 node_data: list[tuple[NodeKey, dict[str, Any]]], 

1658 edge_data: list[tuple[NodeKey, NodeKey, str, dict[str, Any]]], 

1659 edge: Edge, 

1660 parent: PipelineGraph | None, 

1661 ) -> None: 

1662 """Append networkx state dictionaries for an edge and the corresponding 

1663 dataset type node. 

1664 

1665 Parameters 

1666 ---------- 

1667 node_data : `list` 

1668 List of node keys and state dictionaries. A node is appended if 

1669 one does not already exist for this dataset type. 

1670 edge_data : `list` 

1671 List of node key pairs, connection names, and state dictionaries 

1672 for edges. 

1673 edge : `Edge` 

1674 New edge being processed. 

1675 parent : `PipelineGraph` or `None` 

1676 Another pipeline graph whose dataset type nodes should be used 

1677 when present. 

1678 """ 

1679 new_dataset_type_node = None 

1680 if parent is not None: 

1681 new_dataset_type_node = parent._xgraph.nodes[edge.dataset_type_key].get("instance") 
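# If the node already exists, overwrite its "instance" outright; with no
# parent graph this sets it to None, leaving the dataset type unresolved.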

1682 if (existing_dataset_type_state := self._xgraph.nodes.get(edge.dataset_type_key)) is not None: 

1683 existing_dataset_type_state["instance"] = new_dataset_type_node 

1684 else: 

1685 node_data.append( 

1686 ( 

1687 edge.dataset_type_key, 

1688 { 

1689 "instance": new_dataset_type_node, 

1690 "bipartite": NodeType.DATASET_TYPE.bipartite, 

1691 }, 

1692 ) 

1693 ) 

1694 edge_data.append( 

1695 edge.nodes 

1696 + ( 

1697 edge.connection_name, 

1698 {"instance": edge}, 

1699 ) 

1700 ) 

1701 

1702 def _reorder(self, sorted_keys: Sequence[NodeKey]) -> None: 

1703 """Set the order of all views of this graph from the given sorted 

1704 sequence of task labels and dataset type names. 

1705 """ 

1706 self._sorted_keys = sorted_keys 

1707 self._tasks._reorder(sorted_keys) 

1708 self._dataset_types._reorder(sorted_keys) 

1709 

1710 def _reset(self) -> None: 

1711 """Reset the all views of this graph following a modification that 

1712 might invalidate them. 

1713 """ 

1714 self._sorted_keys = None 

1715 self._tasks._reset() 

1716 self._dataset_types._reset() 

1717 

1718 _xgraph: networkx.MultiDiGraph 

1719 _sorted_keys: Sequence[NodeKey] | None 

1720 _task_subsets: dict[str, TaskSubset] 

1721 _description: str 

1722 _tasks: TaskMappingView 

1723 _dataset_types: DatasetTypeMappingView 

1724 _raw_data_id: dict[str, Any] 

1725 _universe: DimensionUniverse | None