Coverage for python/lsst/pipe/base/pipeline_graph/_pipeline_graph.py: 20%

357 statements  

coverage.py v7.3.0, created at 2023-08-23 10:31 +0000

1# This file is part of pipe_base. 

2# 

3# Developed for the LSST Data Management System. 

4# This product includes software developed by the LSST Project 

5# (http://www.lsst.org). 

6# See the COPYRIGHT file at the top-level directory of this distribution 

7# for details of code ownership. 

8# 

9# This program is free software: you can redistribute it and/or modify 

10# it under the terms of the GNU General Public License as published by 

11# the Free Software Foundation, either version 3 of the License, or 

12# (at your option) any later version. 

13# 

14# This program is distributed in the hope that it will be useful, 

15# but WITHOUT ANY WARRANTY; without even the implied warranty of 

16# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the 

17# GNU General Public License for more details. 

18# 

19# You should have received a copy of the GNU General Public License 

20# along with this program. If not, see <http://www.gnu.org/licenses/>. 

21from __future__ import annotations 

22 

23__all__ = ("PipelineGraph",) 

24 

25import gzip 

26import itertools 

27import json 

28from collections.abc import Iterable, Iterator, Mapping, Sequence 

29from typing import TYPE_CHECKING, Any, BinaryIO, Literal, TypeVar, cast 

30 

31import networkx 

32import networkx.algorithms.bipartite 

33import networkx.algorithms.dag 

34from lsst.daf.butler import DataCoordinate, DataId, DimensionGraph, DimensionUniverse, Registry 

35from lsst.resources import ResourcePath, ResourcePathExpression 

36 

37from ._dataset_types import DatasetTypeNode 

38from ._edges import Edge, ReadEdge, WriteEdge 

39from ._exceptions import ( 

40 DuplicateOutputError, 

41 EdgesChangedError, 

42 PipelineDataCycleError, 

43 PipelineGraphError, 

44 PipelineGraphExceptionSafetyError, 

45 UnresolvedGraphError, 

46) 

47from ._mapping_views import DatasetTypeMappingView, TaskMappingView 

48from ._nodes import NodeKey, NodeType 

49from ._task_subsets import TaskSubset 

50from ._tasks import TaskImportMode, TaskInitNode, TaskNode, _TaskNodeImportedData 

51 

52if TYPE_CHECKING: 

53 from ..config import PipelineTaskConfig 

54 from ..connections import PipelineTaskConnections 

55 from ..pipeline import TaskDef 

56 from ..pipelineTask import PipelineTask 

57 

58 

59_G = TypeVar("_G", bound=networkx.DiGraph | networkx.MultiDiGraph) 

60 

61 

62class PipelineGraph: 

63 """A graph representation of fully-configured pipeline. 

64 

65 `PipelineGraph` instances are typically constructed by calling 

66 `.Pipeline.to_graph`, but in rare cases constructing and then populating an 

67 empty one may be preferable. 

68 

69 Parameters 

70 ---------- 

71 description : `str`, optional 

72 String description for this pipeline. 

73 universe : `lsst.daf.butler.DimensionUniverse`, optional 

74 Definitions for all butler dimensions. If not provided, some 

75 attributes will not be available until `resolve` is called. 

76 data_id : `lsst.daf.butler.DataCoordinate` or other data ID, optional 

77 Data ID that represents a constraint on all quanta generated by this 

78 pipeline. This typically just holds the instrument constraint included 

79 in the pipeline definition, if there was one. 

80 """ 

81 

82 ########################################################################### 

83 # 

84 # Simple Pipeline Graph Inspection Interface: 

85 # 

86 # - for inspecting graph structure, not modifying it (except to sort and 

87 # resolve); 

88 # 

89 # - no NodeKey objects, just string dataset type name and task label keys; 

90 # 

91 # - graph structure is represented as a pair of mappings, with methods to 

92 # find neighbors and edges of nodes. 

93 # 

94 ########################################################################### 

95 

96 def __init__( 

97 self, 

98 *, 

99 description: str = "", 

100 universe: DimensionUniverse | None = None, 

101 data_id: DataId | None = None, 

102 ) -> None: 

103 self._init_from_args( 

104 xgraph=None, 

105 sorted_keys=None, 

106 task_subsets=None, 

107 description=description, 

108 universe=universe, 

109 data_id=data_id, 

110 ) 

111 

112 def __repr__(self) -> str: 

113 return f"{type(self).__name__}({self.description!r}, tasks={self.tasks!s})" 

114 

115 @property 

116 def description(self) -> str: 

117 """String description for this pipeline.""" 

118 return self._description 

119 

120 @description.setter 

121 def description(self, value: str) -> None: 

122 # Docstring in getter. 

123 self._description = value 

124 

125 @property 

126 def universe(self) -> DimensionUniverse | None: 

127 """Definitions for all butler dimensions.""" 

128 return self._universe 

129 

130 @property 

131 def data_id(self) -> DataCoordinate: 

132 """Data ID that represents a constraint on all quanta generated from 

133 this pipeline. 

134 

135 This may not be available unless `universe` is not `None`. 

136 """ 

137 return DataCoordinate.standardize(self._raw_data_id, universe=self.universe) 

138 

139 @property 

140 def tasks(self) -> TaskMappingView: 

141 """A mapping view of the tasks in the graph. 

142 

143 This mapping has `str` task label keys and `TaskNode` values. Iteration 

144 is topologically and deterministically ordered if and only if `sort` 

145 has been called since the last modification to the graph. 

146 """ 

147 return self._tasks 

148 

149 @property 

150 def dataset_types(self) -> DatasetTypeMappingView: 

151 """A mapping view of the dataset types in the graph. 

152 

153 This mapping has `str` parent dataset type name keys, but only provides 

154 access to its `DatasetTypeNode` values if `resolve` has been called 

155 since the last modification involving a task that uses a dataset type. 

156 See `DatasetTypeMappingView` for details. 

157 """ 

158 return self._dataset_types 

159 

160 @property 

161 def task_subsets(self) -> Mapping[str, TaskSubset]: 

162 """A mapping of all labeled subsets of tasks. 

163 

164 Keys are subset labels, values are sets of task labels. See 

165 `TaskSubset` for more information. 

166 

167 Use `add_task_subset` to add a new subset. The subsets themselves may 

168 be modified in-place. 

169 """ 

170 return self._task_subsets 

171 

172 @property 

173 def is_sorted(self) -> bool: 

174 """Whether this graph's tasks and dataset types are topologically 

175 sorted with the exact same deterministic tiebreakers that `sort` would 

176 apply. 

177 

178 This may perform (and then discard) a full sort if `has_been_sorted` is 

179 `False`. If the goal is to obtain a sorted graph, it is better to just 

180 call `sort` without guarding that with an ``if not graph.is_sorted`` 

181 check. 

182 """ 

183 if self._sorted_keys is not None: 

184 return True 

185 return all( 

186 sorted == unsorted 

187 for sorted, unsorted in zip( 

188 networkx.lexicographical_topological_sort(self._xgraph), self._xgraph, strict=True 

189 ) 

190 ) 

191 

192 @property 

193 def has_been_sorted(self) -> bool: 

194 """Whether this graph's tasks and dataset types have been 

195 topologically sorted (with unspecified but deterministic tiebreakers) 

196 since the last modification to the graph. 

197 

198 This may return `False` if the graph *happens* to be sorted but `sort` 

199 was never called, but it is potentially much faster than `is_sorted`, 

200 which may attempt (and then discard) a full sort if `has_been_sorted` 

201 is `False`. 

202 """ 

203 return self._sorted_keys is not None 

204 

205 def sort(self) -> None: 

206 """Sort this graph's nodes topologically with deterministic (but 

207 unspecified) tiebreakers. 

208 

209 This does nothing if the graph is already known to be sorted. 

210 """ 

211 if self._sorted_keys is None: 

212 try: 

213 sorted_keys: Sequence[NodeKey] = list(networkx.lexicographical_topological_sort(self._xgraph)) 

214 except networkx.NetworkXUnfeasible as err: # pragma: no cover 

215 # Shouldn't be possible to get here, because we check for cycles 

216 # when adding tasks, but we guard against it anyway. 

217 cycle = networkx.find_cycle(self._xgraph) 

218 raise PipelineDataCycleError( 

219 f"Cycle detected while attempting to sort graph: {cycle}." 

220 ) from err 

221 self._reorder(sorted_keys) 

222 

223 def copy(self) -> PipelineGraph: 

224 """Return a copy of this graph that copies all mutable state.""" 

225 xgraph = self._xgraph.copy() 

226 result = PipelineGraph.__new__(PipelineGraph) 

227 result._init_from_args( 

228 xgraph, 

229 self._sorted_keys, 

230 task_subsets={ 

231 k: TaskSubset(xgraph, v.label, set(v._members), v.description) 

232 for k, v in self._task_subsets.items() 

233 }, 

234 description=self._description, 

235 universe=self.universe, 

236 data_id=self._raw_data_id, 

237 ) 

238 return result 

239 

240 def __copy__(self) -> PipelineGraph: 

241 # Fully shallow copies are dangerous; we don't want shared mutable 

242 # state to lead to broken class invariants. 

243 return self.copy() 

244 

245 def __deepcopy__(self, memo: dict) -> PipelineGraph: 

246 # Genuine deep copies are unnecessary, since we should only ever care 

247 # that mutable state is copied. 

248 return self.copy() 

249 

250 def producing_edge_of(self, dataset_type_name: str) -> WriteEdge | None: 

251 """Return the `WriteEdge` that links the producing task to the named 

252 dataset type. 

253 

254 Parameters 

255 ---------- 

256 dataset_type_name : `str` 

257 Dataset type name. Must not be a component. 

258 

259 Returns 

260 ------- 

261 edge : `WriteEdge` or `None` 

262 Producing edge or `None` if there isn't one in this graph. 

263 

264 Raises 

265 ------ 

266 DuplicateOutputError 

267 Raised if there are multiple tasks defined to produce this dataset 

268 type. This is only possible if the graph's dataset types are not 

269 resolved. 

270 

271 Notes 

272 ----- 

273 On resolved graphs, it may be slightly more efficient to use:: 

274 

275 graph.dataset_types[dataset_type_name].producing_edge 

276 

277 but this method works on graphs with unresolved dataset types as well. 

278 """ 

279 producer: str | None = None 

280 producing_edge: WriteEdge | None = None 

281 for _, _, producing_edge in self._xgraph.in_edges( 

282 NodeKey(NodeType.DATASET_TYPE, dataset_type_name), data="instance" 

283 ): 

284 assert producing_edge is not None, "Should only be None if we never loop." 

285 if producer is not None: 

286 raise DuplicateOutputError( 

287 f"Dataset type {dataset_type_name!r} is produced by both {producing_edge.task_label!r} " 

288 f"and {producer!r}." 

289 ) 

290 producer = producing_edge.task_label 

291 return producing_edge 


292 def consuming_edges_of(self, dataset_type_name: str) -> list[ReadEdge]: 

293 """Return the `ReadEdge` objects that link the named dataset type to 

294 the tasks that consume it. 

295 

296 Parameters 

297 ---------- 

298 dataset_type_name : `str` 

299 Dataset type name. Must not be a component. 

300 

301 Returns 

302 ------- 

303 edges : `list` [ `ReadEdge` ] 

304 Edges that connect this dataset type to the tasks that consume it. 

305 

306 Notes 

307 ----- 

308 On resolved graphs, it may be slightly more efficient to use:: 

309 

310 graph.dataset_types[dataset_type_name].consuming_edges 

311 

312 but this method works on graphs with unresolved dataset types as well. 

313 """ 

314 return [ 

315 edge 

316 for _, _, edge in self._xgraph.out_edges( 

317 NodeKey(NodeType.DATASET_TYPE, dataset_type_name), data="instance" 

318 ) 

319 ] 

320 

321 def producer_of(self, dataset_type_name: str) -> TaskNode | TaskInitNode | None: 

322 """Return the `TaskNode` or `TaskInitNode` that writes the given 

323 dataset type. 

324 

325 Parameters 

326 ---------- 

327 dataset_type_name : `str` 

328 Dataset type name. Must not be a component. 

329 

330 Returns 

331 ------- 

332 node : `TaskNode`, `TaskInitNode`, or `None` 

333 Producing node or `None` if there isn't one in this graph. 

334 

335 Raises 

336 ------ 

337 DuplicateOutputError 

338 Raised if there are multiple tasks defined to produce this dataset 

339 type. This is only possible if the graph's dataset types are not 

340 resolved. 

341 """ 

342 if (producing_edge := self.producing_edge_of(dataset_type_name)) is not None: 

343 return self._xgraph.nodes[producing_edge.task_key]["instance"] 

344 return None 

345 

346 def consumers_of(self, dataset_type_name: str) -> list[TaskNode | TaskInitNode]: 

347 """Return the `TaskNode` and/or `TaskInitNode` objects that read 

348 the given dataset type. 

349 

350 Parameters 

351 ---------- 

352 dataset_type_name : `str` 

353 Dataset type name. Must not be a component. 

354 

355 Returns 

356 ------- 

357 tasks : `list` [ `TaskNode` or `TaskInitNode` ] 

358 Task nodes that consume this dataset type. 

359 

360 Notes 

361 ----- 

362 On resolved graphs, it may be slightly more efficient to use:: 

363 

364 graph.dataset_types[dataset_type_name].consuming_edges 

365 

366 but this method works on graphs with unresolved dataset types as well. 

367 """ 

368 return [ 

369 self._xgraph.nodes[consuming_edge.task_key]["instance"] 

370 for consuming_edge in self.consuming_edges_of(dataset_type_name) 

371 ] 

372 

373 def inputs_of(self, task_label: str, init: bool = False) -> dict[str, DatasetTypeNode | None]: 

374 """Return the dataset types that are inputs to a task. 

375 

376 Parameters 

377 ---------- 

378 task_label : `str` 

379 Label for the task in the pipeline. 

380 init : `bool`, optional 

381 If `True`, return init-input dataset types instead of runtime 

382 (including prerequisite) inputs. 

383 

384 Returns 

385 ------- 

386 inputs : `dict` [ `str`, `DatasetTypeNode` or `None` ] 

387 Dictionary with parent dataset type name keys and either 

388 `DatasetTypeNode` values (if the dataset type has been resolved) 

389 or `None` values. 

390 

391 Notes 

392 ----- 

393 To get the input edges of a task or task init node (which provide 

394 information about storage class overrides and components) use:: 

395 

396 graph.tasks[task_label].iter_all_inputs() 

397 

398 or 

399 

400 graph.tasks[task_label].init.iter_all_inputs() 

401 

402 or the various mapping attributes of the `TaskNode` and `TaskInitNode` 

403 class. 

404 """ 

405 node: TaskNode | TaskInitNode = self.tasks[task_label] if not init else self.tasks[task_label].init 

406 return { 

407 edge.parent_dataset_type_name: self._xgraph.nodes[edge.dataset_type_key]["instance"] 

408 for edge in node.iter_all_inputs() 

409 } 

410 

411 def outputs_of( 

412 self, task_label: str, init: bool = False, include_automatic_connections: bool = True 

413 ) -> dict[str, DatasetTypeNode | None]: 

414 """Return the dataset types that are outputs of a task. 

415 

416 Parameters 

417 ---------- 

418 task_label : `str` 

419 Label for the task in the pipeline. 

420 init : `bool`, optional 

421 If `True`, return init-output dataset types instead of runtime 

422 outputs. 

423 include_automatic_connections : `bool`, optional 

424 Whether to include automatic connections such as configs, metadata, 

425 and logs. 

426 

427 Returns 

428 ------- 

429 outputs : `dict` [ `str`, `DatasetTypeNode` or `None` ] 

430 Dictionary with parent dataset type name keys and either 

431 `DatasetTypeNode` values (if the dataset type has been resolved) 

432 or `None` values. 

433 

434 Notes 

435 ----- 

436 To get the input edges of a task or task init node (which provide 

437 information about storage class overrides and components) use:: 

438 

439 graph.tasks[task_label].iter_all_outputs() 

440 

441 or 

442 

443 graph.tasks[task_label].init.iter_all_outputs() 

444 

445 or the various mapping attributes of the `TaskNode` and `TaskInitNode` 

446 class. 

447 """ 

448 node: TaskNode | TaskInitNode = self.tasks[task_label] if not init else self.tasks[task_label].init 

449 iterable = node.iter_all_outputs() if include_automatic_connections else node.outputs.values() 

450 return { 

451 edge.parent_dataset_type_name: self._xgraph.nodes[edge.dataset_type_key]["instance"] 

452 for edge in iterable 

453 } 

454 

455 def resolve(self, registry: Registry) -> None: 

456 """Resolve all dimensions and dataset types and check them for 

457 consistency. 

458 

459 Resolving a graph also causes it to be sorted. 

460 

461 Parameters 

462 ---------- 

463 registry : `lsst.daf.butler.Registry` 

464 Client for the data repository to resolve against. 

465 

466 Notes 

467 ----- 

468 The `universe` attribute is set to ``registry.dimensions`` and used to 

469 set all `TaskNode.dimensions` attributes. Dataset type nodes are 

470 resolved by first looking for a registry definition, then using the 

471 producing task's definition, then looking for consistency between all 

472 consuming task definitions. 

473 

474 Raises 

475 ------ 

476 ConnectionTypeConsistencyError 

477 Raised if a prerequisite input for one task appears as a different 

478 kind of connection in any other task. 

479 DuplicateOutputError 

480 Raised if multiple tasks have the same dataset type as an output. 

481 IncompatibleDatasetTypeError 

482 Raised if different tasks have different definitions of a dataset 

483 type. Different but compatible storage classes are permitted. 

484 MissingDatasetTypeError 

485 Raised if a dataset type definition is required to exist in the 

486 data repository but none was found. This should only occur for 

487 dataset types that are not produced by a task in the pipeline and 

488 are consumed with different storage classes or as components by 

489 tasks in the pipeline. 


493 """ 

494 node_key: NodeKey 

495 updates: dict[NodeKey, TaskNode | DatasetTypeNode] = {} 

496 for node_key, node_state in self._xgraph.nodes.items(): 

497 match node_key.node_type: 

498 case NodeType.TASK: 

499 task_node: TaskNode = node_state["instance"] 

500 new_task_node = task_node._resolved(registry.dimensions) 

501 if new_task_node is not task_node: 

502 updates[node_key] = new_task_node 

503 case NodeType.DATASET_TYPE: 

504 dataset_type_node: DatasetTypeNode | None = node_state["instance"] 

505 new_dataset_type_node = DatasetTypeNode._from_edges( 

506 node_key, self._xgraph, registry, previous=dataset_type_node 

507 ) 

508 # Use of ``is`` here is intentional; `_from_edges` returns 

509 # `previous=dataset_type_node` if it can determine that it 

510 # doesn't need to change. 

511 if new_dataset_type_node is not dataset_type_node: 

512 updates[node_key] = new_dataset_type_node 

513 try: 

514 for node_key, node_value in updates.items(): 

515 self._xgraph.nodes[node_key]["instance"] = node_value 

516 except Exception as err: # pragma: no cover 

517 # There's no known way to get here, but we want to make it 

518 # clear it's a big problem if we do. 

519 raise PipelineGraphExceptionSafetyError( 

520 "Error during dataset type resolution has left the graph in an inconsistent state." 

521 ) from err 

522 self.sort() 

523 self._universe = registry.dimensions 

524 

525 ########################################################################### 

526 # 

527 # Graph Modification Interface: 

528 # 

529 # - methods to add, remove, and replace tasks; 

530 # 

531 # - methods to add and remove task subsets. 

532 # 

533 # These are all things that are usually done in a Pipeline before making a 

534 # graph at all, but there may be cases where we want to modify the graph 

535 # instead. (These are also the methods used to make a graph from a 

536 # Pipeline, or make a graph from another graph.) 

537 # 

538 ########################################################################### 

539 

540 def add_task( 

541 self, 

542 label: str, 

543 task_class: type[PipelineTask], 

544 config: PipelineTaskConfig, 

545 connections: PipelineTaskConnections | None = None, 

546 ) -> TaskNode: 

547 """Add a new task to the graph. 

548 

549 Parameters 

550 ---------- 

551 label : `str` 

552 Label for the task in the pipeline. 

553 task_class : `type` [ `PipelineTask` ] 

554 Class object for the task. 

555 config : `PipelineTaskConfig` 

556 Configuration for the task. 

557 connections : `PipelineTaskConnections`, optional 

558 Object that describes the dataset types used by the task. If not 

559 provided, one will be constructed from the given configuration. If 

560 provided, it is assumed that ``config`` has already been validated 

561 and frozen. 

562 

563 Returns 

564 ------- 

565 node : `TaskNode` 

566 The new task node added to the graph. 

567 

568 Raises 

569 ------ 

570 ValueError 

571 Raised if configuration validation failed when constructing 

572 ``connections``. 

573 PipelineDataCycleError 

574 Raised if the graph is cyclic after this addition. 

575 RuntimeError 

576 Raised if an unexpected exception (which will be chained) occurred 

577 at a stage that may have left the graph in an inconsistent state. 

578 Other exceptions should leave the graph unchanged. 

579 

580 Notes 

581 ----- 

582 Checks for dataset type consistency and multiple producers do not occur 

583 until `resolve` is called, since the resolution depends on both the 

584 state of the data repository and all contributing tasks. 

585 

586 Adding new tasks removes any existing resolutions of the dataset types 

587 they reference and marks the graph as unsorted. It is most efficient 

588 to add all tasks up front and only then resolve and/or sort the graph. 

589 """ 

590 task_node = TaskNode._from_imported_data( 

591 key=NodeKey(NodeType.TASK, label), 

592 init_key=NodeKey(NodeType.TASK_INIT, label), 

593 data=_TaskNodeImportedData.configure(label, task_class, config, connections), 

594 universe=self.universe, 

595 ) 

596 self.add_task_nodes([task_node]) 

597 return task_node 

598 

599 def add_task_nodes(self, nodes: Iterable[TaskNode]) -> None: 

600 """Add one or more existing task nodes to the graph. 

601 

602 Parameters 

603 ---------- 

604 nodes : `~collections.abc.Iterable` [ `TaskNode` ] 

605 Iterable of task nodes to add. If any tasks have resolved 

606 dimensions, they must have the same dimension universe as the rest 

607 of the graph. 

608 

609 Raises 

610 ------ 

611 PipelineDataCycleError 

612 Raised if the graph is cyclic after this addition. 

613 

614 Notes 

615 ----- 

616 Checks for dataset type consistency and multiple producers do not occur 

617 until `resolve` is called, since the resolution depends on both the 

618 state of the data repository and all contributing tasks. 

619 

620 Adding new tasks removes any existing resolutions of the dataset types 

621 they reference and marks the graph as unsorted. It is most efficient 

622 to add all tasks up front and only then resolve and/or sort the graph. 

623 """ 

624 node_data: list[tuple[NodeKey, dict[str, Any]]] = [] 

625 edge_data: list[tuple[NodeKey, NodeKey, str, dict[str, Any]]] = [] 

626 for task_node in nodes: 

627 task_node = task_node._resolved(self._universe) 

628 node_data.append( 

629 (task_node.key, {"instance": task_node, "bipartite": task_node.key.node_type.bipartite}) 

630 ) 

631 node_data.append( 

632 ( 

633 task_node.init.key, 

634 {"instance": task_node.init, "bipartite": task_node.init.key.node_type.bipartite}, 

635 ) 

636 ) 

637 # Convert the edge objects attached to the task node to networkx. 

638 for read_edge in task_node.init.iter_all_inputs(): 

639 self._append_graph_data_from_edge(node_data, edge_data, read_edge) 

640 for write_edge in task_node.init.iter_all_outputs(): 

641 self._append_graph_data_from_edge(node_data, edge_data, write_edge) 

642 for read_edge in task_node.iter_all_inputs(): 

643 self._append_graph_data_from_edge(node_data, edge_data, read_edge) 

644 for write_edge in task_node.iter_all_outputs(): 

645 self._append_graph_data_from_edge(node_data, edge_data, write_edge) 

646 # Add a special edge (with no Edge instance) that connects the 

647 # TaskInitNode to the runtime TaskNode. 

648 edge_data.append((task_node.init.key, task_node.key, Edge.INIT_TO_TASK_NAME, {"instance": None})) 

649 if not node_data and not edge_data: 

650 return 

651 # Checks and preparation complete; time to start the actual 

652 # modification, during which it's hard to provide strong exception 

653 # safety. Start by resetting the sort ordering, if there is one. 

654 self._reset() 

655 try: 

656 self._xgraph.add_nodes_from(node_data) 

657 self._xgraph.add_edges_from(edge_data) 

658 if not networkx.algorithms.dag.is_directed_acyclic_graph(self._xgraph): 

659 cycle = networkx.find_cycle(self._xgraph) 

660 raise PipelineDataCycleError(f"Cycle detected while adding tasks: {cycle}.") 

661 except Exception: 

662 # First try to roll back our changes. 

663 try: 

664 self._xgraph.remove_edges_from(edge_data) 

665 self._xgraph.remove_nodes_from(key for key, _ in node_data) 

666 except Exception as err: # pragma: no cover 

667 # There's no known way to get here, but we want to make it 

668 # clear it's a big problem if we do. 

669 raise PipelineGraphExceptionSafetyError( 

670 "Error while attempting to revert PipelineGraph modification has left the graph in " 

671 "an inconsistent state." 

672 ) from err 

673 # Successfully rolled back; raise the original exception. 

674 raise 

675 

676 def reconfigure_tasks( 

677 self, 

678 *args: tuple[str, PipelineTaskConfig], 

679 check_edges_unchanged: bool = False, 

680 assume_edges_unchanged: bool = False, 

681 **kwargs: PipelineTaskConfig, 

682 ) -> None: 

683 """Update the configuration for one or more tasks. 

684 

685 Parameters 

686 ---------- 

687 *args : `tuple` [ `str`, `.PipelineTaskConfig` ] 

688 Positional arguments are each a 2-tuple of task label and new 

689 config object. Note that the same arguments may also be passed as 

690 ``**kwargs``, which is usually more readable, but task labels in 

691 ``*args`` are not required to be valid Python identifiers. 

692 check_edges_unchanged : `bool`, optional 

693 If `True`, require the edges (connections) of the modified tasks to 

694 remain unchanged after the configuration updates, and verify that 

695 this is the case. 

696 assume_edges_unchanged : `bool`, optional 

697 If `True`, the caller declares that the edges (connections) of the 

698 modified tasks will remain unchanged after the configuration 

699 updates, and that it is unnecessary to check this. 

700 **kwargs : `.PipelineTaskConfig` 

701 New config objects or overrides to apply to copies of the current 

702 config objects, with task labels as the keywords. 

703 

704 Raises 

705 ------ 

706 ValueError 

707 Raised if ``assume_edges_unchanged`` and ``check_edges_unchanged`` 

708 are both `True`, or if the same task appears twice. 

709 EdgesChangedError 

710 Raised if ``check_edges_unchanged=True`` and the edges of a task do 

711 change. 

712 

713 Notes 

714 ----- 

715 If reconfiguring a task causes its edges to change, any dataset type 

716 nodes connected to that task (not just those whose edges have changed!) 

717 will be unresolved. 

718 """ 

719 new_configs: dict[str, PipelineTaskConfig] = {} 

720 for task_label, config_update in itertools.chain(args, kwargs.items()): 

721 if new_configs.setdefault(task_label, config_update) is not config_update: 

722 raise ValueError(f"Config for {task_label!r} provided more than once.") 

723 updates = { 

724 task_label: self.tasks[task_label]._reconfigured(config, rebuild=not assume_edges_unchanged) 

725 for task_label, config in new_configs.items() 

726 } 

727 self._replace_task_nodes( 

728 updates, 

729 check_edges_unchanged=check_edges_unchanged, 

730 assume_edges_unchanged=assume_edges_unchanged, 

731 message_header=( 

732 "Unexpected change in edges for task {task_label!r} from original config (A) to " 

733 "new configs (B):" 

734 ), 

735 ) 

736 

737 def remove_tasks( 

738 self, labels: Iterable[str], drop_from_subsets: bool = True 

739 ) -> list[tuple[TaskNode, set[str]]]: 

740 """Remove one or more tasks from the graph. 

741 

742 Parameters 

743 ---------- 

744 labels : `~collections.abc.Iterable` [ `str` ] 

745 Iterable of the labels of the tasks to remove. 

746 drop_from_subsets : `bool`, optional 

747 If `True`, drop each removed task from any subset in which it 

748 currently appears. If `False`, raise `PipelineGraphError` if any 

749 such subsets exist. 

750 

751 Returns 

752 ------- 

753 nodes_and_subsets : `list` [ `tuple` [ `TaskNode`, `set` [ `str` ] ] ] 

754 List of nodes removed and the labels of task subsets that 

755 referenced them. 

756 

757 Raises 

758 ------ 

759 PipelineGraphError 

760 Raised if ``drop_from_subsets`` is `False` and the task is still 

761 part of one or more subsets. 

762 

763 Notes 

764 ----- 

765 Removing a task will cause dataset nodes with no other referencing 

766 tasks to be removed. Any other dataset type nodes referenced by a 

767 removed task will be reset to an "unresolved" state. 

768 """ 

769 task_nodes_and_subsets = [] 

770 dataset_types: set[NodeKey] = set() 

771 nodes_to_remove = set() 

772 for label in labels: 

773 task_node: TaskNode = self._xgraph.nodes[NodeKey(NodeType.TASK, label)]["instance"] 

774 # Find task subsets that reference this task. 

775 referencing_subsets = { 

776 subset_label 

777 for subset_label, task_subset in self.task_subsets.items() 

778 if label in task_subset 

779 } 

780 if not drop_from_subsets and referencing_subsets: 

781 raise PipelineGraphError( 

782 f"Task {label!r} is still referenced by subset(s) {referencing_subsets}." 

783 ) 

784 task_nodes_and_subsets.append((task_node, referencing_subsets)) 

785 # Find dataset types referenced by this task. 

786 dataset_types.update(self._xgraph.predecessors(task_node.key)) 

787 dataset_types.update(self._xgraph.successors(task_node.key)) 

788 dataset_types.update(self._xgraph.predecessors(task_node.init.key)) 

789 dataset_types.update(self._xgraph.successors(task_node.init.key)) 

790 # Since there's an edge between the task and its init node, we'll 

791 # have added those two nodes here, too, and we don't want that. 

792 dataset_types.remove(task_node.init.key) 

793 dataset_types.remove(task_node.key) 

794 # Mark the task node and its init node for removal from the graph. 

795 nodes_to_remove.add(task_node.key) 

796 nodes_to_remove.add(task_node.init.key) 

797 # Process the referenced datasets to see which ones are orphaned and 

798 # need to be removed vs. just unresolved. 

799 nodes_to_unresolve = [] 

800 for dataset_type_key in dataset_types: 

801 related_tasks = set() 

802 related_tasks.update(self._xgraph.predecessors(dataset_type_key)) 

803 related_tasks.update(self._xgraph.successors(dataset_type_key)) 

804 related_tasks.difference_update(nodes_to_remove) 

805 if not related_tasks: 

806 nodes_to_remove.add(dataset_type_key) 

807 else: 

808 nodes_to_unresolve.append(dataset_type_key) 

809 # Checks and preparation complete; time to start the actual 

810 # modification, during which it's hard to provide strong exception 

811 # safety. Start by resetting the sort ordering. 

812 self._reset() 

813 try: 

814 for dataset_type_key in nodes_to_unresolve: 

815 self._xgraph.nodes[dataset_type_key]["instance"] = None 

816 for task_node, referencing_subsets in task_nodes_and_subsets: 

817 for subset_label in referencing_subsets: 

818 self._task_subsets[subset_label].remove(task_node.label) 

819 self._xgraph.remove_nodes_from(nodes_to_remove) 

820 except Exception as err: # pragma: no cover 

821 # There's no known way to get here, but we want to make it 

822 # clear it's a big problem if we do. 

823 raise PipelineGraphExceptionSafetyError( 

824 "Error during task removal has left the graph in an inconsistent state." 

825 ) from err 

826 return task_nodes_and_subsets 

827 

828 def add_task_subset(self, subset_label: str, task_labels: Iterable[str], description: str = "") -> None: 

829 """Add a label for a set of tasks that are already in the pipeline. 

830 

831 Parameters 

832 ---------- 

833 subset_label : `str` 

834 Label for this set of tasks. 

835 task_labels : `~collections.abc.Iterable` [ `str` ] 

836 Labels of the tasks to include in the set. All must already be 

837 included in the graph. 

838 description : `str`, optional 

839 String description to associate with this label. 

840 """ 

841 subset = TaskSubset(self._xgraph, subset_label, set(task_labels), description) 

842 self._task_subsets[subset_label] = subset 

843 

844 def remove_task_subset(self, subset_label: str) -> None: 

845 """Remove a labeled set of tasks.""" 

846 del self._task_subsets[subset_label] 

847 

848 ########################################################################### 

849 # 

850 # NetworkX Export Interface: 

851 # 

852 # - methods to export the PipelineGraph's content (or various subsets 

853 # thereof) as NetworkX objects. 

854 # 

855 # These are particularly useful when writing tools to visualize the graph, 

856 # while providing options for which aspects of the graph (tasks, dataset 

857 # types, or both) to include, since all exported graphs have similar 

858 # attributes regardless of their structure. 

859 # 

860 ########################################################################### 

861 

862 def make_xgraph(self) -> networkx.MultiDiGraph: 

863 """Export a networkx representation of the full pipeline graph, 

864 including both init and runtime edges. 

865 

866 Returns 

867 ------- 

868 xgraph : `networkx.MultiDiGraph` 

869 Directed acyclic graph with parallel edges. 

870 

871 Notes 

872 ----- 

873 The returned graph uses `NodeKey` instances for nodes. Parallel edges 

874 represent the same dataset type appearing in multiple connections for 

875 the same task, and are hence rare. The connection name is used as the 

876 edge key to disambiguate those parallel edges. 

877 

878 Almost all edges connect dataset type nodes to task or task init nodes 

879 or vice versa, but there is also a special edge that connects each task 

880 init node to its runtime node. The existence of these edges makes the 

881 graph not quite bipartite, though its init-only and runtime-only 

882 subgraphs are bipartite. 

883 

884 See `TaskNode`, `TaskInitNode`, `DatasetTypeNode`, `ReadEdge`, and 

885 `WriteEdge` for the descriptive node and edge attributes added. 

886 """ 

887 return self._transform_xgraph_state(self._xgraph.copy(), skip_edges=False) 

888 

889 def make_bipartite_xgraph(self, init: bool = False) -> networkx.MultiDiGraph: 

890 """Return a bipartite networkx representation of just the runtime or 

891 init-time pipeline graph. 

892 

893 Parameters 

894 ---------- 

895 init : `bool`, optional 

896 If `True` (`False` is default) return the graph of task 

897 initialization nodes and init input/output dataset types, instead 

898 of the graph of runtime task nodes and regular 

899 input/output/prerequisite dataset types. 

900 

901 Returns 

902 ------- 

903 xgraph : `networkx.MultiDiGraph` 

904 Directed acyclic graph with parallel edges. 

905 

906 Notes 

907 ----- 

908 The returned graph uses `NodeKey` instances for nodes. Parallel edges 

909 represent the same dataset type appearing in multiple connections for 

910 the same task, and are hence rare. The connection name is used as the 

911 edge key to disambiguate those parallel edges. 

912 

913 This graph is bipartite because each dataset type node only has edges 

914 that connect it to a task [init] node, and vice versa. 

915 

916 See `TaskNode`, `TaskInitNode`, `DatasetTypeNode`, `ReadEdge`, and 

917 `WriteEdge` for the descriptive node and edge attributes added. 

918 """ 

919 return self._transform_xgraph_state( 

920 self._make_bipartite_xgraph_internal(init).copy(), skip_edges=False 

921 ) 

922 

923 def make_task_xgraph(self, init: bool = False) -> networkx.DiGraph: 

924 """Return a networkx representation of just the tasks in the pipeline. 

925 

926 Parameters 

927 ---------- 

928 init : `bool`, optional 

929 If `True` (`False` is default) return the graph of task 

930 initialization nodes, instead of the graph of runtime task nodes. 

931 

932 Returns 

933 ------- 

934 xgraph : `networkx.DiGraph` 

935 Directed acyclic graph with no parallel edges. 

936 

937 Notes 

938 ----- 

939 The returned graph uses `NodeKey` instances for nodes. The dataset 

940 types that link these tasks are not represented at all; edges have no 

941 attributes, and there are no parallel edges. 

942 

943 See `TaskNode` and `TaskInitNode` for the descriptive node 

944 attributes added. 

945 """ 

946 bipartite_xgraph = self._make_bipartite_xgraph_internal(init) 

947 task_keys = [ 

948 key 

949 for key, bipartite in bipartite_xgraph.nodes(data="bipartite") 

950 if bipartite == NodeType.TASK.bipartite 

951 ] 

952 return self._transform_xgraph_state( 

953 networkx.algorithms.bipartite.projected_graph(networkx.DiGraph(bipartite_xgraph), task_keys), 

954 skip_edges=True, 

955 ) 

956 

957 def make_dataset_type_xgraph(self, init: bool = False) -> networkx.DiGraph: 

958 """Return a networkx representation of just the dataset types in the 

959 pipeline. 

960 

961 Parameters 

962 ---------- 

963 init : `bool`, optional 

964 If `True` (`False` is default) return the graph of init input and 

965 output dataset types, instead of the graph of runtime (input, 

966 output, prerequisite input) dataset types. 

967 

968 Returns 

969 ------- 

970 xgraph : `networkx.DiGraph` 

971 Directed acyclic graph with no parallel edges. 

972 

973 Notes 

974 ----- 

975 The returned graph uses `NodeKey` instances for nodes. The tasks that 

976 link these dataset types are not represented at all; edges have no attributes, 

977 and there are no parallel edges. 

978 

979 See `DatasetTypeNode` for the descriptive node attributes added. 

980 """ 

981 bipartite_xgraph = self._make_bipartite_xgraph_internal(init) 

982 dataset_type_keys = [ 

983 key 

984 for key, bipartite in bipartite_xgraph.nodes(data="bipartite") 

985 if bipartite == NodeType.DATASET_TYPE.bipartite 

986 ] 

987 return self._transform_xgraph_state( 

988 networkx.algorithms.bipartite.projected_graph( 

989 networkx.DiGraph(bipartite_xgraph), dataset_type_keys 

990 ), 

991 skip_edges=True, 

992 ) 

993 

994 ########################################################################### 

995 # 

996 # Serialization Interface. 

997 # 

998 # Serialization of PipelineGraphs is currently experimental and may not be 

999 # retained in the future. All serialization methods are 

1000 # underscore-prefixed to ensure nobody mistakes them for a stable interface 

1001 (let alone a stable file format). 

1002 # 

1003 ########################################################################### 

1004 

1005 @classmethod 

1006 def _read_stream( 

1007 cls, stream: BinaryIO, import_mode: TaskImportMode = TaskImportMode.REQUIRE_CONSISTENT_EDGES 

1008 ) -> PipelineGraph: 

1009 """Read a serialized `PipelineGraph` from a file-like object. 

1010 

1011 Parameters 

1012 ---------- 

1013 stream : `BinaryIO` 

1014 File-like object opened for binary reading, containing 

1015 gzip-compressed JSON. 

1016 import_mode : `TaskImportMode`, optional 

1017 Whether to import tasks, and how to reconcile any differences 

1018 between the imported task's connections and those that were 

1019 persisted with the graph. Default is to check that they are the 

1020 same. 

1021 

1022 Returns 

1023 ------- 

1024 graph : `PipelineGraph` 

1025 Deserialized pipeline graph. 

1026 

1027 Raises 

1028 ------ 

1029 PipelineGraphReadError 

1030 Raised if the serialized `PipelineGraph` is not self-consistent. 

1031 EdgesChangedError 

1032 Raised if ``import_mode`` is 

1033 `TaskImportMode.REQUIRE_CONSISTENT_EDGES` and the edges of a task 

1034 did change after import and reconfiguration. 

1035 

1036 Notes 

1037 ----- 

1038 `PipelineGraph` serialization is currently experimental and may be 

1039 removed or significantly changed in the future, with no deprecation 

1040 period. 

1041 """ 

1042 from .io import SerializedPipelineGraph 

1043 

1044 with gzip.open(stream, "rb") as uncompressed_stream: 

1045 data = json.load(uncompressed_stream) 

1046 serialized_graph = SerializedPipelineGraph.parse_obj(data) 

1047 return serialized_graph.deserialize(import_mode) 

1048 

1049 @classmethod 

1050 def _read_uri( 

1051 cls, 

1052 uri: ResourcePathExpression, 

1053 import_mode: TaskImportMode = TaskImportMode.REQUIRE_CONSISTENT_EDGES, 

1054 ) -> PipelineGraph: 

1055 """Read a serialized `PipelineGraph` from a file at a URI. 

1056 

1057 Parameters 

1058 ---------- 

1059 uri : convertible to `lsst.resources.ResourcePath` 

1060 URI to a gzip-compressed JSON file containing a serialized pipeline 

1061 graph. 

1062 import_mode : `TaskImportMode`, optional 

1063 Whether to import tasks, and how to reconcile any differences 

1064 between the imported task's connections and those that were 

1065 persisted with the graph. Default is to check that they are the 

1066 same. 

1067 

1068 Returns 

1069 ------- 

1070 graph : `PipelineGraph` 

1071 Deserialized pipeline graph. 

1072 

1073 Raises 

1074 ------ 

1075 PipelineGraphReadError 

1076 Raised if the serialized `PipelineGraph` is not self-consistent. 

1077 EdgesChangedError 

1078 Raised if ``import_mode`` is 

1079 `TaskImportMode.REQUIRE_CONSISTENT_EDGES` and the edges of a task 

1080 did change after import and reconfiguration. 

1081 

1082 Notes 

1083 ----- 

1084 `PipelineGraph` serialization is currently experimental and may be 

1085 removed or significantly changed in the future, with no deprecation 

1086 period. 

1087 """ 

1088 uri = ResourcePath(uri) 

1089 with uri.open("rb") as stream: 

1090 return cls._read_stream(cast(BinaryIO, stream), import_mode=import_mode) 

1091 

1092 def _write_stream(self, stream: BinaryIO) -> None: 

1093 """Write the pipeline to a file-like object. 

1094 

1095 Parameters 

1096 ---------- 

1097 stream : `BinaryIO` 

1098 File-like object opened for binary writing. 

1099 

1100 Notes 

1101 ----- 

1102 `PipelineGraph` serialization is currently experimental and may be 

1103 removed or significantly changed in the future, with no deprecation 

1104 period. 

1105 

1106 The file format is gzipped JSON, and is intended to be human-readable, 

1107 but it should not be considered a stable public interface for outside 

1108 code, which should always use `PipelineGraph` methods (or at least the 

1109 `io.SerializedPipelineGraph` class) to read these files. 

1110 """ 

1111 from .io import SerializedPipelineGraph 

1112 

1113 with gzip.open(stream, mode="wb") as compressed_stream: 

1114 compressed_stream.write( 

1115 SerializedPipelineGraph.serialize(self).json(exclude_defaults=True).encode("utf-8") 

1116 ) 

1117 

1118 def _write_uri(self, uri: ResourcePathExpression) -> None: 

1119 """Write the pipeline to a file given a URI. 

1120 

1121 Parameters 

1122 ---------- 

1123 uri : convertible to `lsst.resources.ResourcePath` 

1124 URI to write to. May have ``.json.gz`` or no extension (which 

1125 will cause a ``.json.gz`` extension to be added). 

1126 

1127 Notes 

1128 ----- 

1129 `PipelineGraph` serialization is currently experimental and may be 

1130 removed or significantly changed in the future, with no deprecation 

1131 period. 

1132 

1133 The file format is gzipped JSON, and is intended to be human-readable, 

1134 but it should not be considered a stable public interface for outside 

1135 code, which should always use `PipelineGraph` methods (or at least the 

1136 `io.SerializedPipelineGraph` class) to read these files. 

1137 """ 

1138 uri = ResourcePath(uri) 

1139 extension = uri.getExtension() 

1140 if not extension: 

1141 uri = uri.updatedExtension(".json.gz") 

1142 elif extension != ".json.gz": 

1143 raise ValueError("Expanded pipeline files should always have a .json.gz extension.") 

1144 with uri.open(mode="wb") as stream: 

1145 self._write_stream(cast(BinaryIO, stream)) 

1146 

1147 def _import_and_configure( 

1148 self, import_mode: TaskImportMode = TaskImportMode.REQUIRE_CONSISTENT_EDGES 

1149 ) -> None: 

1150 """Import the `PipelineTask` classes referenced by all task nodes and 

1151 update those nodes accordingly. 

1152 

1153 Parameters 

1154 ---------- 

1155 import_mode : `TaskImportMode`, optional 

1156 Whether to import tasks, and how to reconcile any differences 

1157 between the imported task's connections and those that were 

1158 persisted with the graph. Default is to check that they are the 

1159 same. This method does nothing if this is 

1160 `TaskImportMode.DO_NOT_IMPORT`. 

1161 

1162 Raises 

1163 ------ 

1164 EdgesChangedError 

1165 Raised if ``import_mode`` is 

1166 `TaskImportMode.REQUIRE_CONSISTENT_EDGES` and the edges of a task 

1167 did change after import and reconfiguration. 

1168 

1169 Notes 

1170 ----- 

1171 This method shouldn't need to be called unless the graph was 

1172 deserialized without importing and configuring immediately, which is 

1173 not the default behavior (but it can greatly speed up deserialization). 

1174 If all tasks have already been imported this does nothing. 

1175 

1176 Importing and configuring a task can change its 

1177 `~TaskNode.task_class_name` or `~TaskNode.get_config_str` output, 

1178 usually because the software used to read a serialized graph is newer 

1179 than the software used to write it (e.g. a new config option has been 

1180 added, or the task was moved to a new module with a forwarding alias 

1181 left behind). These changes are allowed by 

1182 `TaskImportMode.REQUIRE_CONSISTENT_EDGES`. 

1183 

1184 If importing and configuring a task causes its edges to change, any 

1185 dataset type nodes linked to those edges will be reset to the 

1186 unresolved state. 

1187 """ 

1188 if import_mode is TaskImportMode.DO_NOT_IMPORT: 

1189 return 

1190 rebuild = ( 

1191 import_mode is TaskImportMode.REQUIRE_CONSISTENT_EDGES 

1192 or import_mode is TaskImportMode.OVERRIDE_EDGES 

1193 ) 

1194 updates: dict[str, TaskNode] = {} 

1195 node_key: NodeKey 

1196 for node_key, node_state in self._xgraph.nodes.items(): 

1197 if node_key.node_type is NodeType.TASK: 

1198 task_node: TaskNode = node_state["instance"] 

1199 new_task_node = task_node._imported_and_configured(rebuild) 

1200 if new_task_node is not task_node: 

1201 updates[task_node.label] = new_task_node 

1202 self._replace_task_nodes( 

1203 updates, 

1204 check_edges_unchanged=(import_mode is TaskImportMode.REQUIRE_CONSISTENT_EDGES), 

1205 assume_edges_unchanged=(import_mode is TaskImportMode.ASSUME_CONSISTENT_EDGES), 

1206 message_header=( 

1207 "In task with label {task_label!r}, persisted edges (A)" 

1208 "differ from imported and configured edges (B):" 

1209 ), 

1210 ) 

1211 

1212 ########################################################################### 

1213 # 

1214 # Advanced PipelineGraph Inspection Interface: 

1215 # 

1216 # - methods to iterate over all nodes and edges, utilizing NodeKeys; 

1217 # 

1218 # - methods to find overall inputs and group nodes by their dimensions, 

1219 # which are important operations for QuantumGraph generation. 

1220 # 

1221 ########################################################################### 

1222 

1223 def iter_edges(self, init: bool = False) -> Iterator[Edge]: 

1224 """Iterate over edges in the graph. 

1225 

1226 Parameters 

1227 ---------- 

1228 init : `bool`, optional 

1229 If `True` (`False` is default) iterate over the edges between task 

1230 initialization nodes and init input/output dataset types, instead of 

1231 the runtime task nodes and regular input/output/prerequisite 

1232 dataset types. 

1233 

1234 Returns 

1235 ------- 

1236 edges : `~collections.abc.Iterator` [ `Edge` ] 

1237 A lazy iterator over `Edge` (`WriteEdge` or `ReadEdge`) instances. 

1238 

1239 Notes 

1240 ----- 

1241 This method always returns *either* init edges or runtime edges, never 

1242 both. The full (internal) graph that contains both also includes a 

1243 special edge that connects each task init node to its runtime node; 

1244 that is also never returned by this method, since it is never a part of 

1245 the init-only or runtime-only subgraphs. 

1246 """ 

1247 edge: Edge 

1248 for _, _, edge in self._xgraph.edges(data="instance"): 

1249 if edge is not None and edge.is_init == init: 

1250 yield edge 

1251 

1252 def iter_nodes( 

1253 self, 

1254 ) -> Iterator[ 

1255 tuple[Literal[NodeType.TASK_INIT], str, TaskInitNode] 

1256 | tuple[Literal[NodeType.TASK], str, TaskNode] 

1257 | tuple[Literal[NodeType.DATASET_TYPE], str, DatasetTypeNode | None] 

1258 ]: 

1259 """Iterate over nodes in the graph. 

1260 

1261 Returns 

1262 ------- 

1263 nodes : `~collections.abc.Iterator` [ `tuple` ] 

1264 A lazy iterator over all of the nodes in the graph. Each yielded 

1265 element is a tuple of: 

1266 

1267 - the node type enum value (`NodeType`); 

1268 - the string name for the node (task label or parent dataset type 

1269 name); 

1270 - the node value (`TaskNode`, `TaskInitNode`, `DatasetTypeNode`, 

1271 or `None` for dataset type nodes that have not been resolved). 

1272 """ 

1273 key: NodeKey 

1274 if self._sorted_keys is not None: 

1275 for key in self._sorted_keys: 

1276 yield key.node_type, key.name, self._xgraph.nodes[key]["instance"] # type: ignore 

1277 else: 

1278 for key, node in self._xgraph.nodes(data="instance"): 

1279 yield key.node_type, key.name, node # type: ignore 

1280 

1281 def iter_overall_inputs(self) -> Iterator[tuple[str, DatasetTypeNode | None]]: 

1282 """Iterate over all of the dataset types that are consumed but not 

1283 produced by the graph. 

1284 

1285 Returns 

1286 ------- 

1287 dataset_types : `~collections.abc.Iterator` [ `tuple` ] 

1288 A lazy iterator over the overall-input dataset types (including 

1289 overall init inputs and prerequisites). Each yielded element is a 

1290 tuple of: 

1291 

1292 - the parent dataset type name; 

1293 - the resolved `DatasetTypeNode`, or `None` if the dataset type has 

1294 not been resolved. 

1295 """ 

1296 for generation in networkx.algorithms.dag.topological_generations(self._xgraph): 

1297 key: NodeKey 

1298 for key in generation: 

1299 # While we expect all tasks to have at least one input and 

1300 # hence never appear in the first topological generation, that 

1301 # is not true of task init nodes. 

1302 if key.node_type is NodeType.DATASET_TYPE: 

1303 yield key.name, self._xgraph.nodes[key]["instance"] 

1304 return 

1305 

1306 def group_by_dimensions( 

1307 self, prerequisites: bool = False 

1308 ) -> dict[DimensionGraph, tuple[dict[str, TaskNode], dict[str, DatasetTypeNode]]]: 

1309 """Group this graph's tasks and dataset types by their dimensions. 

1310 

1311 Parameters 

1312 ---------- 

1313 prerequisites : `bool`, optional 

1314 If `True`, include prerequisite dataset types as well as regular 

1315 input and output datasets (including intermediates). 

1316 

1317 Returns 

1318 ------- 

1319 groups : `dict` [ `DimensionGraph`, `tuple` ] 

1320 A dictionary of groups keyed by `DimensionGraph`, in which each 

1321 value is a tuple of: 

1322 

1323 - a `dict` of `TaskNode` instances, keyed by task label 

1324 - a `dict` of `DatasetTypeNode` instances, keyed by 

1325 dataset type name. 

1326 

1327 that have those dimensions. 

1328 

1329 Notes 

1330 ----- 

1331 Init inputs and outputs are always included, but always have empty 

1332 dimensions and are hence all grouped together. 

1333 """ 

1334 result: dict[DimensionGraph, tuple[dict[str, TaskNode], dict[str, DatasetTypeNode]]] = {} 

1335 next_new_value: tuple[dict[str, TaskNode], dict[str, DatasetTypeNode]] = ({}, {}) 

1336 for task_label, task_node in self.tasks.items(): 

1337 if task_node.dimensions is None: 

1338 raise UnresolvedGraphError(f"Task with label {task_label!r} has not been resolved.") 

1339 if (group := result.setdefault(task_node.dimensions, next_new_value)) is next_new_value: 

1340 next_new_value = ({}, {}) # make new dicts for next time 

1341 group[0][task_node.label] = task_node 

1342 for dataset_type_name, dataset_type_node in self.dataset_types.items(): 

1343 if dataset_type_node is None: 

1344 raise UnresolvedGraphError(f"Dataset type {dataset_type_name!r} has not been resolved.") 

1345 if not dataset_type_node.is_prerequisite or prerequisites: 

1346 if ( 

1347 group := result.setdefault(dataset_type_node.dataset_type.dimensions, next_new_value) 

1348 ) is next_new_value: 

1349 next_new_value = ({}, {}) # make new dicts for next time 

1350 group[1][dataset_type_node.name] = dataset_type_node 

1351 return result 

1352 

1353 ########################################################################### 

1354 # 

1355 # Class- and Package-Private Methods. 

1356 # 

1357 ########################################################################### 

1358 

1359 def _iter_task_defs(self) -> Iterator[TaskDef]: 

1360 """Iterate over this pipeline as a sequence of `TaskDef` instances. 

1361 

1362 Notes 

1363 ----- 

1364 This is a package-private method intended to aid in the transition to a 

1365 codebase more fully integrated with the `PipelineGraph` class, in which 

1366 both `TaskDef` and `PipelineDatasetTypes` are expected to go away, and 

1367 much of the functionality on the `Pipeline` class will be moved to 

1368 `PipelineGraph` as well. 

1369 

1370 Raises 

1371 ------ 

1372 TaskNotImportedError 

1373 Raised if `TaskNode.is_imported` is `False` for any task. 

1374 """ 

1375 from ..pipeline import TaskDef 

1376 

1377 for node in self._tasks.values(): 

1378 yield TaskDef( 

1379 config=node.config, 

1380 taskClass=node.task_class, 

1381 label=node.label, 

1382 connections=node._get_imported_data().connections, 

1383 ) 

1384 

1385 def _init_from_args( 

1386 self, 

1387 xgraph: networkx.MultiDiGraph | None, 

1388 sorted_keys: Sequence[NodeKey] | None, 

1389 task_subsets: dict[str, TaskSubset] | None, 

1390 description: str, 

1391 universe: DimensionUniverse | None, 

1392 data_id: DataId | None, 

1393 ) -> None: 

1394 """Initialize the graph with possibly-nontrivial arguments. 

1395 

1396 Parameters 

1397 ---------- 

1398 xgraph : `networkx.MultiDiGraph` or `None` 

1399 The backing networkx graph, or `None` to create an empty one. 

1400 This graph has `NodeKey` instances for nodes and the same structure 

1401 as the graph exported by `make_xgraph`, but its nodes and edges 

1402 have a single ``instance`` attribute that holds a `TaskNode`, 

1403 `TaskInitNode`, `DatasetTypeNode` (or `None`), `ReadEdge`, or 

1404 `WriteEdge` instance. 

1405 sorted_keys : `Sequence` [ `NodeKey` ] or `None` 

1406 Topologically sorted sequence of node keys, or `None` if the graph 

1407 is not sorted. 

1408 task_subsets : `dict` [ `str`, `TaskSubset` ] 

1409 Labeled subsets of tasks. Values must be constructed with 

1410 ``xgraph`` as their parent graph. 

1411 description : `str` 

1412 String description for this pipeline. 

1413 universe : `lsst.daf.butler.DimensionUniverse` or `None` 

1414 Definitions of all dimensions. 

1415 data_id : `lsst.daf.butler.DataCoordinate` or other data ID mapping. 

1416 Data ID that represents a constraint on all quanta generated from 

1417 this pipeline. 

1418 

1419 Notes 

1420 ----- 

1421 Only empty `PipelineGraph` instances should be constructed directly by 

1422 users (which is why ``__init__`` has the signature it does), but methods on 

1423 `PipelineGraph` and its helper classes need to be able to create them 

1424 with state. Those methods can call this after calling ``__new__`` 

1425 manually, skipping ``__init__``. 

1426 """ 

1427 self._xgraph = xgraph if xgraph is not None else networkx.MultiDiGraph() 

1428 self._sorted_keys: Sequence[NodeKey] | None = None 

1429 self._task_subsets = task_subsets if task_subsets is not None else {} 

1430 self._description = description 

1431 self._tasks = TaskMappingView(self._xgraph) 

1432 self._dataset_types = DatasetTypeMappingView(self._xgraph) 

1433 self._raw_data_id: dict[str, Any] 

1434 if isinstance(data_id, DataCoordinate): 

1435 if universe is None: 

1436 universe = data_id.universe 

1437 else: 

1438 assert universe is data_id.universe, "data_id.universe and given universe differ" 

1439 self._raw_data_id = data_id.byName() 

1440 elif data_id is None: 

1441 self._raw_data_id = {} 

1442 else: 

1443 self._raw_data_id = dict(data_id) 

1444 self._universe = universe 

1445 if sorted_keys is not None: 

1446 self._reorder(sorted_keys) 

1447 

1448 def _make_bipartite_xgraph_internal(self, init: bool) -> networkx.MultiDiGraph: 

1449 """Make a bipartite init-only or runtime-only internal subgraph. 

1450 

1451 See `make_bipartite_xgraph` for parameters and return values. 

1452 

1453 Notes 

1454 ----- 

1455 This method returns a view of the `PipelineGraph` object's internal 

1456 backing graph, and hence should only be called in methods that copy the 

1457 result either explicitly or by running a copying algorithm before 

1458 returning it to the user. 
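
For example, a caller exposing the result publicly might copy it 

first (a sketch; ``graph`` is a hypothetical name): 

>>> internal = graph._make_bipartite_xgraph_internal(init=False)  # doctest: +SKIP 

>>> subgraph = internal.copy() 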

1459 """ 

1460 return self._xgraph.edge_subgraph([edge.key for edge in self.iter_edges(init)]) 

1461 

1462 def _transform_xgraph_state(self, xgraph: _G, skip_edges: bool) -> _G: 

1463 """Transform networkx graph attributes in-place from the internal 

1464 "instance" attributes to the documented exported attributes. 

1465 

1466 Parameters 

1467 ---------- 

1468 xgraph : `networkx.DiGraph` or `networkx.MultiDiGraph` 

1469 Graph whose state should be transformed. 

1470 skip_edges : `bool` 

1471 If `True`, do not transform edge state. 

1472 

1473 Returns 

1474 ------- 

1475 xgraph : `networkx.DiGraph` or `networkx.MultiDiGraph` 

1476 The same object passed in, after modification. 

1477 

1478 Notes 

1479 ----- 

1480 This should be called after making a copy of the internal graph but 

1481 before any projection down to just task or dataset type nodes, since 

1482 it assumes stateful edges. 
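
A sketch of that call pattern (``graph`` is a hypothetical name): 

>>> xgraph = graph._xgraph.copy()  # doctest: +SKIP 

>>> xgraph = graph._transform_xgraph_state(xgraph, skip_edges=False) 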

1483 """ 

1484 state: dict[str, Any] 

1485 for state in xgraph.nodes.values(): 

1486 node_value: TaskInitNode | TaskNode | DatasetTypeNode | None = state.pop("instance") 

1487 if node_value is not None: 

1488 state.update(node_value._to_xgraph_state()) 

1489 if not skip_edges: 

1490 for _, _, state in xgraph.edges(data=True): 

1491 edge: Edge | None = state.pop("instance", None) 

1492 if edge is not None: 

1493 state.update(edge._to_xgraph_state()) 

1494 return xgraph 

1495 

1496 def _replace_task_nodes( 

1497 self, 

1498 updates: Mapping[str, TaskNode], 

1499 check_edges_unchanged: bool, 

1500 assume_edges_unchanged: bool, 

1501 message_header: str, 

1502 ) -> None: 

1503 """Replace task nodes and update edges and dataset type nodes 

1504 accordingly. 

1505 

1506 Parameters 

1507 ---------- 

1508 updates : `Mapping` [ `str`, `TaskNode` ] 

1509 New task nodes, keyed by task label. All keys must be task labels 

1510 that are already present in the graph. 

1511 check_edges_unchanged : `bool`, optional 

1512 If `True`, require the edges (connections) of the modified tasks to 

1513 remain unchanged after importing and configuring each task, and 

1514 verify that this is the case. 

1515 assume_edges_unchanged : `bool`, optional 

1516 If `True`, the caller declares that the edges (connections) of the 

1517 modified tasks will remain unchanged after importing and configuring 

1518 each task, and that it is unnecessary to check this. 

1519 message_header : `str` 

1520 Template for `str.format` with a single ``task_label`` placeholder 

1521 to use as the first line in `EdgesChangedError` messages that show 

1522 the differences between new task edges and old task edges. Should 

1523 include the fact that the rest of the message will refer to the old 

1524 task as "A" and the new task as "B", and end with a colon. 

1525 

1526 Raises 

1527 ------ 

1528 ValueError 

1529 Raised if ``assume_edges_unchanged`` and ``check_edges_unchanged`` 

1530 are both `True`, or if a full config is provided for a task after 

1531 another full config or an override has already been provided. 

1532 EdgesChangedError 

1533 Raised if ``check_edges_unchanged=True`` and the edges of a task do 

1534 change. 
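
Examples 

-------- 

A sketch (``new_isr_node`` and the ``"isr"`` label are hypothetical): 

>>> graph._replace_task_nodes(  # doctest: +SKIP 

...     {"isr": new_isr_node}, 

...     check_edges_unchanged=True, 

...     assume_edges_unchanged=False, 

...     message_header="Task {task_label!r} edges differ (A=old, B=new):", 

... ) 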

1535 """ 

1536 deep: dict[str, TaskNode] = {} 

1537 shallow: dict[str, TaskNode] = {} 

1538 if assume_edges_unchanged: 

1539 if check_edges_unchanged: 

1540 raise ValueError("Cannot simultaneously assume and check that edges have not changed.") 

1541 shallow.update(updates) 

1542 else: 

1543 for task_label, new_task_node in updates.items(): 

1544 old_task_node = self.tasks[task_label] 

1545 messages = old_task_node.diff_edges(new_task_node) 

1546 if messages: 

1547 if check_edges_unchanged: 

1548 messages.insert(0, message_header.format(task_label=task_label)) 

1549 raise EdgesChangedError("\n".join(messages)) 

1550 else: 

1551 deep[task_label] = new_task_node 

1552 else: 

1553 shallow[task_label] = new_task_node 

1554 try: 

1555 if deep: 

1556 removed = self.remove_tasks(deep.keys(), drop_from_subsets=True) 

1557 self.add_task_nodes(deep.values()) 

1558 for replaced_task_node, referencing_subsets in removed: 

1559 for subset_label in referencing_subsets: 

1560 self._task_subsets[subset_label].add(replaced_task_node.label) 

1561 for task_node in shallow.values(): 

1562 self._xgraph.nodes[task_node.key]["instance"] = task_node 

1563 self._xgraph.nodes[task_node.init.key]["instance"] = task_node.init 

1564 except PipelineGraphExceptionSafetyError: # pragma: no cover 

1565 raise 

1566 except Exception as err: # pragma: no cover 

1567 # There's no known way to get here, but we want to make it clear 

1568 # it's a big problem if we do. 

1569 raise PipelineGraphExceptionSafetyError( 

1570 "Error while replacing tasks has left the graph in an inconsistent state." 

1571 ) from err 

1572 

1573 def _append_graph_data_from_edge( 

1574 self, 

1575 node_data: list[tuple[NodeKey, dict[str, Any]]], 

1576 edge_data: list[tuple[NodeKey, NodeKey, str, dict[str, Any]]], 

1577 edge: Edge, 

1578 ) -> None: 

1579 """Append networkx state dictionaries for an edge and the corresponding 

1580 dataset type node. 

1581 

1582 Parameters 

1583 ---------- 

1584 node_data : `list` 

1585 List of node keys and state dictionaries. A node is appended if 

1586 one does not already exist for this dataset type. 

1587 edge_data : `list` 

1588 List of node key pairs, connection names, and state dictionaries 

1589 for edges. 

1590 edge : `Edge` 

1591 New edge being processed. 
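
Notes 

----- 

The accumulated entries are shaped for bulk insertion into a 

`networkx.MultiDiGraph`; a sketch of how a caller might consume them: 

>>> xgraph.add_nodes_from(node_data)  # doctest: +SKIP 

>>> xgraph.add_edges_from(edge_data) 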

1592 """ 

1593 if (existing_dataset_type_state := self._xgraph.nodes.get(edge.dataset_type_key)) is not None: 

1594 existing_dataset_type_state["instance"] = None 

1595 else: 

1596 node_data.append( 

1597 ( 

1598 edge.dataset_type_key, 

1599 { 

1600 "instance": None, 

1601 "bipartite": NodeType.DATASET_TYPE.bipartite, 

1602 }, 

1603 ) 

1604 ) 

1605 edge_data.append( 

1606 edge.nodes 

1607 + ( 

1608 edge.connection_name, 

1609 {"instance": edge}, 

1610 ) 

1611 ) 

1612 

1613 def _reorder(self, sorted_keys: Sequence[NodeKey]) -> None: 

1614 """Set the order of all views of this graph from the given sorted 

1615 sequence of task labels and dataset type names. 

1616 """ 

1617 self._sorted_keys = sorted_keys 

1618 self._tasks._reorder(sorted_keys) 

1619 self._dataset_types._reorder(sorted_keys) 

1620 

1621 def _reset(self) -> None: 

1622 """Reset the all views of this graph following a modification that 

1623 might invalidate them. 

1624 """ 

1625 self._sorted_keys = None 

1626 self._tasks._reset() 

1627 self._dataset_types._reset() 

1628 

1629 _xgraph: networkx.MultiDiGraph 

1630 _sorted_keys: Sequence[NodeKey] | None 

1631 _task_subsets: dict[str, TaskSubset] 

1632 _description: str 

1633 _tasks: TaskMappingView 

1634 _dataset_types: DatasetTypeMappingView 

1635 _raw_data_id: dict[str, Any] 

1636 _universe: DimensionUniverse | None