# This file is part of pipe_base.
#
# Developed for the LSST Data Management System.
# This product includes software developed by the LSST Project
# (http://www.lsst.org).
# See the COPYRIGHT file at the top-level directory of this distribution
# for details of code ownership.
#
# This software is dual licensed under the GNU General Public License and also
# under a 3-clause BSD license. Recipients may choose which of these licenses
# to use; please see the files gpl-3.0.txt and/or bsd_license.txt,
# respectively. If you choose the GPL option then the following text applies
# (but note that there is still no warranty even if you opt for BSD instead):
#
# This program is free software: you can redistribute it and/or modify
# it under the terms of the GNU General Public License as published by
# the Free Software Foundation, either version 3 of the License, or
# (at your option) any later version.
#
# This program is distributed in the hope that it will be useful,
# but WITHOUT ANY WARRANTY; without even the implied warranty of
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
# GNU General Public License for more details.
#
# You should have received a copy of the GNU General Public License
# along with this program. If not, see <http://www.gnu.org/licenses/>.
from __future__ import annotations

__all__ = ("PipelineGraph",)

import gzip
import itertools
import json
from collections.abc import Iterable, Iterator, Mapping, Sequence
from typing import TYPE_CHECKING, Any, BinaryIO, Literal, TypeVar, cast

import networkx
import networkx.algorithms.bipartite
import networkx.algorithms.dag
from lsst.daf.butler import DataCoordinate, DataId, DimensionGraph, DimensionUniverse, Registry
from lsst.resources import ResourcePath, ResourcePathExpression

from ._dataset_types import DatasetTypeNode
from ._edges import Edge, ReadEdge, WriteEdge
from ._exceptions import (
    DuplicateOutputError,
    EdgesChangedError,
    PipelineDataCycleError,
    PipelineGraphError,
    PipelineGraphExceptionSafetyError,
    UnresolvedGraphError,
)
from ._mapping_views import DatasetTypeMappingView, TaskMappingView
from ._nodes import NodeKey, NodeType
from ._task_subsets import TaskSubset
from ._tasks import TaskImportMode, TaskInitNode, TaskNode, _TaskNodeImportedData

if TYPE_CHECKING:
    from ..config import PipelineTaskConfig
    from ..connections import PipelineTaskConnections
    from ..pipeline import TaskDef
    from ..pipelineTask import PipelineTask


_G = TypeVar("_G", bound=networkx.DiGraph | networkx.MultiDiGraph)


class PipelineGraph:
    """A graph representation of a fully-configured pipeline.

    `PipelineGraph` instances are typically constructed by calling
    `.Pipeline.to_graph`, but in rare cases constructing and then populating
    an empty one may be preferable.

    Parameters
    ----------
    description : `str`, optional
        String description for this pipeline.
    universe : `lsst.daf.butler.DimensionUniverse`, optional
        Definitions for all butler dimensions. If not provided, some
        attributes will not be available until `resolve` is called.
    data_id : `lsst.daf.butler.DataCoordinate` or other data ID, optional
        Data ID that represents a constraint on all quanta generated by this
        pipeline. This typically just holds the instrument constraint included
        in the pipeline definition, if there was one.
    """

    ###########################################################################
    #
    # Simple Pipeline Graph Inspection Interface:
    #
    # - for inspecting graph structure, not modifying it (except to sort and
    #   resolve);
    #
    # - no NodeKey objects, just string dataset type name and task label keys;
    #
    # - graph structure is represented as a pair of mappings, with methods to
    #   find neighbors and edges of nodes.
    #
    ###########################################################################

    def __init__(
        self,
        *,
        description: str = "",
        universe: DimensionUniverse | None = None,
        data_id: DataId | None = None,
    ) -> None:
        self._init_from_args(
            xgraph=None,
            sorted_keys=None,
            task_subsets=None,
            description=description,
            universe=universe,
            data_id=data_id,
        )

    def __repr__(self) -> str:
        return f"{type(self).__name__}({self.description!r}, tasks={self.tasks!s})"

    @property
    def description(self) -> str:
        """String description for this pipeline."""
        return self._description

    @description.setter
    def description(self, value: str) -> None:
        # Docstring in setter.
        self._description = value

    @property
    def universe(self) -> DimensionUniverse | None:
        """Definitions for all butler dimensions."""
        return self._universe

    @property
    def data_id(self) -> DataCoordinate:
        """Data ID that represents a constraint on all quanta generated from
        this pipeline.

        This is only available when `universe` is not `None`.
        """
        return DataCoordinate.standardize(self._raw_data_id, universe=self.universe)

    @property
    def tasks(self) -> TaskMappingView:
        """A mapping view of the tasks in the graph.

        This mapping has `str` task label keys and `TaskNode` values. Iteration
        is topologically and deterministically ordered if and only if `sort`
        has been called since the last modification to the graph.
        """
        return self._tasks

    @property
    def dataset_types(self) -> DatasetTypeMappingView:
        """A mapping view of the dataset types in the graph.

        This mapping has `str` parent dataset type name keys, but only provides
        access to its `DatasetTypeNode` values if `resolve` has been called
        since the last modification involving a task that uses a dataset type.
        See `DatasetTypeMappingView` for details.
        """
        return self._dataset_types

    @property
    def task_subsets(self) -> Mapping[str, TaskSubset]:
        """A mapping of all labeled subsets of tasks.

        Keys are subset labels, values are sets of task labels. See
        `TaskSubset` for more information.

        Use `add_task_subset` to add a new subset. The subsets themselves may
        be modified in-place.
        """
        return self._task_subsets

    @property
    def is_fully_resolved(self) -> bool:
        """Whether all of this graph's nodes are resolved."""
        return self._universe is not None and all(
            self.dataset_types.is_resolved(k) for k in self.dataset_types
        )

    @property
    def is_sorted(self) -> bool:
        """Whether this graph's tasks and dataset types are topologically
        sorted with the exact same deterministic tiebreakers that `sort` would
        apply.

        This may perform (and then discard) a full sort if `has_been_sorted` is
        `False`. If the goal is to obtain a sorted graph, it is better to just
        call `sort` without guarding that with an ``if not graph.is_sorted``
        check.
        """
        if self._sorted_keys is not None:
            return True
        return all(
            sorted == unsorted
            for sorted, unsorted in zip(
                networkx.lexicographical_topological_sort(self._xgraph), self._xgraph, strict=True
            )
        )

    @property
    def has_been_sorted(self) -> bool:
        """Whether this graph's tasks and dataset types have been
        topologically sorted (with unspecified but deterministic tiebreakers)
        since the last modification to the graph.

        This may return `False` if the graph *happens* to be sorted but `sort`
        was never called, but it is potentially much faster than `is_sorted`,
        which may attempt (and then discard) a full sort if `has_been_sorted`
        is `False`.
        """
        return self._sorted_keys is not None

    def sort(self) -> None:
        """Sort this graph's nodes topologically with deterministic (but
        unspecified) tiebreakers.

        This does nothing if the graph is already known to be sorted.
        """
        if self._sorted_keys is None:
            try:
                sorted_keys: Sequence[NodeKey] = list(networkx.lexicographical_topological_sort(self._xgraph))
            except networkx.NetworkXUnfeasible as err:  # pragma: no cover
                # Shouldn't be possible to get here, because we check for
                # cycles when adding tasks, but we guard against it anyway.
                cycle = networkx.find_cycle(self._xgraph)
                raise PipelineDataCycleError(
                    f"Cycle detected while attempting to sort graph: {cycle}."
                ) from err
            self._reorder(sorted_keys)
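
    # A minimal usage sketch for the sorting API, assuming ``pipeline`` is a
    # `.Pipeline` instance (names are illustrative):
    #
    #     graph = pipeline.to_graph()
    #     graph.sort()
    #     assert graph.has_been_sorted and graph.is_sorted
    #     for label in graph.tasks:  # iteration is now topologically ordered
    #         ...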

    def copy(self) -> PipelineGraph:
        """Return a copy of this graph that copies all mutable state."""
        xgraph = self._xgraph.copy()
        result = PipelineGraph.__new__(PipelineGraph)
        result._init_from_args(
            xgraph,
            self._sorted_keys,
            task_subsets={
                k: TaskSubset(xgraph, v.label, set(v._members), v.description)
                for k, v in self._task_subsets.items()
            },
            description=self._description,
            universe=self.universe,
            data_id=self._raw_data_id,
        )
        return result

    def __copy__(self) -> PipelineGraph:
        # Fully shallow copies are dangerous; we don't want shared mutable
        # state to lead to broken class invariants.
        return self.copy()

    def __deepcopy__(self, memo: dict) -> PipelineGraph:
        # Genuine deep copies are unnecessary, since we should only ever care
        # that mutable state is copied.
        return self.copy()

    def producing_edge_of(self, dataset_type_name: str) -> WriteEdge | None:
        """Return the `WriteEdge` that links the producing task to the named
        dataset type.

        Parameters
        ----------
        dataset_type_name : `str`
            Dataset type name. Must not be a component.

        Returns
        -------
        edge : `WriteEdge` or `None`
            Producing edge or `None` if there isn't one in this graph.

        Raises
        ------
        DuplicateOutputError
            Raised if there are multiple tasks defined to produce this dataset
            type. This is only possible if the graph's dataset types are not
            resolved.

        Notes
        -----
        On resolved graphs, it may be slightly more efficient to use::

            graph.dataset_types[dataset_type_name].producing_edge

        but this method works on graphs with unresolved dataset types as well.
        """
        producer: str | None = None
        producing_edge: WriteEdge | None = None
        for _, _, producing_edge in self._xgraph.in_edges(
            NodeKey(NodeType.DATASET_TYPE, dataset_type_name), data="instance"
        ):
            assert producing_edge is not None, "Should only be None if we never loop."
            if producer is not None:
                raise DuplicateOutputError(
                    f"Dataset type {dataset_type_name!r} is produced by both {producing_edge.task_label!r} "
                    f"and {producer!r}."
                )
            # Remember this producer so a second iteration (i.e. a second
            # producing task) triggers the duplicate check above.
            producer = producing_edge.task_label
        return producing_edge

    def consuming_edges_of(self, dataset_type_name: str) -> list[ReadEdge]:
        """Return the `ReadEdge` objects that link the named dataset type to
        the tasks that consume it.

        Parameters
        ----------
        dataset_type_name : `str`
            Dataset type name. Must not be a component.

        Returns
        -------
        edges : `list` [ `ReadEdge` ]
            Edges that connect this dataset type to the tasks that consume it.

        Notes
        -----
        On resolved graphs, it may be slightly more efficient to use::

            graph.dataset_types[dataset_type_name].consuming_edges

        but this method works on graphs with unresolved dataset types as well.
        """
        return [
            edge
            for _, _, edge in self._xgraph.out_edges(
                NodeKey(NodeType.DATASET_TYPE, dataset_type_name), data="instance"
            )
        ]

    def producer_of(self, dataset_type_name: str) -> TaskNode | TaskInitNode | None:
        """Return the `TaskNode` or `TaskInitNode` that writes the given
        dataset type.

        Parameters
        ----------
        dataset_type_name : `str`
            Dataset type name. Must not be a component.

        Returns
        -------
        node : `TaskNode`, `TaskInitNode`, or `None`
            Producing node or `None` if there isn't one in this graph.

        Raises
        ------
        DuplicateOutputError
            Raised if there are multiple tasks defined to produce this dataset
            type. This is only possible if the graph's dataset types are not
            resolved.
        """
        if (producing_edge := self.producing_edge_of(dataset_type_name)) is not None:
            return self._xgraph.nodes[producing_edge.task_key]["instance"]
        return None

    def consumers_of(self, dataset_type_name: str) -> list[TaskNode | TaskInitNode]:
        """Return the `TaskNode` and/or `TaskInitNode` objects that read
        the given dataset type.

        Parameters
        ----------
        dataset_type_name : `str`
            Dataset type name. Must not be a component.

        Returns
        -------
        nodes : `list` [ `TaskNode` or `TaskInitNode` ]
            Nodes for the tasks that consume this dataset type.

        Notes
        -----
        On resolved graphs, it may be slightly more efficient to use::

            graph.dataset_types[dataset_type_name].consuming_edges

        but this method works on graphs with unresolved dataset types as well.
        """
        return [
            self._xgraph.nodes[consuming_edge.task_key]["instance"]
            for consuming_edge in self.consuming_edges_of(dataset_type_name)
        ]
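
    # A usage sketch for the producer/consumer inspection methods; the dataset
    # type name "calexp" is illustrative:
    #
    #     if (edge := graph.producing_edge_of("calexp")) is not None:
    #         print(f"calexp is written by {edge.task_label}")
    #     for node in graph.consumers_of("calexp"):
    #         print(f"calexp is read by {node.label}")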

    def inputs_of(self, task_label: str, init: bool = False) -> dict[str, DatasetTypeNode | None]:
        """Return the dataset types that are inputs to a task.

        Parameters
        ----------
        task_label : `str`
            Label for the task in the pipeline.
        init : `bool`, optional
            If `True`, return init-input dataset types instead of runtime
            (including prerequisite) inputs.

        Returns
        -------
        inputs : `dict` [ `str`, `DatasetTypeNode` or `None` ]
            Dictionary with parent dataset type name keys and either
            `DatasetTypeNode` values (if the dataset type has been resolved)
            or `None` values.

        Notes
        -----
        To get the input edges of a task or task init node (which provide
        information about storage class overrides and components) use::

            graph.tasks[task_label].iter_all_inputs()

        or

            graph.tasks[task_label].init.iter_all_inputs()

        or the various mapping attributes of the `TaskNode` and `TaskInitNode`
        class.
        """
        node: TaskNode | TaskInitNode = self.tasks[task_label] if not init else self.tasks[task_label].init
        return {
            edge.parent_dataset_type_name: self._xgraph.nodes[edge.dataset_type_key]["instance"]
            for edge in node.iter_all_inputs()
        }

    def outputs_of(
        self, task_label: str, init: bool = False, include_automatic_connections: bool = True
    ) -> dict[str, DatasetTypeNode | None]:
        """Return the dataset types that are outputs of a task.

        Parameters
        ----------
        task_label : `str`
            Label for the task in the pipeline.
        init : `bool`, optional
            If `True`, return init-output dataset types instead of runtime
            outputs.
        include_automatic_connections : `bool`, optional
            Whether to include automatic connections such as configs, metadata,
            and logs.

        Returns
        -------
        outputs : `dict` [ `str`, `DatasetTypeNode` or `None` ]
            Dictionary with parent dataset type name keys and either
            `DatasetTypeNode` values (if the dataset type has been resolved)
            or `None` values.

        Notes
        -----
        To get the output edges of a task or task init node (which provide
        information about storage class overrides and components) use::

            graph.tasks[task_label].iter_all_outputs()

        or

            graph.tasks[task_label].init.iter_all_outputs()

        or the various mapping attributes of the `TaskNode` and `TaskInitNode`
        class.
        """
        node: TaskNode | TaskInitNode = self.tasks[task_label] if not init else self.tasks[task_label].init
        iterable = node.iter_all_outputs() if include_automatic_connections else node.outputs.values()
        return {
            edge.parent_dataset_type_name: self._xgraph.nodes[edge.dataset_type_key]["instance"]
            for edge in iterable
        }
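
    # A usage sketch mapping a task to the dataset types it reads and writes;
    # the task label "isr" is illustrative:
    #
    #     runtime_inputs = graph.inputs_of("isr")  # includes prerequisites
    #     init_outputs = graph.outputs_of("isr", init=True)
    #     print(sorted(runtime_inputs), sorted(init_outputs))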

    def resolve(self, registry: Registry) -> None:
        """Resolve all dimensions and dataset types and check them for
        consistency.

        Resolving a graph also causes it to be sorted.

        Parameters
        ----------
        registry : `lsst.daf.butler.Registry`
            Client for the data repository to resolve against.

        Notes
        -----
        The `universe` attribute is set to ``registry.dimensions`` and used to
        set all `TaskNode.dimensions` attributes. Dataset type nodes are
        resolved by first looking for a registry definition, then using the
        producing task's definition, then looking for consistency between all
        consuming task definitions.

        Raises
        ------
        ConnectionTypeConsistencyError
            Raised if a prerequisite input for one task appears as a different
            kind of connection in any other task.
        DuplicateOutputError
            Raised if multiple tasks have the same dataset type as an output.
        IncompatibleDatasetTypeError
            Raised if different tasks have different definitions of a dataset
            type. Different but compatible storage classes are permitted.
        MissingDatasetTypeError
            Raised if a dataset type definition is required to exist in the
            data repository but none was found. This should only occur for
            dataset types that are not produced by a task in the pipeline and
            are consumed with different storage classes or as components by
            tasks in the pipeline.
        EdgesChangedError
            Raised if ``check_edges_unchanged=True`` and the edges of a task do
            change after import and reconfiguration.
        """
        node_key: NodeKey
        updates: dict[NodeKey, TaskNode | DatasetTypeNode] = {}
        for node_key, node_state in self._xgraph.nodes.items():
            match node_key.node_type:
                case NodeType.TASK:
                    task_node: TaskNode = node_state["instance"]
                    new_task_node = task_node._resolved(registry.dimensions)
                    if new_task_node is not task_node:
                        updates[node_key] = new_task_node
                case NodeType.DATASET_TYPE:
                    dataset_type_node: DatasetTypeNode | None = node_state["instance"]
                    new_dataset_type_node = DatasetTypeNode._from_edges(
                        node_key, self._xgraph, registry, previous=dataset_type_node
                    )
                    # Use of ``is`` here is intentional; `_from_edges` returns
                    # `previous=dataset_type_node` if it can determine that it
                    # doesn't need to change.
                    if new_dataset_type_node is not dataset_type_node:
                        updates[node_key] = new_dataset_type_node
        try:
            for node_key, node_value in updates.items():
                self._xgraph.nodes[node_key]["instance"] = node_value
        except Exception as err:  # pragma: no cover
            # There's no known way to get here, but we want to make it
            # clear it's a big problem if we do.
            raise PipelineGraphExceptionSafetyError(
                "Error during dataset type resolution has left the graph in an inconsistent state."
            ) from err
        self.sort()
        self._universe = registry.dimensions
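
    # A sketch of the typical resolve flow, assuming a butler client for an
    # existing data repository (the repository path is illustrative):
    #
    #     from lsst.daf.butler import Butler
    #
    #     butler = Butler("/path/to/repo")
    #     graph.resolve(butler.registry)
    #     assert graph.is_fully_resolved and graph.has_been_sorted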

    ###########################################################################
    #
    # Graph Modification Interface:
    #
    # - methods to add, remove, and replace tasks;
    #
    # - methods to add and remove task subsets.
    #
    # These are all things that are usually done in a Pipeline before making a
    # graph at all, but there may be cases where we want to modify the graph
    # instead. (These are also the methods used to make a graph from a
    # Pipeline, or make a graph from another graph.)
    #
    ###########################################################################

    def add_task(
        self,
        label: str,
        task_class: type[PipelineTask],
        config: PipelineTaskConfig,
        connections: PipelineTaskConnections | None = None,
    ) -> TaskNode:
        """Add a new task to the graph.

        Parameters
        ----------
        label : `str`
            Label for the task in the pipeline.
        task_class : `type` [ `PipelineTask` ]
            Class object for the task.
        config : `PipelineTaskConfig`
            Configuration for the task.
        connections : `PipelineTaskConnections`, optional
            Object that describes the dataset types used by the task. If not
            provided, one will be constructed from the given configuration. If
            provided, it is assumed that ``config`` has already been validated
            and frozen.

        Returns
        -------
        node : `TaskNode`
            The new task node added to the graph.

        Raises
        ------
        ValueError
            Raised if configuration validation failed when constructing
            ``connections``.
        PipelineDataCycleError
            Raised if the graph is cyclic after this addition.
        RuntimeError
            Raised if an unexpected exception (which will be chained) occurred
            at a stage that may have left the graph in an inconsistent state.
            Other exceptions should leave the graph unchanged.

        Notes
        -----
        Checks for dataset type consistency and multiple producers do not occur
        until `resolve` is called, since the resolution depends on both the
        state of the data repository and all contributing tasks.

        Adding new tasks removes any existing resolutions of all dataset types
        they reference and marks the graph as unsorted. It is most efficient
        to add all tasks up front and only then resolve and/or sort the graph.
        """
        task_node = TaskNode._from_imported_data(
            key=NodeKey(NodeType.TASK, label),
            init_key=NodeKey(NodeType.TASK_INIT, label),
            data=_TaskNodeImportedData.configure(label, task_class, config, connections),
            universe=self.universe,
        )
        self.add_task_nodes([task_node])
        return task_node
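
    # A sketch of populating an empty graph by hand; ``ExampleTask`` is a
    # hypothetical `PipelineTask` subclass (`.Pipeline.to_graph` is the usual
    # entry point instead):
    #
    #     graph = PipelineGraph(description="example")
    #     graph.add_task("example", ExampleTask, ExampleTask.ConfigClass())
    #     graph.sort()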

    def add_task_nodes(self, nodes: Iterable[TaskNode], parent: PipelineGraph | None = None) -> None:
        """Add one or more existing task nodes to the graph.

        Parameters
        ----------
        nodes : `~collections.abc.Iterable` [ `TaskNode` ]
            Iterable of task nodes to add. If any tasks have resolved
            dimensions, they must have the same dimension universe as the rest
            of the graph.
        parent : `PipelineGraph`, optional
            If provided, another `PipelineGraph` from which these nodes were
            obtained. Any dataset type nodes already present in ``parent``
            that are referenced by the given tasks will be used in this graph
            if they are not already present, preserving any dataset type
            resolutions present in the parent graph. Adding nodes from a
            parent graph after the graph has its own nodes (e.g. from
            `add_task`) or nodes from a third graph may result in invalid
            dataset type resolutions. It is safest to only use this argument
            when populating an empty graph for the first time.

        Raises
        ------
        PipelineDataCycleError
            Raised if the graph is cyclic after this addition.

        Notes
        -----
        Checks for dataset type consistency and multiple producers do not occur
        until `resolve` is called, since the resolution depends on both the
        state of the data repository and all contributing tasks.

        Adding new tasks removes any existing resolutions of all dataset types
        they reference (unless ``parent is not None``) and marks the graph as
        unsorted. It is most efficient to add all tasks up front and only then
        resolve and/or sort the graph.
        """
        node_data: list[tuple[NodeKey, dict[str, Any]]] = []
        edge_data: list[tuple[NodeKey, NodeKey, str, dict[str, Any]]] = []
        for task_node in nodes:
            task_node = task_node._resolved(self._universe)
            node_data.append(
                (task_node.key, {"instance": task_node, "bipartite": task_node.key.node_type.bipartite})
            )
            node_data.append(
                (
                    task_node.init.key,
                    {"instance": task_node.init, "bipartite": task_node.init.key.node_type.bipartite},
                )
            )
            # Convert the edge objects attached to the task node to networkx.
            for read_edge in task_node.init.iter_all_inputs():
                self._append_graph_data_from_edge(node_data, edge_data, read_edge, parent=parent)
            for write_edge in task_node.init.iter_all_outputs():
                self._append_graph_data_from_edge(node_data, edge_data, write_edge, parent=parent)
            for read_edge in task_node.iter_all_inputs():
                self._append_graph_data_from_edge(node_data, edge_data, read_edge, parent=parent)
            for write_edge in task_node.iter_all_outputs():
                self._append_graph_data_from_edge(node_data, edge_data, write_edge, parent=parent)
            # Add a special edge (with no Edge instance) that connects the
            # TaskInitNode to the runtime TaskNode.
            edge_data.append((task_node.init.key, task_node.key, Edge.INIT_TO_TASK_NAME, {"instance": None}))
        if not node_data and not edge_data:
            return
        # Checks and preparation complete; time to start the actual
        # modification, during which it's hard to provide strong exception
        # safety. Start by resetting the sort ordering, if there is one.
        self._reset()
        try:
            self._xgraph.add_nodes_from(node_data)
            self._xgraph.add_edges_from(edge_data)
            if not networkx.algorithms.dag.is_directed_acyclic_graph(self._xgraph):
                cycle = networkx.find_cycle(self._xgraph)
                raise PipelineDataCycleError(f"Cycle detected while adding tasks: {cycle}.")
        except Exception:
            # First try to roll back our changes.
            try:
                self._xgraph.remove_edges_from(edge_data)
                self._xgraph.remove_nodes_from(key for key, _ in node_data)
            except Exception as err:  # pragma: no cover
                # There's no known way to get here, but we want to make it
                # clear it's a big problem if we do.
                raise PipelineGraphExceptionSafetyError(
                    "Error while attempting to revert PipelineGraph modification has left the graph in "
                    "an inconsistent state."
                ) from err
            # Successfully rolled back; raise the original exception.
            raise

    def reconfigure_tasks(
        self,
        *args: tuple[str, PipelineTaskConfig],
        check_edges_unchanged: bool = False,
        assume_edges_unchanged: bool = False,
        **kwargs: PipelineTaskConfig,
    ) -> None:
        """Update the configuration for one or more tasks.

        Parameters
        ----------
        *args : `tuple` [ `str`, `.PipelineTaskConfig` ]
            Positional arguments are each a 2-tuple of task label and new
            config object. Note that the same arguments may also be passed as
            ``**kwargs``, which is usually more readable, but task labels in
            ``*args`` are not required to be valid Python identifiers.
        check_edges_unchanged : `bool`, optional
            If `True`, require the edges (connections) of the modified tasks to
            remain unchanged after the configuration updates, and verify that
            this is the case.
        assume_edges_unchanged : `bool`, optional
            If `True`, the caller declares that the edges (connections) of the
            modified tasks will remain unchanged after the configuration
            updates, and that it is unnecessary to check this.
        **kwargs : `.PipelineTaskConfig`
            New config objects or overrides to apply to copies of the current
            config objects, with task labels as the keywords.

        Raises
        ------
        ValueError
            Raised if ``assume_edges_unchanged`` and ``check_edges_unchanged``
            are both `True`, or if the same task appears twice.
        EdgesChangedError
            Raised if ``check_edges_unchanged=True`` and the edges of a task do
            change.

        Notes
        -----
        If reconfiguring a task causes its edges to change, any dataset type
        nodes connected to that task (not just those whose edges have changed!)
        will be unresolved.
        """
        new_configs: dict[str, PipelineTaskConfig] = {}
        for task_label, config_update in itertools.chain(args, kwargs.items()):
            if new_configs.setdefault(task_label, config_update) is not config_update:
                raise ValueError(f"Config for {task_label!r} provided more than once.")
        updates = {
            task_label: self.tasks[task_label]._reconfigured(config, rebuild=not assume_edges_unchanged)
            for task_label, config in new_configs.items()
        }
        self._replace_task_nodes(
            updates,
            check_edges_unchanged=check_edges_unchanged,
            assume_edges_unchanged=assume_edges_unchanged,
            message_header=(
                "Unexpected change in edges for task {task_label!r} from original config (A) to "
                "new configs (B):"
            ),
        )
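
    # A sketch of a config update that is required not to change the task's
    # connections; the label and ``new_config`` object are hypothetical:
    #
    #     graph.reconfigure_tasks(example=new_config, check_edges_unchanged=True)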

    def remove_tasks(
        self, labels: Iterable[str], drop_from_subsets: bool = True
    ) -> list[tuple[TaskNode, set[str]]]:
        """Remove one or more tasks from the graph.

        Parameters
        ----------
        labels : `~collections.abc.Iterable` [ `str` ]
            Iterable of the labels of the tasks to remove.
        drop_from_subsets : `bool`, optional
            If `True`, drop each removed task from any subset in which it
            currently appears. If `False`, raise `PipelineGraphError` if any
            such subsets exist.

        Returns
        -------
        nodes_and_subsets : `list` [ `tuple` [ `TaskNode`, `set` [ `str` ] ] ]
            List of nodes removed and the labels of task subsets that
            referenced them.

        Raises
        ------
        PipelineGraphError
            Raised if ``drop_from_subsets`` is `False` and the task is still
            part of one or more subsets.

        Notes
        -----
        Removing a task will cause dataset nodes with no other referencing
        tasks to be removed. Any other dataset type nodes referenced by a
        removed task will be reset to an "unresolved" state.
        """
        task_nodes_and_subsets = []
        dataset_types: set[NodeKey] = set()
        nodes_to_remove = set()
        for label in labels:
            task_node: TaskNode = self._xgraph.nodes[NodeKey(NodeType.TASK, label)]["instance"]
            # Find task subsets that reference this task.
            referencing_subsets = {
                subset_label
                for subset_label, task_subset in self.task_subsets.items()
                if label in task_subset
            }
            if not drop_from_subsets and referencing_subsets:
                raise PipelineGraphError(
                    f"Task {label!r} is still referenced by subset(s) {referencing_subsets}."
                )
            task_nodes_and_subsets.append((task_node, referencing_subsets))
            # Find dataset types referenced by this task.
            dataset_types.update(self._xgraph.predecessors(task_node.key))
            dataset_types.update(self._xgraph.successors(task_node.key))
            dataset_types.update(self._xgraph.predecessors(task_node.init.key))
            dataset_types.update(self._xgraph.successors(task_node.init.key))
            # Since there's an edge between the task and its init node, we'll
            # have added those two nodes here, too, and we don't want that.
            dataset_types.remove(task_node.init.key)
            dataset_types.remove(task_node.key)
            # Mark the task node and its init node for removal from the graph.
            nodes_to_remove.add(task_node.key)
            nodes_to_remove.add(task_node.init.key)
        # Process the referenced datasets to see which ones are orphaned and
        # need to be removed vs. just unresolved.
        nodes_to_unresolve = []
        for dataset_type_key in dataset_types:
            related_tasks = set()
            related_tasks.update(self._xgraph.predecessors(dataset_type_key))
            related_tasks.update(self._xgraph.successors(dataset_type_key))
            related_tasks.difference_update(nodes_to_remove)
            if not related_tasks:
                nodes_to_remove.add(dataset_type_key)
            else:
                nodes_to_unresolve.append(dataset_type_key)
        # Checks and preparation complete; time to start the actual
        # modification, during which it's hard to provide strong exception
        # safety. Start by resetting the sort ordering.
        self._reset()
        try:
            for dataset_type_key in nodes_to_unresolve:
                self._xgraph.nodes[dataset_type_key]["instance"] = None
            for task_node, referencing_subsets in task_nodes_and_subsets:
                for subset_label in referencing_subsets:
                    self._task_subsets[subset_label].remove(task_node.label)
            self._xgraph.remove_nodes_from(nodes_to_remove)
        except Exception as err:  # pragma: no cover
            # There's no known way to get here, but we want to make it
            # clear it's a big problem if we do.
            raise PipelineGraphExceptionSafetyError(
                "Error during task removal has left the graph in an inconsistent state."
            ) from err
        return task_nodes_and_subsets

    def add_task_subset(self, subset_label: str, task_labels: Iterable[str], description: str = "") -> None:
        """Add a label for a set of tasks that are already in the pipeline.

        Parameters
        ----------
        subset_label : `str`
            Label for this set of tasks.
        task_labels : `~collections.abc.Iterable` [ `str` ]
            Labels of the tasks to include in the set. All must already be
            included in the graph.
        description : `str`, optional
            String description to associate with this label.
        """
        subset = TaskSubset(self._xgraph, subset_label, set(task_labels), description)
        self._task_subsets[subset_label] = subset

    def remove_task_subset(self, subset_label: str) -> None:
        """Remove a labeled set of tasks."""
        del self._task_subsets[subset_label]

    ###########################################################################
    #
    # NetworkX Export Interface:
    #
    # - methods to export the PipelineGraph's content (or various subsets
    #   thereof) as NetworkX objects.
    #
    # These are particularly useful when writing tools to visualize the graph,
    # while providing options for which aspects of the graph (tasks, dataset
    # types, or both) to include, since all exported graphs have similar
    # attributes regardless of their structure.
    #
    ###########################################################################

    def make_xgraph(self) -> networkx.MultiDiGraph:
        """Export a networkx representation of the full pipeline graph,
        including both init and runtime edges.

        Returns
        -------
        xgraph : `networkx.MultiDiGraph`
            Directed acyclic graph with parallel edges.

        Notes
        -----
        The returned graph uses `NodeKey` instances for nodes. Parallel edges
        represent the same dataset type appearing in multiple connections for
        the same task, and are hence rare. The connection name is used as the
        edge key to disambiguate those parallel edges.

        Almost all edges connect dataset type nodes to task or task init nodes
        or vice versa, but there is also a special edge that connects each task
        init node to its runtime node. The existence of these edges makes the
        graph not quite bipartite, though its init-only and runtime-only
        subgraphs are bipartite.

        See `TaskNode`, `TaskInitNode`, `DatasetTypeNode`, `ReadEdge`, and
        `WriteEdge` for the descriptive node and edge attributes added.
        """
        return self._transform_xgraph_state(self._xgraph.copy(), skip_edges=False)

    def make_bipartite_xgraph(self, init: bool = False) -> networkx.MultiDiGraph:
        """Return a bipartite networkx representation of just the runtime or
        init-time pipeline graph.

        Parameters
        ----------
        init : `bool`, optional
            If `True` (`False` is default) return the graph of task
            initialization nodes and init input/output dataset types, instead
            of the graph of runtime task nodes and regular
            input/output/prerequisite dataset types.

        Returns
        -------
        xgraph : `networkx.MultiDiGraph`
            Directed acyclic graph with parallel edges.

        Notes
        -----
        The returned graph uses `NodeKey` instances for nodes. Parallel edges
        represent the same dataset type appearing in multiple connections for
        the same task, and are hence rare. The connection name is used as the
        edge key to disambiguate those parallel edges.

        This graph is bipartite because each dataset type node only has edges
        that connect it to a task [init] node, and vice versa.

        See `TaskNode`, `TaskInitNode`, `DatasetTypeNode`, `ReadEdge`, and
        `WriteEdge` for the descriptive node and edge attributes added.
        """
        return self._transform_xgraph_state(
            self._make_bipartite_xgraph_internal(init).copy(), skip_edges=False
        )

    def make_task_xgraph(self, init: bool = False) -> networkx.DiGraph:
        """Return a networkx representation of just the tasks in the pipeline.

        Parameters
        ----------
        init : `bool`, optional
            If `True` (`False` is default) return the graph of task
            initialization nodes, instead of the graph of runtime task nodes.

        Returns
        -------
        xgraph : `networkx.DiGraph`
            Directed acyclic graph with no parallel edges.

        Notes
        -----
        The returned graph uses `NodeKey` instances for nodes. The dataset
        types that link these tasks are not represented at all; edges have no
        attributes, and there are no parallel edges.

        See `TaskNode` and `TaskInitNode` for the descriptive node attributes
        added.
        """
        bipartite_xgraph = self._make_bipartite_xgraph_internal(init)
        task_keys = [
            key
            for key, bipartite in bipartite_xgraph.nodes(data="bipartite")
            if bipartite == NodeType.TASK.bipartite
        ]
        return self._transform_xgraph_state(
            networkx.algorithms.bipartite.projected_graph(networkx.DiGraph(bipartite_xgraph), task_keys),
            skip_edges=True,
        )

    def make_dataset_type_xgraph(self, init: bool = False) -> networkx.DiGraph:
        """Return a networkx representation of just the dataset types in the
        pipeline.

        Parameters
        ----------
        init : `bool`, optional
            If `True` (`False` is default) return the graph of init input and
            output dataset types, instead of the graph of runtime (input,
            output, prerequisite input) dataset types.

        Returns
        -------
        xgraph : `networkx.DiGraph`
            Directed acyclic graph with no parallel edges.

        Notes
        -----
        The returned graph uses `NodeKey` instances for nodes. The tasks that
        link these dataset types are not represented at all; edges have no
        attributes, and there are no parallel edges.

        See `DatasetTypeNode` for the descriptive node attributes added.
        """
        bipartite_xgraph = self._make_bipartite_xgraph_internal(init)
        dataset_type_keys = [
            key
            for key, bipartite in bipartite_xgraph.nodes(data="bipartite")
            if bipartite == NodeType.DATASET_TYPE.bipartite
        ]
        return self._transform_xgraph_state(
            networkx.algorithms.bipartite.projected_graph(
                networkx.DiGraph(bipartite_xgraph), dataset_type_keys
            ),
            skip_edges=True,
        )
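
    # A sketch using the task-only projection for a simple textual rendering
    # (the rendering itself is illustrative, not part of this API):
    #
    #     task_xgraph = graph.make_task_xgraph()
    #     for key in networkx.topological_sort(task_xgraph):
    #         print(key.name)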

    ###########################################################################
    #
    # Serialization Interface.
    #
    # Serialization of PipelineGraphs is currently experimental and may not be
    # retained in the future. All serialization methods are
    # underscore-prefixed to ensure nobody mistakes them for a stable interface
    # (let alone a stable file format).
    #
    ###########################################################################

    @classmethod
    def _read_stream(
        cls, stream: BinaryIO, import_mode: TaskImportMode = TaskImportMode.REQUIRE_CONSISTENT_EDGES
    ) -> PipelineGraph:
        """Read a serialized `PipelineGraph` from a file-like object.

        Parameters
        ----------
        stream : `BinaryIO`
            File-like object opened for binary reading, containing
            gzip-compressed JSON.
        import_mode : `TaskImportMode`, optional
            Whether to import tasks, and how to reconcile any differences
            between the imported task's connections and those that were
            persisted with the graph. Default is to check that they are the
            same.

        Returns
        -------
        graph : `PipelineGraph`
            Deserialized pipeline graph.

        Raises
        ------
        PipelineGraphReadError
            Raised if the serialized `PipelineGraph` is not self-consistent.
        EdgesChangedError
            Raised if ``import_mode`` is
            `TaskImportMode.REQUIRE_CONSISTENT_EDGES` and the edges of a task
            did change after import and reconfiguration.

        Notes
        -----
        `PipelineGraph` serialization is currently experimental and may be
        removed or significantly changed in the future, with no deprecation
        period.
        """
        from .io import SerializedPipelineGraph

        with gzip.open(stream, "rb") as uncompressed_stream:
            data = json.load(uncompressed_stream)
        serialized_graph = SerializedPipelineGraph.parse_obj(data)
        return serialized_graph.deserialize(import_mode)

    @classmethod
    def _read_uri(
        cls,
        uri: ResourcePathExpression,
        import_mode: TaskImportMode = TaskImportMode.REQUIRE_CONSISTENT_EDGES,
    ) -> PipelineGraph:
        """Read a serialized `PipelineGraph` from a file at a URI.

        Parameters
        ----------
        uri : convertible to `lsst.resources.ResourcePath`
            URI to a gzip-compressed JSON file containing a serialized pipeline
            graph.
        import_mode : `TaskImportMode`, optional
            Whether to import tasks, and how to reconcile any differences
            between the imported task's connections and those that were
            persisted with the graph. Default is to check that they are the
            same.

        Returns
        -------
        graph : `PipelineGraph`
            Deserialized pipeline graph.

        Raises
        ------
        PipelineGraphReadError
            Raised if the serialized `PipelineGraph` is not self-consistent.
        EdgesChangedError
            Raised if ``import_mode`` is
            `TaskImportMode.REQUIRE_CONSISTENT_EDGES` and the edges of a task
            did change after import and reconfiguration.

        Notes
        -----
        `PipelineGraph` serialization is currently experimental and may be
        removed or significantly changed in the future, with no deprecation
        period.
        """
        uri = ResourcePath(uri)
        with uri.open("rb") as stream:
            return cls._read_stream(cast(BinaryIO, stream), import_mode=import_mode)

    def _write_stream(self, stream: BinaryIO) -> None:
        """Write the pipeline graph to a file-like object.

        Parameters
        ----------
        stream : `BinaryIO`
            File-like object opened for binary writing.

        Notes
        -----
        `PipelineGraph` serialization is currently experimental and may be
        removed or significantly changed in the future, with no deprecation
        period.

        The file format is gzipped JSON, and is intended to be human-readable,
        but it should not be considered a stable public interface for outside
        code, which should always use `PipelineGraph` methods (or at least the
        `io.SerializedPipelineGraph` class) to read these files.
        """
        from .io import SerializedPipelineGraph

        with gzip.open(stream, mode="wb") as compressed_stream:
            compressed_stream.write(
                SerializedPipelineGraph.serialize(self).json(exclude_defaults=True).encode("utf-8")
            )

    def _write_uri(self, uri: ResourcePathExpression) -> None:
        """Write the pipeline graph to a file given a URI.

        Parameters
        ----------
        uri : convertible to `lsst.resources.ResourcePath`
            URI to write to. May have a ``.json.gz`` extension or no extension
            (in which case a ``.json.gz`` extension will be added).

        Notes
        -----
        `PipelineGraph` serialization is currently experimental and may be
        removed or significantly changed in the future, with no deprecation
        period.

        The file format is gzipped JSON, and is intended to be human-readable,
        but it should not be considered a stable public interface for outside
        code, which should always use `PipelineGraph` methods (or at least the
        `io.SerializedPipelineGraph` class) to read these files.
        """
        uri = ResourcePath(uri)
        extension = uri.getExtension()
        if not extension:
            uri = uri.updatedExtension(".json.gz")
        elif extension != ".json.gz":
            raise ValueError("Expanded pipeline files should always have a .json.gz extension.")
        with uri.open(mode="wb") as stream:
            self._write_stream(cast(BinaryIO, stream))
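
    # A round-trip sketch for this experimental interface (the path is
    # illustrative; these methods are private and subject to change):
    #
    #     graph._write_uri("pipeline_graph.json.gz")
    #     restored = PipelineGraph._read_uri("pipeline_graph.json.gz")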

    def _import_and_configure(
        self, import_mode: TaskImportMode = TaskImportMode.REQUIRE_CONSISTENT_EDGES
    ) -> None:
        """Import the `PipelineTask` classes referenced by all task nodes and
        update those nodes accordingly.

        Parameters
        ----------
        import_mode : `TaskImportMode`, optional
            Whether to import tasks, and how to reconcile any differences
            between the imported task's connections and those that were
            persisted with the graph. Default is to check that they are the
            same. This method does nothing if this is
            `TaskImportMode.DO_NOT_IMPORT`.

        Raises
        ------
        EdgesChangedError
            Raised if ``import_mode`` is
            `TaskImportMode.REQUIRE_CONSISTENT_EDGES` and the edges of a task
            did change after import and reconfiguration.

        Notes
        -----
        This method shouldn't need to be called unless the graph was
        deserialized without importing and configuring immediately, which is
        not the default behavior (but it can greatly speed up deserialization).
        If all tasks have already been imported this does nothing.

        Importing and configuring a task can change its
        `~TaskNode.task_class_name` or `~TaskNode.get_config_str` output,
        usually because the software used to read a serialized graph is newer
        than the software used to write it (e.g. a new config option has been
        added, or the task was moved to a new module with a forwarding alias
        left behind). These changes are allowed by
        `TaskImportMode.REQUIRE_CONSISTENT_EDGES`.

        If importing and configuring a task causes its edges to change, any
        dataset type nodes linked to those edges will be reset to the
        unresolved state.
        """
        if import_mode is TaskImportMode.DO_NOT_IMPORT:
            return
        rebuild = (
            import_mode is TaskImportMode.REQUIRE_CONSISTENT_EDGES
            or import_mode is TaskImportMode.OVERRIDE_EDGES
        )
        updates: dict[str, TaskNode] = {}
        node_key: NodeKey
        for node_key, node_state in self._xgraph.nodes.items():
            if node_key.node_type is NodeType.TASK:
                task_node: TaskNode = node_state["instance"]
                new_task_node = task_node._imported_and_configured(rebuild)
                if new_task_node is not task_node:
                    updates[task_node.label] = new_task_node
        self._replace_task_nodes(
            updates,
            check_edges_unchanged=(import_mode is TaskImportMode.REQUIRE_CONSISTENT_EDGES),
            assume_edges_unchanged=(import_mode is TaskImportMode.ASSUME_CONSISTENT_EDGES),
            message_header=(
                "In task with label {task_label!r}, persisted edges (A) "
                "differ from imported and configured edges (B):"
            ),
        )

    ###########################################################################
    #
    # Advanced PipelineGraph Inspection Interface:
    #
    # - methods to iterate over all nodes and edges, utilizing NodeKeys;
    #
    # - methods to find overall inputs and group nodes by their dimensions,
    #   which are important operations for QuantumGraph generation.
    #
    ###########################################################################

    def iter_edges(self, init: bool = False) -> Iterator[Edge]:
        """Iterate over edges in the graph.

        Parameters
        ----------
        init : `bool`, optional
            If `True` (`False` is default) iterate over the edges between task
            initialization nodes and init input/output dataset types, instead
            of those between the runtime task nodes and regular
            input/output/prerequisite dataset types.

        Returns
        -------
        edges : `~collections.abc.Iterator` [ `Edge` ]
            A lazy iterator over `Edge` (`WriteEdge` or `ReadEdge`) instances.

        Notes
        -----
        This method always returns either init edges or runtime edges, never
        both. The full (internal) graph that contains both also includes a
        special edge that connects each task init node to its runtime node;
        that is also never returned by this method, since it is never a part of
        the init-only or runtime-only subgraphs.
        """
        edge: Edge
        for _, _, edge in self._xgraph.edges(data="instance"):
            if edge is not None and edge.is_init == init:
                yield edge

    def iter_nodes(
        self,
    ) -> Iterator[
        tuple[Literal[NodeType.TASK_INIT], str, TaskInitNode]
        | tuple[Literal[NodeType.TASK], str, TaskNode]
        | tuple[Literal[NodeType.DATASET_TYPE], str, DatasetTypeNode | None]
    ]:
        """Iterate over nodes in the graph.

        Returns
        -------
        nodes : `~collections.abc.Iterator` [ `tuple` ]
            A lazy iterator over all of the nodes in the graph. Each yielded
            element is a tuple of:

            - the node type enum value (`NodeType`);
            - the string name for the node (task label or parent dataset type
              name);
            - the node value (`TaskNode`, `TaskInitNode`, `DatasetTypeNode`,
              or `None` for dataset type nodes that have not been resolved).
        """
        key: NodeKey
        if self._sorted_keys is not None:
            for key in self._sorted_keys:
                yield key.node_type, key.name, self._xgraph.nodes[key]["instance"]  # type: ignore
        else:
            for key, node in self._xgraph.nodes(data="instance"):
                yield key.node_type, key.name, node  # type: ignore

    def iter_overall_inputs(self) -> Iterator[tuple[str, DatasetTypeNode | None]]:
        """Iterate over all of the dataset types that are consumed but not
        produced by the graph.

        Returns
        -------
        dataset_types : `~collections.abc.Iterator` [ `tuple` ]
            A lazy iterator over the overall-input dataset types (including
            overall init inputs and prerequisites). Each yielded element is a
            tuple of:

            - the parent dataset type name;
            - the resolved `DatasetTypeNode`, or `None` if the dataset type
              has not been resolved.
        """
        for generation in networkx.algorithms.dag.topological_generations(self._xgraph):
            key: NodeKey
            for key in generation:
                # While we expect all tasks to have at least one input and
                # hence never appear in the first topological generation, that
                # is not true of task init nodes.
                if key.node_type is NodeType.DATASET_TYPE:
                    yield key.name, self._xgraph.nodes[key]["instance"]
            return

    def group_by_dimensions(
        self, prerequisites: bool = False
    ) -> dict[DimensionGraph, tuple[dict[str, TaskNode], dict[str, DatasetTypeNode]]]:
        """Group this graph's tasks and dataset types by their dimensions.

        Parameters
        ----------
        prerequisites : `bool`, optional
            If `True`, include prerequisite dataset types as well as regular
            input and output datasets (including intermediates).

        Returns
        -------
        groups : `dict` [ `DimensionGraph`, `tuple` ]
            A dictionary of groups keyed by `DimensionGraph`, in which each
            value is a tuple of:

            - a `dict` of `TaskNode` instances, keyed by task label;
            - a `dict` of `DatasetTypeNode` instances, keyed by dataset type
              name;

            that have those dimensions.

        Notes
        -----
        Init inputs and outputs are always included, but always have empty
        dimensions and hence are all grouped together.
        """
        result: dict[DimensionGraph, tuple[dict[str, TaskNode], dict[str, DatasetTypeNode]]] = {}
        next_new_value: tuple[dict[str, TaskNode], dict[str, DatasetTypeNode]] = ({}, {})
        for task_label, task_node in self.tasks.items():
            if task_node.dimensions is None:
                raise UnresolvedGraphError(f"Task with label {task_label!r} has not been resolved.")
            if (group := result.setdefault(task_node.dimensions, next_new_value)) is next_new_value:
                next_new_value = ({}, {})  # make new dicts for next time
            group[0][task_node.label] = task_node
        for dataset_type_name, dataset_type_node in self.dataset_types.items():
            if dataset_type_node is None:
                raise UnresolvedGraphError(f"Dataset type {dataset_type_name!r} has not been resolved.")
            if not dataset_type_node.is_prerequisite or prerequisites:
                if (
                    group := result.setdefault(dataset_type_node.dataset_type.dimensions, next_new_value)
                ) is next_new_value:
                    next_new_value = ({}, {})  # make new dicts for next time
                group[1][dataset_type_node.name] = dataset_type_node
        return result
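
    # A usage sketch; requires a resolved graph:
    #
    #     for dimensions, (tasks, dataset_types) in graph.group_by_dimensions().items():
    #         print(dimensions, sorted(tasks), sorted(dataset_types))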

1377 def split_independent(self) -> Iterable[PipelineGraph]: 

1378 """Iterate over independent subgraphs that together comprise this 

1379 pipeline graph. 

1380 

1381 Returns 

1382 ------- 

1383 subgraphs : `Iterable` [ `PipelineGraph` ] 

1384 An iterable over component subgraphs that could be run 

1385 independently (they have only overall inputs in common). May be a 

1386 lazy iterator. 

1387 

1388 Notes 

1389 ----- 

1390 All resolved dataset type nodes will be preserved. 

1391 

1392 If there is only one component, ``self`` may be returned as the only 

1393 element in the iterable. 

1394 

1395 If `has_been_sorted`, all subgraphs will be sorted as well. 

1396 """ 

1397 # Having an overall input in common isn't enough to make subgraphs 

1398 # dependent on each other, so we want to look for connected component 

1399 # subgraphs of the task-only projected graph. 

1400 bipartite_xgraph = self._make_bipartite_xgraph_internal(init=False) 

1401 task_keys = { 

1402 key 

1403 for key, bipartite in bipartite_xgraph.nodes(data="bipartite") 

1404 if bipartite == NodeType.TASK.bipartite 

1405 } 

1406 task_xgraph = networkx.algorithms.bipartite.projected_graph( 

1407 networkx.DiGraph(bipartite_xgraph), task_keys 

1408 ) 

1409 # "Weakly" connected means connected in only one direction, which is 

1410 # the only kind of "connected" a DAG can ever be. 

1411 for component_task_keys in networkx.algorithms.weakly_connected_components(task_xgraph): 

1412 if component_task_keys == task_keys: 

1413 yield self 

1414 return 

1415 else: 

1416 component_subgraph = PipelineGraph(universe=self._universe) 

1417 component_subgraph.add_task_nodes( 

1418 [self._xgraph.nodes[key]["instance"] for key in component_task_keys], parent=self 

1419 ) 

1420 if self.has_been_sorted: 

1421 component_subgraph.sort() 

1422 yield component_subgraph 

1423 
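# A minimal sketch of consuming the lazy iterator above, assuming an
# existing `PipelineGraph` named ``graph`` and a hypothetical ``process``
# callable; because components share only overall inputs, each subgraph
# could be handed to a separate worker.
#
#     for subgraph in graph.split_independent():
#         process(subgraph)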

1424 ########################################################################### 

1425 # 

1426 # Class- and Package-Private Methods. 

1427 # 

1428 ########################################################################### 

1429 

1430 def _iter_task_defs(self) -> Iterator[TaskDef]: 

1431 """Iterate over this pipeline as a sequence of `TaskDef` instances. 

1432 

1433 Notes 

1434 ----- 

1435 This is a package-private method intended to aid in the transition to a 

1436 codebase more fully integrated with the `PipelineGraph` class, in which 

1437 both `TaskDef` and `PipelineDatasetTypes` are expected to go away, and 

1438 much of the functionality on the `Pipeline` class will be moved to 

1439 `PipelineGraph` as well. 

1440 

1441 Raises 

1442 ------ 

1443 TaskNotImportedError 

1444 Raised if `TaskNode.is_imported` is `False` for any task. 

1445 """ 

1446 from ..pipeline import TaskDef 

1447 

1448 for node in self._tasks.values(): 

1449 yield TaskDef( 

1450 config=node.config, 

1451 taskClass=node.task_class, 

1452 label=node.label, 

1453 connections=node._get_imported_data().connections, 

1454 ) 

1455 

1456 def _init_from_args( 

1457 self, 

1458 xgraph: networkx.MultiDiGraph | None, 

1459 sorted_keys: Sequence[NodeKey] | None, 

1460 task_subsets: dict[str, TaskSubset] | None, 

1461 description: str, 

1462 universe: DimensionUniverse | None, 

1463 data_id: DataId | None, 

1464 ) -> None: 

1465 """Initialize the graph with possibly-nontrivial arguments. 

1466 

1467 Parameters 

1468 ---------- 

1469 xgraph : `networkx.MultiDiGraph` or `None` 

1470 The backing networkx graph, or `None` to create an empty one. 

1471 This graph has `NodeKey` instances for nodes and the same structure 

1472 as the graph exported by `make_xgraph`, but its nodes and edges 

1473 have a single ``instance`` attribute that holds a `TaskNode`, 

1474 `TaskInitNode`, `DatasetTypeNode` (or `None`), `ReadEdge`, or 

1475 `WriteEdge` instance. 

1476 sorted_keys : `Sequence` [ `NodeKey` ] or `None` 

1477 Topologically sorted sequence of node keys, or `None` if the graph 

1478 is not sorted. 

1479 task_subsets : `dict` [ `str`, `TaskSubset` ] or `None` 

1480 Labeled subsets of tasks. Values must be constructed with 

1481 ``xgraph`` as their parent graph. 

1482 description : `str` 

1483 String description for this pipeline. 

1484 universe : `lsst.daf.butler.DimensionUniverse` or `None` 

1485 Definitions of all dimensions. 

1486 data_id : `lsst.daf.butler.DataCoordinate`, other data ID mapping, or `None`. 

1487 Data ID that represents a constraint on all quanta generated from 

1488 this pipeline. 

1489 

1490 Notes 

1491 ----- 

1492 Only empty `PipelineGraph` instances should be constructed directly by 

1493 users, which is why ``__init__`` has the signature it does, but methods 

1494 on `PipelineGraph` and its helper classes need to be able to create 

1495 instances with state. Those methods can call this method after calling 

1496 ``__new__`` manually, skipping ``__init__``. 

1497 """ 

1498 self._xgraph = xgraph if xgraph is not None else networkx.MultiDiGraph() 

1499 self._sorted_keys: Sequence[NodeKey] | None = None 

1500 self._task_subsets = task_subsets if task_subsets is not None else {} 

1501 self._description = description 

1502 self._tasks = TaskMappingView(self._xgraph) 

1503 self._dataset_types = DatasetTypeMappingView(self._xgraph) 

1504 self._raw_data_id: dict[str, Any] 

1505 if isinstance(data_id, DataCoordinate): 

1506 if universe is None: 

1507 universe = data_id.universe 

1508 else: 

1509 assert universe is data_id.universe, "data_id.universe and given universe differ" 

1510 self._raw_data_id = data_id.byName() 

1511 elif data_id is None: 

1512 self._raw_data_id = {} 

1513 else: 

1514 self._raw_data_id = dict(data_id) 

1515 self._universe = universe 

1516 if sorted_keys is not None: 

1517 self._reorder(sorted_keys) 

1518 
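# A sketch of the construction pattern described in the Notes above,
# assuming a pre-built ``backing_xgraph`` and ``universe``: internal code
# bypasses ``__init__`` via ``__new__`` and then supplies full state.
#
#     graph = PipelineGraph.__new__(PipelineGraph)
#     graph._init_from_args(
#         xgraph=backing_xgraph,
#         sorted_keys=None,
#         task_subsets=None,
#         description="reconstructed pipeline",
#         universe=universe,
#         data_id=None,
#     )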

1519 def _make_bipartite_xgraph_internal(self, init: bool) -> networkx.MultiDiGraph: 

1520 """Make a bipartite init-only or runtime-only internal subgraph. 

1521 

1522 See `make_bipartite_xgraph` for parameters and return values. 

1523 

1524 Notes 

1525 ----- 

1526 This method returns a view of the `PipelineGraph` object's internal 

1527 backing graph, and hence should only be called in methods that copy the 

1528 result either explicitly or by running a copying algorithm before 

1529 returning it to the user. 

1530 """ 

1531 return self._xgraph.edge_subgraph([edge.key for edge in self.iter_edges(init)]) 

1532 
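# A sketch of the copy-before-export discipline the Notes above require:
# callers copy the returned view before exposing it, so user mutations
# cannot leak into the backing graph.
#
#     view = self._make_bipartite_xgraph_internal(init=False)
#     exported = networkx.MultiDiGraph(view)  # explicit copy; safe to expose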

1533 def _transform_xgraph_state(self, xgraph: _G, skip_edges: bool) -> _G: 

1534 """Transform networkx graph attributes in-place from the internal 

1535 "instance" attributes to the documented exported attributes. 

1536 

1537 Parameters 

1538 ---------- 

1539 xgraph : `networkx.DiGraph` or `networkx.MultiDiGraph` 

1540 Graph whose state should be transformed. 

1541 skip_edges : `bool` 

1542 If `True`, do not transform edge state. 

1543 

1544 Returns 

1545 ------- 

1546 xgraph : `networkx.DiGraph` or `networkx.MultiDiGraph` 

1547 The same object passed in, after modification. 

1548 

1549 Notes 

1550 ----- 

1551 This should be called after making a copy of the internal graph but 

1552 before any projection down to just task or dataset type nodes, since 

1553 it assumes stateful edges. 

1554 """ 

1555 state: dict[str, Any] 

1556 for state in xgraph.nodes.values(): 

1557 node_value: TaskInitNode | TaskNode | DatasetTypeNode | None = state.pop("instance") 

1558 if node_value is not None: 

1559 state.update(node_value._to_xgraph_state()) 

1560 else: 

1561 # This is a dataset type node that is not resolved. 

1562 state["bipartite"] = NodeType.DATASET_TYPE.bipartite 

1563 if not skip_edges: 

1564 for _, _, state in xgraph.edges(data=True): 

1565 edge: Edge | None = state.pop("instance", None) 

1566 if edge is not None: 

1567 state.update(edge._to_xgraph_state()) 

1568 return xgraph 

1569 
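# A sketch of the call order the Notes above require, assuming ``task_keys``
# has been computed as in `split_independent`: copy the internal graph,
# transform its state, and only then project (the projection discards edge
# state, so ``skip_edges=True`` avoids wasted work).
#
#     copied = self._transform_xgraph_state(
#         networkx.MultiDiGraph(self._xgraph), skip_edges=True
#     )
#     task_only = networkx.algorithms.bipartite.projected_graph(
#         networkx.DiGraph(copied), task_keys
#     )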

1570 def _replace_task_nodes( 

1571 self, 

1572 updates: Mapping[str, TaskNode], 

1573 check_edges_unchanged: bool, 

1574 assume_edges_unchanged: bool, 

1575 message_header: str, 

1576 ) -> None: 

1577 """Replace task nodes and update edges and dataset type nodes 

1578 accordingly. 

1579 

1580 Parameters 

1581 ---------- 

1582 updates : `Mapping` [ `str`, `TaskNode` ] 

1583 New task nodes with task label keys. All keys must be task labels 

1584 that are already present in the graph. 

1585 check_edges_unchanged : `bool` 

1586 If `True`, require the edges (connections) of the modified tasks to 

1587 remain unchanged after importing and configuring each task, and 

1588 verify that this is the case. 

1589 assume_edges_unchanged : `bool` 

1590 If `True`, the caller declares that the edges (connections) of the 

1591 modified tasks will remain unchanged after importing and configuring 

1592 each task, and that it is unnecessary to check this. 

1593 message_header : `str` 

1594 Template for `str.format` with a single ``task_label`` placeholder 

1595 to use as the first line in `EdgesChangedError` messages that show 

1596 the differences between new task edges and old task edges. Should 

1597 include the fact that the rest of the message will refer to the old 

1598 task as "A" and the new task as "B", and end with a colon. 

1599 

1600 Raises 

1601 ------ 

1602 ValueError 

1603 Raised if ``assume_edges_unchanged`` and ``check_edges_unchanged`` 

1604 are both `True`. 

1605 

1606 EdgesChangedError 

1607 Raised if ``check_edges_unchanged=True`` and the edges of a task do 

1608 change. 

1609 """ 

1610 deep: dict[str, TaskNode] = {} 

1611 shallow: dict[str, TaskNode] = {} 

1612 if assume_edges_unchanged: 

1613 if check_edges_unchanged: 

1614 raise ValueError("Cannot simultaneously assume and check that edges have not changed.") 

1615 shallow.update(updates) 

1616 else: 

1617 for task_label, new_task_node in updates.items(): 

1618 old_task_node = self.tasks[task_label] 

1619 messages = old_task_node.diff_edges(new_task_node) 

1620 if messages: 

1621 if check_edges_unchanged: 

1622 messages.insert(0, message_header.format(task_label=task_label)) 

1623 raise EdgesChangedError("\n".join(messages)) 

1624 else: 

1625 deep[task_label] = new_task_node 

1626 else: 

1627 shallow[task_label] = new_task_node 

1628 try: 

1629 if deep: 

1630 removed = self.remove_tasks(deep.keys(), drop_from_subsets=True) 

1631 self.add_task_nodes(deep.values()) 

1632 for replaced_task_node, referencing_subsets in removed: 

1633 for subset_label in referencing_subsets: 

1634 self._task_subsets[subset_label].add(replaced_task_node.label) 

1635 for task_node in shallow.values(): 

1636 self._xgraph.nodes[task_node.key]["instance"] = task_node 

1637 self._xgraph.nodes[task_node.init.key]["instance"] = task_node.init 

1638 except PipelineGraphExceptionSafetyError: # pragma: no cover 

1639 raise 

1640 except Exception as err: # pragma: no cover 

1641 # There's no known way to get here, but we want to make it clear 

1642 # it's a big problem if we do. 

1643 raise PipelineGraphExceptionSafetyError( 

1644 "Error while replacing tasks has left the graph in an inconsistent state." 

1645 ) from err 

1646 
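# A sketch of a ``message_header`` value satisfying the contract above: a
# `str.format` template with a single ``task_label`` placeholder that
# mentions the A/B convention and ends with a colon.
#
#     header = "Edges of task {task_label!r} changed (A=old, B=new):"
#     self._replace_task_nodes(
#         updates,
#         check_edges_unchanged=True,
#         assume_edges_unchanged=False,
#         message_header=header,
#     )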

1647 def _append_graph_data_from_edge( 

1648 self, 

1649 node_data: list[tuple[NodeKey, dict[str, Any]]], 

1650 edge_data: list[tuple[NodeKey, NodeKey, str, dict[str, Any]]], 

1651 edge: Edge, 

1652 parent: PipelineGraph | None, 

1653 ) -> None: 

1654 """Append networkx state dictionaries for an edge and the corresponding 

1655 dataset type node. 

1656 

1657 Parameters 

1658 ---------- 

1659 node_data : `list` 

1660 List of node keys and state dictionaries. A node is appended if 

1661 one does not already exist for this dataset type. 

1662 edge_data : `list` 

1663 List of node key pairs, connection names, and state dictionaries 

1664 for edges. 

1665 edge : `Edge` 

1666 New edge being processed. 

1667 parent : `PipelineGraph` or `None` 

1668 Another pipeline graph whose dataset type nodes should be used 

1669 when present. 

1670 """ 

1671 new_dataset_type_node = None 

1672 if parent is not None: 

1673 new_dataset_type_node = parent._xgraph.nodes[edge.dataset_type_key].get("instance") 

1674 if (existing_dataset_type_state := self._xgraph.nodes.get(edge.dataset_type_key)) is not None: 

1675 existing_dataset_type_state["instance"] = new_dataset_type_node 

1676 else: 

1677 node_data.append( 

1678 ( 

1679 edge.dataset_type_key, 

1680 { 

1681 "instance": new_dataset_type_node, 

1682 "bipartite": NodeType.DATASET_TYPE.bipartite, 

1683 }, 

1684 ) 

1685 ) 

1686 edge_data.append( 

1687 edge.nodes 

1688 + ( 

1689 edge.connection_name, 

1690 {"instance": edge}, 

1691 ) 

1692 ) 

1693 

1694 def _reorder(self, sorted_keys: Sequence[NodeKey]) -> None: 

1695 """Set the order of all views of this graph from the given sorted 

1696 sequence of task labels and dataset type names. 

1697 """ 

1698 self._sorted_keys = sorted_keys 

1699 self._tasks._reorder(sorted_keys) 

1700 self._dataset_types._reorder(sorted_keys) 

1701 

1702 def _reset(self) -> None: 

1703 """Reset the all views of this graph following a modification that 

1704 might invalidate them. 

1705 """ 

1706 self._sorted_keys = None 

1707 self._tasks._reset() 

1708 self._dataset_types._reset() 

1709 

1710 _xgraph: networkx.MultiDiGraph 

1711 _sorted_keys: Sequence[NodeKey] | None 

1712 _task_subsets: dict[str, TaskSubset] 

1713 _description: str 

1714 _tasks: TaskMappingView 

1715 _dataset_types: DatasetTypeMappingView 

1716 _raw_data_id: dict[str, Any] 

1717 _universe: DimensionUniverse | None