Coverage for python/lsst/pipe/base/pipeline_graph/_pipeline_graph.py: 19%

373 statements  

coverage.py v7.3.1, created at 2023-09-13 09:52 +0000

# This file is part of pipe_base.
#
# Developed for the LSST Data Management System.
# This product includes software developed by the LSST Project
# (http://www.lsst.org).
# See the COPYRIGHT file at the top-level directory of this distribution
# for details of code ownership.
#
# This software is dual licensed under the GNU General Public License and also
# under a 3-clause BSD license. Recipients may choose which of these licenses
# to use; please see the files gpl-3.0.txt and/or bsd_license.txt,
# respectively. If you choose the GPL option then the following text applies
# (but note that there is still no warranty even if you opt for BSD instead):
#
# This program is free software: you can redistribute it and/or modify
# it under the terms of the GNU General Public License as published by
# the Free Software Foundation, either version 3 of the License, or
# (at your option) any later version.
#
# This program is distributed in the hope that it will be useful,
# but WITHOUT ANY WARRANTY; without even the implied warranty of
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
# GNU General Public License for more details.
#
# You should have received a copy of the GNU General Public License
# along with this program. If not, see <http://www.gnu.org/licenses/>.
from __future__ import annotations

__all__ = ("PipelineGraph",)

import gzip
import itertools
import json
from collections.abc import Iterable, Iterator, Mapping, Sequence
from typing import TYPE_CHECKING, Any, BinaryIO, Literal, TypeVar, cast

import networkx
import networkx.algorithms.bipartite
import networkx.algorithms.dag
from lsst.daf.butler import DataCoordinate, DataId, DimensionGraph, DimensionUniverse, Registry
from lsst.resources import ResourcePath, ResourcePathExpression

from ._dataset_types import DatasetTypeNode
from ._edges import Edge, ReadEdge, WriteEdge
from ._exceptions import (
    DuplicateOutputError,
    EdgesChangedError,
    PipelineDataCycleError,
    PipelineGraphError,
    PipelineGraphExceptionSafetyError,
    UnresolvedGraphError,
)
from ._mapping_views import DatasetTypeMappingView, TaskMappingView
from ._nodes import NodeKey, NodeType
from ._task_subsets import TaskSubset
from ._tasks import TaskImportMode, TaskInitNode, TaskNode, _TaskNodeImportedData

if TYPE_CHECKING:
    from ..config import PipelineTaskConfig
    from ..connections import PipelineTaskConnections
    from ..pipeline import TaskDef
    from ..pipelineTask import PipelineTask


_G = TypeVar("_G", bound=networkx.DiGraph | networkx.MultiDiGraph)


class PipelineGraph:
    """A graph representation of a fully-configured pipeline.

    `PipelineGraph` instances are typically constructed by calling
    `.Pipeline.to_graph`, but in rare cases constructing and then populating an
    empty one may be preferable.

    Parameters
    ----------
    description : `str`, optional
        String description for this pipeline.
    universe : `lsst.daf.butler.DimensionUniverse`, optional
        Definitions for all butler dimensions. If not provided, some
        attributes will not be available until `resolve` is called.
    data_id : `lsst.daf.butler.DataCoordinate` or other data ID, optional
        Data ID that represents a constraint on all quanta generated by this
        pipeline. This typically just holds the instrument constraint included
        in the pipeline definition, if there was one.
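
    Examples
    --------
    A minimal usage sketch (``pipeline`` stands in for any configured
    `.Pipeline` instance; the task label is hypothetical)::

        graph = pipeline.to_graph()
        task_node = graph.tasks["makeWarp"]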

86 """ 

87 

88 ########################################################################### 

89 # 

90 # Simple Pipeline Graph Inspection Interface: 

91 # 

92 # - for inspecting graph structure, not modifying it (except to sort and] 

93 # resolve); 

94 # 

95 # - no NodeKey objects, just string dataset type name and task label keys; 

96 # 

97 # - graph structure is represented as a pair of mappings, with methods to 

98 # find neighbors and edges of nodes. 

99 # 

100 ########################################################################### 

101 

102 def __init__( 

103 self, 

104 *, 

105 description: str = "", 

106 universe: DimensionUniverse | None = None, 

107 data_id: DataId | None = None, 

108 ) -> None: 

109 self._init_from_args( 

110 xgraph=None, 

111 sorted_keys=None, 

112 task_subsets=None, 

113 description=description, 

114 universe=universe, 

115 data_id=data_id, 

116 ) 

117 

118 def __repr__(self) -> str: 

119 return f"{type(self).__name__}({self.description!r}, tasks={self.tasks!s})" 

120 

121 @property 

122 def description(self) -> str: 

123 """String description for this pipeline.""" 

124 return self._description 

125 

126 @description.setter 

127 def description(self, value: str) -> None: 

128 # Docstring in setter. 

129 self._description = value 

130 

131 @property 

132 def universe(self) -> DimensionUniverse | None: 

133 """Definitions for all butler dimensions.""" 

134 return self._universe 

135 

136 @property 

137 def data_id(self) -> DataCoordinate: 

138 """Data ID that represents a constraint on all quanta generated from 

139 this pipeline. 

140 

141 This is may not be available unless `universe` is not `None`. 

142 """ 

143 return DataCoordinate.standardize(self._raw_data_id, universe=self.universe) 

144 

145 @property 

146 def tasks(self) -> TaskMappingView: 

147 """A mapping view of the tasks in the graph. 

148 

149 This mapping has `str` task label keys and `TaskNode` values. Iteration 

150 is topologically and deterministically ordered if and only if `sort` 

151 has been called since the last modification to the graph. 

152 """ 

153 return self._tasks 

154 

155 @property 

156 def dataset_types(self) -> DatasetTypeMappingView: 

157 """A mapping view of the dataset types in the graph. 

158 

159 This mapping has `str` parent dataset type name keys, but only provides 

160 access to its `DatasetTypeNode` values if `resolve` has been called 

161 since the last modification involving a task that uses a dataset type. 

162 See `DatasetTypeMappingView` for details. 

163 """ 

164 return self._dataset_types 

165 

166 @property 

167 def task_subsets(self) -> Mapping[str, TaskSubset]: 

168 """A mapping of all labeled subsets of tasks. 

169 

170 Keys are subset labels, values are sets of task labels. See 

171 `TaskSubset` for more information. 

172 

173 Use `add_task_subset` to add a new subset. The subsets themselves may 

174 be modified in-place. 

175 """ 

176 return self._task_subsets 

177 

178 @property 

179 def is_sorted(self) -> bool: 

180 """Whether this graph's tasks and dataset types are topologically 

181 sorted with the exact same deterministic tiebreakers that `sort` would 

182 apply. 

183 

184 This may perform (and then discard) a full sort if `has_been_sorted` is 

185 `False`. If the goal is to obtain a sorted graph, it is better to just 

186 call `sort` without guarding that with an ``if not graph.is_sorted`` 

187 check. 

188 """ 

189 if self._sorted_keys is not None: 

190 return True 

191 return all( 

192 sorted == unsorted 

193 for sorted, unsorted in zip( 

194 networkx.lexicographical_topological_sort(self._xgraph), self._xgraph, strict=True 

195 ) 

196 ) 

197 

198 @property 

199 def has_been_sorted(self) -> bool: 

200 """Whether this graph's tasks and dataset types have been 

201 topologically sorted (with unspecified but deterministic tiebreakers) 

202 since the last modification to the graph. 

203 

204 This may return `False` if the graph *happens* to be sorted but `sort` 

205 was never called, but it is potentially much faster than `is_sorted`, 

206 which may attempt (and then discard) a full sort if `has_been_sorted` 

207 is `False`. 

208 """ 

209 return self._sorted_keys is not None 

210 

211 def sort(self) -> None: 

212 """Sort this graph's nodes topologically with deterministic (but 

213 unspecified) tiebreakers. 

214 

215 This does nothing if the graph is already known to be sorted. 
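
        Examples
        --------
        A short sketch: after sorting, iteration over `tasks` (and
        `dataset_types`) is deterministic and topological::

            graph.sort()
            assert graph.has_been_sorted
            ordered_labels = list(graph.tasks)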

216 """ 

217 if self._sorted_keys is None: 

218 try: 

219 sorted_keys: Sequence[NodeKey] = list(networkx.lexicographical_topological_sort(self._xgraph)) 

220 except networkx.NetworkXUnfeasible as err: # pragma: no cover 

221 # Should't be possible to get here, because we check for cycles 

222 # when adding tasks, but we guard against it anyway. 

223 cycle = networkx.find_cycle(self._xgraph) 

224 raise PipelineDataCycleError( 

225 f"Cycle detected while attempting to sort graph: {cycle}." 

226 ) from err 

227 self._reorder(sorted_keys) 

228 

229 def copy(self) -> PipelineGraph: 

230 """Return a copy of this graph that copies all mutable state.""" 

231 xgraph = self._xgraph.copy() 

232 result = PipelineGraph.__new__(PipelineGraph) 

233 result._init_from_args( 

234 xgraph, 

235 self._sorted_keys, 

236 task_subsets={ 

237 k: TaskSubset(xgraph, v.label, set(v._members), v.description) 

238 for k, v in self._task_subsets.items() 

239 }, 

240 description=self._description, 

241 universe=self.universe, 

242 data_id=self._raw_data_id, 

243 ) 

244 return result 

245 

246 def __copy__(self) -> PipelineGraph: 

247 # Fully shallow copies are dangerous; we don't want shared mutable 

248 # state to lead to broken class invariants. 

249 return self.copy() 

250 

251 def __deepcopy__(self, memo: dict) -> PipelineGraph: 

252 # Genuine deep copies are unnecessary, since we should only ever care 

253 # that mutable state is copied. 

254 return self.copy() 

255 

256 def producing_edge_of(self, dataset_type_name: str) -> WriteEdge | None: 

257 """Return the `WriteEdge` that links the producing task to the named 

258 dataset type. 

259 

260 Parameters 

261 ---------- 

262 dataset_type_name : `str` 

263 Dataset type name. Must not be a component. 

264 

265 Returns 

266 ------- 

267 edge : `WriteEdge` or `None` 

268 Producing edge or `None` if there isn't one in this graph. 

269 

270 Raises 

271 ------ 

272 DuplicateOutputError 

273 Raised if there are multiple tasks defined to produce this dataset 

274 type. This is only possible if the graph's dataset types are not 

275 resolved. 

276 

277 Notes 

278 ----- 

279 On resolved graphs, it may be slightly more efficient to use:: 

280 

281 graph.dataset_types[dataset_type_name].producing_edge 

282 

283 but this method works on graphs with unresolved dataset types as well. 

284 """ 

285 producer: str | None = None 

286 producing_edge: WriteEdge | None = None 

287 for _, _, producing_edge in self._xgraph.in_edges( 

288 NodeKey(NodeType.DATASET_TYPE, dataset_type_name), data="instance" 

289 ): 

290 assert producing_edge is not None, "Should only be None if we never loop." 

291 if producer is not None: 

292 raise DuplicateOutputError( 

293 f"Dataset type {dataset_type_name!r} is produced by both {producing_edge.task_label!r} " 

294 f"and {producer!r}." 

295 ) 

296 return producing_edge 

297 

298 def consuming_edges_of(self, dataset_type_name: str) -> list[ReadEdge]: 

299 """Return the `ReadEdge` objects that link the named dataset type to 

300 the tasks that consume it. 

301 

302 Parameters 

303 ---------- 

304 dataset_type_name : `str` 

305 Dataset type name. Must not be a component. 

306 

307 Returns 

308 ------- 

309 edges : `list` [ `ReadEdge` ] 

310 Edges that connect this dataset type to the tasks that consume it. 

311 

312 Notes 

313 ----- 

314 On resolved graphs, it may be slightly more efficient to use:: 

315 

316 graph.dataset_types[dataset_type_name].producing_edges 

317 

318 but this method works on graphs with unresolved dataset types as well. 

319 """ 

320 return [ 

321 edge 

322 for _, _, edge in self._xgraph.out_edges( 

323 NodeKey(NodeType.DATASET_TYPE, dataset_type_name), data="instance" 

324 ) 

325 ] 

326 

327 def producer_of(self, dataset_type_name: str) -> TaskNode | TaskInitNode | None: 

328 """Return the `TaskNode` or `TaskInitNode` that writes the given 

329 dataset type. 

330 

331 Parameters 

332 ---------- 

333 dataset_type_name : `str` 

334 Dataset type name. Must not be a component. 

335 

336 Returns 

337 ------- 

338 edge : `TaskNode`, `TaskInitNode`, or `None` 

339 Producing node or `None` if there isn't one in this graph. 

340 

341 Raises 

342 ------ 

343 DuplicateOutputError 

344 Raised if there are multiple tasks defined to produce this dataset 

345 type. This is only possible if the graph's dataset types are not 

346 resolved. 

347 """ 

348 if (producing_edge := self.producing_edge_of(dataset_type_name)) is not None: 

349 return self._xgraph.nodes[producing_edge.task_key]["instance"] 

350 return None 

351 

352 def consumers_of(self, dataset_type_name: str) -> list[TaskNode | TaskInitNode]: 

353 """Return the `TaskNode` and/or `TaskInitNode` objects that read 

354 the given dataset type. 

355 

356 Parameters 

357 ---------- 

358 dataset_type_name : `str` 

359 Dataset type name. Must not be a component. 

360 

361 Returns 

362 ------- 

363 edges : `list` [ `ReadEdge` ] 

364 Edges that connect this dataset type to the tasks that consume it. 

365 

366 Notes 

367 ----- 

368 On resolved graphs, it may be slightly more efficient to use:: 

369 

370 graph.dataset_types[dataset_type_name].producing_edges 

371 

372 but this method works on graphs with unresolved dataset types as well. 

373 """ 

374 return [ 

375 self._xgraph.nodes[consuming_edge.task_key]["instance"] 

376 for consuming_edge in self.consuming_edges_of(dataset_type_name) 

377 ] 

378 

379 def inputs_of(self, task_label: str, init: bool = False) -> dict[str, DatasetTypeNode | None]: 

380 """Return the dataset types that are inputs to a task. 

381 

382 Parameters 

383 ---------- 

384 task_label : `str` 

385 Label for the task in the pipeline. 

386 init : `bool`, optional 

387 If `True`, return init-input dataset types instead of runtime 

388 (including prerequisite) inputs. 

389 

390 Returns 

391 ------- 

392 inputs : `dict` [ `str`, `DatasetTypeNode` or `None` ] 

393 Dictionary parent dataset type name keys and either 

394 `DatasetTypeNode` values (if the dataset type has been resolved) 

395 or `None` values. 

396 

397 Notes 

398 ----- 

399 To get the input edges of a task or task init node (which provide 

400 information about storage class overrides nd components) use:: 

401 

402 graph.tasks[task_label].iter_all_inputs() 

403 

404 or 

405 

406 graph.tasks[task_label].init.iter_all_inputs() 

407 

408 or the various mapping attributes of the `TaskNode` and `TaskInitNode` 

409 class. 
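
        Examples
        --------
        A sketch with a hypothetical task label; `None` values appear only
        when the graph has not been resolved::

            for name, node in graph.inputs_of("makeWarp").items():
                if node is None:
                    print(f"{name} has not been resolved yet")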

410 """ 

411 node: TaskNode | TaskInitNode = self.tasks[task_label] if not init else self.tasks[task_label].init 

412 return { 

413 edge.parent_dataset_type_name: self._xgraph.nodes[edge.dataset_type_key]["instance"] 

414 for edge in node.iter_all_inputs() 

415 } 

416 

417 def outputs_of( 

418 self, task_label: str, init: bool = False, include_automatic_connections: bool = True 

419 ) -> dict[str, DatasetTypeNode | None]: 

420 """Return the dataset types that are outputs of a task. 

421 

422 Parameters 

423 ---------- 

424 task_label : `str` 

425 Label for the task in the pipeline. 

426 init : `bool`, optional 

427 If `True`, return init-output dataset types instead of runtime 

428 outputs. 

429 include_automatic_connections : `bool`, optional 

430 Whether to include automatic connections such as configs, metadata, 

431 and logs. 

432 

433 Returns 

434 ------- 

435 outputs : `dict` [ `str`, `DatasetTypeNode` or `None` ] 

436 Dictionary parent dataset type name keys and either 

437 `DatasetTypeNode` values (if the dataset type has been resolved) 

438 or `None` values. 

439 

440 Notes 

441 ----- 

442 To get the input edges of a task or task init node (which provide 

443 information about storage class overrides nd components) use:: 

444 

445 graph.tasks[task_label].iter_all_outputs() 

446 

447 or 

448 

449 graph.tasks[task_label].init.iter_all_outputs() 

450 

451 or the various mapping attributes of the `TaskNode` and `TaskInitNode` 

452 class. 

453 """ 

454 node: TaskNode | TaskInitNode = self.tasks[task_label] if not init else self.tasks[task_label].init 

455 iterable = node.iter_all_outputs() if include_automatic_connections else node.outputs.values() 

456 return { 

457 edge.parent_dataset_type_name: self._xgraph.nodes[edge.dataset_type_key]["instance"] 

458 for edge in iterable 

459 } 

460 

461 def resolve(self, registry: Registry) -> None: 

462 """Resolve all dimensions and dataset types and check them for 

463 consistency. 

464 

465 Resolving a graph also causes it to be sorted. 

466 

467 Parameters 

468 ---------- 

469 registry : `lsst.daf.butler.Registry` 

470 Client for the data repository to resolve against. 

471 

472 Notes 

473 ----- 

474 The `universe` attribute is set to ``registry.dimensions`` and used to 

475 set all `TaskNode.dimensions` attributes. Dataset type nodes are 

476 resolved by first looking for a registry definition, then using the 

477 producing task's definition, then looking for consistency between all 

478 consuming task definitions. 

479 

480 Raises 

481 ------ 

482 ConnectionTypeConsistencyError 

483 Raised if a prerequisite input for one task appears as a different 

484 kind of connection in any other task. 

485 DuplicateOutputError 

486 Raised if multiple tasks have the same dataset type as an output. 

487 IncompatibleDatasetTypeError 

488 Raised if different tasks have different definitions of a dataset 

489 type. Different but compatible storage classes are permitted. 

490 MissingDatasetTypeError 

491 Raised if a dataset type definition is required to exist in the 

492 data repository but none was found. This should only occur for 

493 dataset types that are not produced by a task in the pipeline and 

494 are consumed with different storage classes or as components by 

495 tasks in the pipeline. 

496 EdgesChangedError 

497 Raised if ``check_edges_unchanged=True`` and the edges of a task do 

498 change after import and reconfiguration. 
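
        Examples
        --------
        A sketch, assuming ``butler`` is a connected `lsst.daf.butler.Butler`
        client::

            graph.resolve(butler.registry)
            assert graph.is_sorted  # resolving also sorts the graph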

499 """ 

500 node_key: NodeKey 

501 updates: dict[NodeKey, TaskNode | DatasetTypeNode] = {} 

502 for node_key, node_state in self._xgraph.nodes.items(): 

503 match node_key.node_type: 

504 case NodeType.TASK: 

505 task_node: TaskNode = node_state["instance"] 

506 new_task_node = task_node._resolved(registry.dimensions) 

507 if new_task_node is not task_node: 

508 updates[node_key] = new_task_node 

509 case NodeType.DATASET_TYPE: 

510 dataset_type_node: DatasetTypeNode | None = node_state["instance"] 

511 new_dataset_type_node = DatasetTypeNode._from_edges( 

512 node_key, self._xgraph, registry, previous=dataset_type_node 

513 ) 

514 # Usage of `is`` here is intentional; `_from_edges` returns 

515 # `previous=dataset_type_node` if it can determine that it 

516 # doesn't need to change. 

517 if new_dataset_type_node is not dataset_type_node: 

518 updates[node_key] = new_dataset_type_node 

519 try: 

520 for node_key, node_value in updates.items(): 

521 self._xgraph.nodes[node_key]["instance"] = node_value 

522 except Exception as err: # pragma: no cover 

523 # There's no known way to get here, but we want to make it 

524 # clear it's a big problem if we do. 

525 raise PipelineGraphExceptionSafetyError( 

526 "Error during dataset type resolution has left the graph in an inconsistent state." 

527 ) from err 

528 self.sort() 

529 self._universe = registry.dimensions 

530 

531 ########################################################################### 

532 # 

533 # Graph Modification Interface: 

534 # 

535 # - methods to add, remove, and replace tasks; 

536 # 

537 # - methods to add and remove task subsets. 

538 # 

539 # These are all things that are usually done in a Pipeline before making a 

540 # graph at all, but there may be cases where we want to modify the graph 

541 # instead. (These are also the methods used to make a graph from a 

542 # Pipeline, or make a graph from another graph.) 

543 # 

544 ########################################################################### 

545 

546 def add_task( 

547 self, 

548 label: str, 

549 task_class: type[PipelineTask], 

550 config: PipelineTaskConfig, 

551 connections: PipelineTaskConnections | None = None, 

552 ) -> TaskNode: 

553 """Add a new task to the graph. 

554 

555 Parameters 

556 ---------- 

557 label : `str` 

558 Label for the task in the pipeline. 

559 task_class : `type` [ `PipelineTask` ] 

560 Class object for the task. 

561 config : `PipelineTaskConfig` 

562 Configuration for the task. 

563 connections : `PipelineTaskConnections`, optional 

564 Object that describes the dataset types used by the task. If not 

565 provided, one will be constructed from the given configuration. If 

566 provided, it is assumed that ``config`` has already been validated 

567 and frozen. 

568 

569 Returns 

570 ------- 

571 node : `TaskNode` 

572 The new task node added to the graph. 

573 

574 Raises 

575 ------ 

576 ValueError 

577 Raised if configuration validation failed when constructing 

578 ``connections``. 

579 PipelineDataCycleError 

580 Raised if the graph is cyclic after this addition. 

581 RuntimeError 

582 Raised if an unexpected exception (which will be chained) occurred 

583 at a stage that may have left the graph in an inconsistent state. 

584 Other exceptions should leave the graph unchanged. 

585 

586 Notes 

587 ----- 

588 Checks for dataset type consistency and multiple producers do not occur 

589 until `resolve` is called, since the resolution depends on both the 

590 state of the data repository and all contributing tasks. 

591 

592 Adding new tasks removes any existing resolutions of all dataset types 

593 it references and marks the graph as unsorted. It is most effiecient 

594 to add all tasks up front and only then resolve and/or sort the graph. 
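
        Examples
        --------
        A hedged sketch; ``MyTask`` is a hypothetical `PipelineTask` subclass
        (``ConfigClass`` is the usual task config attribute)::

            config = MyTask.ConfigClass()
            node = graph.add_task("myTask", MyTask, config)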

595 """ 

596 task_node = TaskNode._from_imported_data( 

597 key=NodeKey(NodeType.TASK, label), 

598 init_key=NodeKey(NodeType.TASK_INIT, label), 

599 data=_TaskNodeImportedData.configure(label, task_class, config, connections), 

600 universe=self.universe, 

601 ) 

602 self.add_task_nodes([task_node]) 

603 return task_node 

604 

605 def add_task_nodes(self, nodes: Iterable[TaskNode], parent: PipelineGraph | None = None) -> None: 

606 """Add one or more existing task nodes to the graph. 

607 

608 Parameters 

609 ---------- 

610 nodes : `~collections.abc.Iterable` [ `TaskNode` ] 

611 Iterable of task nodes to add. If any tasks have resolved 

612 dimensions, they must have the same dimension universe as the rest 

613 of the graph. 

614 parent : `PipelineGraph`, optional 

615 If provided, another `PipelineGraph` from which these nodes were 

616 obtained. Any dataset type nodes already present in ``parent`` 

617 that are referenced by the given tasks will be used in this graph 

618 if they are not already present, preserving any dataset type 

619 resolutions present in the parent graph. Adding nodes from a 

620 parent graph after the graph has its own nodes (e.g. from 

621 `add_task`) or nodes from a third graph may result in invalid 

622 dataset type resolutions. It is safest to only use this argument 

623 when populating an empty graph for the first time. 

624 

625 Raises 

626 ------ 

627 PipelineDataCycleError 

628 Raised if the graph is cyclic after this addition. 

629 

630 Notes 

631 ----- 

632 Checks for dataset type consistency and multiple producers do not occur 

633 until `resolve` is called, since the resolution depends on both the 

634 state of the data repository and all contributing tasks. 

635 

636 Adding new tasks removes any existing resolutions of all dataset types 

637 it references (unless ``parent is not None`` and marks the graph as 

638 unsorted. It is most efficient to add all tasks up front and only then 

639 resolve and/or sort the graph. 

640 """ 

641 node_data: list[tuple[NodeKey, dict[str, Any]]] = [] 

642 edge_data: list[tuple[NodeKey, NodeKey, str, dict[str, Any]]] = [] 

643 for task_node in nodes: 

644 task_node = task_node._resolved(self._universe) 

645 node_data.append( 

646 (task_node.key, {"instance": task_node, "bipartite": task_node.key.node_type.bipartite}) 

647 ) 

648 node_data.append( 

649 ( 

650 task_node.init.key, 

651 {"instance": task_node.init, "bipartite": task_node.init.key.node_type.bipartite}, 

652 ) 

653 ) 

654 # Convert the edge objects attached to the task node to networkx. 

655 for read_edge in task_node.init.iter_all_inputs(): 

656 self._append_graph_data_from_edge(node_data, edge_data, read_edge, parent=parent) 

657 for write_edge in task_node.init.iter_all_outputs(): 

658 self._append_graph_data_from_edge(node_data, edge_data, write_edge, parent=parent) 

659 for read_edge in task_node.iter_all_inputs(): 

660 self._append_graph_data_from_edge(node_data, edge_data, read_edge, parent=parent) 

661 for write_edge in task_node.iter_all_outputs(): 

662 self._append_graph_data_from_edge(node_data, edge_data, write_edge, parent=parent) 

663 # Add a special edge (with no Edge instance) that connects the 

664 # TaskInitNode to the runtime TaskNode. 

665 edge_data.append((task_node.init.key, task_node.key, Edge.INIT_TO_TASK_NAME, {"instance": None})) 

666 if not node_data and not edge_data: 

667 return 

668 # Checks and preparation complete; time to start the actual 

669 # modification, during which it's hard to provide strong exception 

670 # safety. Start by resetting the sort ordering, if there is one. 

671 self._reset() 

672 try: 

673 self._xgraph.add_nodes_from(node_data) 

674 self._xgraph.add_edges_from(edge_data) 

675 if not networkx.algorithms.dag.is_directed_acyclic_graph(self._xgraph): 

676 cycle = networkx.find_cycle(self._xgraph) 

677 raise PipelineDataCycleError(f"Cycle detected while adding tasks: {cycle}.") 

678 except Exception: 

679 # First try to roll back our changes. 

680 try: 

681 self._xgraph.remove_edges_from(edge_data) 

682 self._xgraph.remove_nodes_from(key for key, _ in node_data) 

683 except Exception as err: # pragma: no cover 

684 # There's no known way to get here, but we want to make it 

685 # clear it's a big problem if we do. 

686 raise PipelineGraphExceptionSafetyError( 

687 "Error while attempting to revert PipelineGraph modification has left the graph in " 

688 "an inconsistent state." 

689 ) from err 

690 # Successfully rolled back; raise the original exception. 

691 raise 

692 

693 def reconfigure_tasks( 

694 self, 

695 *args: tuple[str, PipelineTaskConfig], 

696 check_edges_unchanged: bool = False, 

697 assume_edges_unchanged: bool = False, 

698 **kwargs: PipelineTaskConfig, 

699 ) -> None: 

700 """Update the configuration for one or more tasks. 

701 

702 Parameters 

703 ---------- 

704 *args : `tuple` [ `str`, `.PipelineTaskConfig` ] 

705 Positional arguments are each a 2-tuple of task label and new 

706 config object. Note that the same arguments may also be passed as 

707 ``**kwargs``, which is usually more readable, but task labels in 

708 ``*args`` are not required to be valid Python identifiers. 

709 check_edges_unchanged : `bool`, optional 

710 If `True`, require the edges (connections) of the modified tasks to 

711 remain unchanged after the configuration updates, and verify that 

712 this is the case. 

713 assume_edges_unchanged : `bool`, optional 

714 If `True`, the caller declares that the edges (connections) of the 

715 modified tasks will remain unchanged after the configuration 

716 updates, and that it is unnecessary to check this. 

717 **kwargs : `.PipelineTaskConfig` 

718 New config objects or overrides to apply to copies of the current 

719 config objects, with task labels as the keywords. 

720 

721 Raises 

722 ------ 

723 ValueError 

724 Raised if ``assume_edges_unchanged`` and ``check_edges_unchanged`` 

725 are both `True`, or if the same task appears twice. 

726 EdgesChangedError 

727 Raised if ``check_edges_unchanged=True`` and the edges of a task do 

728 change. 

729 

730 Notes 

731 ----- 

732 If reconfiguring a task causes its edges to change, any dataset type 

733 nodes connected to that task (not just those whose edges have changed!) 

734 will be unresolved. 
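
        Examples
        --------
        A sketch with a hypothetical task label and config; the keyword form
        is usually the most readable::

            graph.reconfigure_tasks(myTask=new_config, check_edges_unchanged=True)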

735 """ 

736 new_configs: dict[str, PipelineTaskConfig] = {} 

737 for task_label, config_update in itertools.chain(args, kwargs.items()): 

738 if new_configs.setdefault(task_label, config_update) is not config_update: 

739 raise ValueError(f"Config for {task_label!r} provided more than once.") 

740 updates = { 

741 task_label: self.tasks[task_label]._reconfigured(config, rebuild=not assume_edges_unchanged) 

742 for task_label, config in new_configs.items() 

743 } 

744 self._replace_task_nodes( 

745 updates, 

746 check_edges_unchanged=check_edges_unchanged, 

747 assume_edges_unchanged=assume_edges_unchanged, 

748 message_header=( 

749 "Unexpected change in edges for task {task_label!r} from original config (A) to " 

750 "new configs (B):" 

751 ), 

752 ) 

753 

754 def remove_tasks( 

755 self, labels: Iterable[str], drop_from_subsets: bool = True 

756 ) -> list[tuple[TaskNode, set[str]]]: 

757 """Remove one or more tasks from the graph. 

758 

759 Parameters 

760 ---------- 

761 labels : `~collections.abc.Iterable` [ `str` ] 

762 Iterable of the labels of the tasks to remove. 

763 drop_from_subsets : `bool`, optional 

764 If `True`, drop each removed task from any subset in which it 

765 currently appears. If `False`, raise `PipelineGraphError` if any 

766 such subsets exist. 

767 

768 Returns 

769 ------- 

770 nodes_and_subsets : `list` [ `tuple` [ `TaskNode`, `set` [ `str` ] ] ] 

771 List of nodes removed and the labels of task subsets that 

772 referenced them. 

773 

774 Raises 

775 ------ 

776 PipelineGraphError 

777 Raised if ``drop_from_subsets`` is `False` and the task is still 

778 part of one or more subsets. 

779 

780 Notes 

781 ----- 

782 Removing a task will cause dataset nodes with no other referencing 

783 tasks to be removed. Any other dataset type nodes referenced by a 

784 removed task will be reset to an "unresolved" state. 
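
        Examples
        --------
        A sketch with a hypothetical task label::

            for task_node, subset_labels in graph.remove_tasks(["myTask"]):
                print(f"removed {task_node.label} from subsets {subset_labels}")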

785 """ 

786 task_nodes_and_subsets = [] 

787 dataset_types: set[NodeKey] = set() 

788 nodes_to_remove = set() 

789 for label in labels: 

790 task_node: TaskNode = self._xgraph.nodes[NodeKey(NodeType.TASK, label)]["instance"] 

791 # Find task subsets that reference this task. 

792 referencing_subsets = { 

793 subset_label 

794 for subset_label, task_subset in self.task_subsets.items() 

795 if label in task_subset 

796 } 

797 if not drop_from_subsets and referencing_subsets: 

798 raise PipelineGraphError( 

799 f"Task {label!r} is still referenced by subset(s) {referencing_subsets}." 

800 ) 

801 task_nodes_and_subsets.append((task_node, referencing_subsets)) 

802 # Find dataset types referenced by this task. 

803 dataset_types.update(self._xgraph.predecessors(task_node.key)) 

804 dataset_types.update(self._xgraph.successors(task_node.key)) 

805 dataset_types.update(self._xgraph.predecessors(task_node.init.key)) 

806 dataset_types.update(self._xgraph.successors(task_node.init.key)) 

807 # Since there's an edge between the task and its init node, we'll 

808 # have added those two nodes here, too, and we don't want that. 

809 dataset_types.remove(task_node.init.key) 

810 dataset_types.remove(task_node.key) 

811 # Mark the task node and its init node for removal from the graph. 

812 nodes_to_remove.add(task_node.key) 

813 nodes_to_remove.add(task_node.init.key) 

814 # Process the referenced datasets to see which ones are orphaned and 

815 # need to be removed vs. just unresolved. 

816 nodes_to_unresolve = [] 

817 for dataset_type_key in dataset_types: 

818 related_tasks = set() 

819 related_tasks.update(self._xgraph.predecessors(dataset_type_key)) 

820 related_tasks.update(self._xgraph.successors(dataset_type_key)) 

821 related_tasks.difference_update(nodes_to_remove) 

822 if not related_tasks: 

823 nodes_to_remove.add(dataset_type_key) 

824 else: 

825 nodes_to_unresolve.append(dataset_type_key) 

826 # Checks and preparation complete; time to start the actual 

827 # modification, during which it's hard to provide strong exception 

828 # safety. Start by resetting the sort ordering. 

829 self._reset() 

830 try: 

831 for dataset_type_key in nodes_to_unresolve: 

832 self._xgraph.nodes[dataset_type_key]["instance"] = None 

833 for task_node, referencing_subsets in task_nodes_and_subsets: 

834 for subset_label in referencing_subsets: 

835 self._task_subsets[subset_label].remove(task_node.label) 

836 self._xgraph.remove_nodes_from(nodes_to_remove) 

837 except Exception as err: # pragma: no cover 

838 # There's no known way to get here, but we want to make it 

839 # clear it's a big problem if we do. 

840 raise PipelineGraphExceptionSafetyError( 

841 "Error during task removal has left the graph in an inconsistent state." 

842 ) from err 

843 return task_nodes_and_subsets 

844 

845 def add_task_subset(self, subset_label: str, task_labels: Iterable[str], description: str = "") -> None: 

846 """Add a label for a set of tasks that are already in the pipeline. 

847 

848 Parameters 

849 ---------- 

850 subset_label : `str` 

851 Label for this set of tasks. 

852 task_labels : `~collections.abc.Iterable` [ `str` ] 

853 Labels of the tasks to include in the set. All must already be 

854 included in the graph. 

855 description : `str`, optional 

856 String description to associate with this label. 

857 """ 

858 subset = TaskSubset(self._xgraph, subset_label, set(task_labels), description) 

859 self._task_subsets[subset_label] = subset 

860 

861 def remove_task_subset(self, subset_label: str) -> None: 

862 """Remove a labeled set of tasks.""" 

863 del self._task_subsets[subset_label] 

864 

865 ########################################################################### 

866 # 

867 # NetworkX Export Interface: 

868 # 

869 # - methods to export the PipelineGraph's content (or various subsets 

870 # thereof) as NetworkX objects. 

871 # 

872 # These are particularly useful when writing tools to visualize the graph, 

873 # while providing options for which aspects of the graph (tasks, dataset 

874 # types, or both) to include, since all exported graphs have similar 

875 # attributes regardless of their structure. 

876 # 

877 ########################################################################### 

878 

879 def make_xgraph(self) -> networkx.MultiDiGraph: 

880 """Export a networkx representation of the full pipeline graph, 

881 including both init and runtime edges. 

882 

883 Returns 

884 ------- 

885 xgraph : `networkx.MultiDiGraph` 

886 Directed acyclic graph with parallel edges. 

887 

888 Notes 

889 ----- 

890 The returned graph uses `NodeKey` instances for nodes. Parallel edges 

891 represent the same dataset type appearing in multiple connections for 

892 the same task, and are hence rare. The connection name is used as the 

893 edge key to disambiguate those parallel edges. 

894 

895 Almost all edges connect dataset type nodes to task or task init nodes 

896 or vice versa, but there is also a special edge that connects each task 

897 init node to its runtime node. The existence of these edges makes the 

898 graph not quite bipartite, though its init-only and runtime-only 

899 subgraphs are bipartite. 

900 

901 See `TaskNode`, `TaskInitNode`, `DatasetTypeNode`, `ReadEdge`, and 

902 `WriteEdge` for the descriptive node and edge attributes added. 
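
        Examples
        --------
        A sketch of a typical traversal of the exported graph (the export is
        a copy, so mutating it does not affect this `PipelineGraph`)::

            xgraph = graph.make_xgraph()
            for node_key in networkx.topological_sort(xgraph):
                print(node_key.node_type, node_key.name)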

903 """ 

904 return self._transform_xgraph_state(self._xgraph.copy(), skip_edges=False) 

905 

906 def make_bipartite_xgraph(self, init: bool = False) -> networkx.MultiDiGraph: 

907 """Return a bipartite networkx representation of just the runtime or 

908 init-time pipeline graph. 

909 

910 Parameters 

911 ---------- 

912 init : `bool`, optional 

913 If `True` (`False` is default) return the graph of task 

914 initialization nodes and init input/output dataset types, instead 

915 of the graph of runtime task nodes and regular 

916 input/output/prerequisite dataset types. 

917 

918 Returns 

919 ------- 

920 xgraph : `networkx.MultiDiGraph` 

921 Directed acyclic graph with parallel edges. 

922 

923 Notes 

924 ----- 

925 The returned graph uses `NodeKey` instances for nodes. Parallel edges 

926 represent the same dataset type appearing in multiple connections for 

927 the same task, and are hence rare. The connection name is used as the 

928 edge key to disambiguate those parallel edges. 

929 

930 This graph is bipartite because each dataset type node only has edges 

931 that connect it to a task [init] node, and vice versa. 

932 

933 See `TaskNode`, `TaskInitNode`, `DatasetTypeNode`, `ReadEdge`, and 

934 `WriteEdge` for the descriptive node and edge attributes added. 

935 """ 

936 return self._transform_xgraph_state( 

937 self._make_bipartite_xgraph_internal(init).copy(), skip_edges=False 

938 ) 

939 

940 def make_task_xgraph(self, init: bool = False) -> networkx.DiGraph: 

941 """Return a networkx representation of just the tasks in the pipeline. 

942 

943 Parameters 

944 ---------- 

945 init : `bool`, optional 

946 If `True` (`False` is default) return the graph of task 

947 initialization nodes, instead of the graph of runtime task nodes. 

948 

949 Returns 

950 ------- 

951 xgraph : `networkx.DiGraph` 

952 Directed acyclic graph with no parallel edges. 

953 

954 Notes 

955 ----- 

956 The returned graph uses `NodeKey` instances for nodes. The dataset 

957 types that link these tasks are not represented at all; edges have no 

958 attributes, and there are no parallel edges. 

959 

960 See `TaskNode` and `TaskInitNode` for the descriptive node and 

961 attributes added. 

962 """ 

963 bipartite_xgraph = self._make_bipartite_xgraph_internal(init) 

964 task_keys = [ 

965 key 

966 for key, bipartite in bipartite_xgraph.nodes(data="bipartite") 

967 if bipartite == NodeType.TASK.bipartite 

968 ] 

969 return self._transform_xgraph_state( 

970 networkx.algorithms.bipartite.projected_graph(networkx.DiGraph(bipartite_xgraph), task_keys), 

971 skip_edges=True, 

972 ) 

973 

974 def make_dataset_type_xgraph(self, init: bool = False) -> networkx.DiGraph: 

975 """Return a networkx representation of just the dataset types in the 

976 pipeline. 

977 

978 Parameters 

979 ---------- 

980 init : `bool`, optional 

981 If `True` (`False` is default) return the graph of init input and 

982 output dataset types, instead of the graph of runtime (input, 

983 output, prerequisite input) dataset types. 

984 

985 Returns 

986 ------- 

987 xgraph : `networkx.DiGraph` 

988 Directed acyclic graph with no parallel edges. 

989 

990 Notes 

991 ----- 

992 The returned graph uses `NodeKey` instances for nodes. The tasks that 

993 link these tasks are not represented at all; edges have no attributes, 

994 and there are no parallel edges. 

995 

996 See `DatasetTypeNode` for the descriptive node and attributes added. 

997 """ 

998 bipartite_xgraph = self._make_bipartite_xgraph_internal(init) 

999 dataset_type_keys = [ 

1000 key 

1001 for key, bipartite in bipartite_xgraph.nodes(data="bipartite") 

1002 if bipartite == NodeType.DATASET_TYPE.bipartite 

1003 ] 

1004 return self._transform_xgraph_state( 

1005 networkx.algorithms.bipartite.projected_graph( 

1006 networkx.DiGraph(bipartite_xgraph), dataset_type_keys 

1007 ), 

1008 skip_edges=True, 

1009 ) 

1010 

1011 ########################################################################### 

1012 # 

1013 # Serialization Interface. 

1014 # 

1015 # Serialization of PipelineGraphs is currently experimental and may not be 

1016 # retained in the future. All serialization methods are 

1017 # underscore-prefixed to ensure nobody mistakes them for a stable interface 

1018 # (let a lone a stable file format). 

1019 # 

1020 ########################################################################### 

1021 

1022 @classmethod 

1023 def _read_stream( 

1024 cls, stream: BinaryIO, import_mode: TaskImportMode = TaskImportMode.REQUIRE_CONSISTENT_EDGES 

1025 ) -> PipelineGraph: 

1026 """Read a serialized `PipelineGraph` from a file-like object. 

1027 

1028 Parameters 

1029 ---------- 

1030 stream : `BinaryIO` 

1031 File-like object opened for binary reading, containing 

1032 gzip-compressed JSON. 

1033 import_mode : `TaskImportMode`, optional 

1034 Whether to import tasks, and how to reconcile any differences 

1035 between the imported task's connections and the those that were 

1036 persisted with the graph. Default is to check that they are the 

1037 same. 

1038 

1039 Returns 

1040 ------- 

1041 graph : `PipelineGraph` 

1042 Deserialized pipeline graph. 

1043 

1044 Raises 

1045 ------ 

1046 PipelineGraphReadError 

1047 Raised if the serialized `PipelineGraph` is not self-consistent. 

1048 EdgesChangedError 

1049 Raised if ``import_mode`` is 

1050 `TaskImportMode.REQUIRED_CONSISTENT_EDGES` and the edges of a task 

1051 did change after import and reconfiguration. 

1052 

1053 Notes 

1054 ----- 

1055 `PipelineGraph` serialization is currently experimental and may be 

1056 removed or significantly changed in the future, with no deprecation 

1057 period. 

1058 """ 

1059 from .io import SerializedPipelineGraph 

1060 

1061 with gzip.open(stream, "rb") as uncompressed_stream: 

1062 data = json.load(uncompressed_stream) 

1063 serialized_graph = SerializedPipelineGraph.parse_obj(data) 

1064 return serialized_graph.deserialize(import_mode) 

1065 

1066 @classmethod 

1067 def _read_uri( 

1068 cls, 

1069 uri: ResourcePathExpression, 

1070 import_mode: TaskImportMode = TaskImportMode.REQUIRE_CONSISTENT_EDGES, 

1071 ) -> PipelineGraph: 

1072 """Read a serialized `PipelineGraph` from a file at a URI. 

1073 

1074 Parameters 

1075 ---------- 

1076 uri : convertible to `lsst.resources.ResourcePath` 

1077 URI to a gzip-compressed JSON file containing a serialized pipeline 

1078 graph. 

1079 import_mode : `TaskImportMode`, optional 

1080 Whether to import tasks, and how to reconcile any differences 

1081 between the imported task's connections and the those that were 

1082 persisted with the graph. Default is to check that they are the 

1083 same. 

1084 

1085 Returns 

1086 ------- 

1087 graph : `PipelineGraph` 

1088 Deserialized pipeline graph. 

1089 

1090 Raises 

1091 ------ 

1092 PipelineGraphReadError 

1093 Raised if the serialized `PipelineGraph` is not self-consistent. 

1094 EdgesChangedError 

1095 Raised if ``import_mode`` is 

1096 `TaskImportMode.REQUIRED_CONSISTENT_EDGES` and the edges of a task 

1097 did change after import and reconfiguration. 

1098 

1099 Notes 

1100 ----- 

1101 `PipelineGraph` serialization is currently experimental and may be 

1102 removed or significantly changed in the future, with no deprecation 

1103 period. 
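
        Examples
        --------
        A round-trip sketch (the path is hypothetical, and these
        underscore-prefixed methods are experimental)::

            graph._write_uri("pipeline_graph.json.gz")
            same_graph = PipelineGraph._read_uri("pipeline_graph.json.gz")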

1104 """ 

1105 uri = ResourcePath(uri) 

1106 with uri.open("rb") as stream: 

1107 return cls._read_stream(cast(BinaryIO, stream), import_mode=import_mode) 

1108 

1109 def _write_stream(self, stream: BinaryIO) -> None: 

1110 """Write the pipeline to a file-like object. 

1111 

1112 Parameters 

1113 ---------- 

1114 stream 

1115 File-like object opened for binary writing. 

1116 

1117 Notes 

1118 ----- 

1119 `PipelineGraph` serialization is currently experimental and may be 

1120 removed or significantly changed in the future, with no deprecation 

1121 period. 

1122 

1123 The file format is gzipped JSON, and is intended to be human-readable, 

1124 but it should not be considered a stable public interface for outside 

1125 code, which should always use `PipelineGraph` methods (or at least the 

1126 `io.SerializedPipelineGraph` class) to read these files. 

1127 """ 

1128 from .io import SerializedPipelineGraph 

1129 

1130 with gzip.open(stream, mode="wb") as compressed_stream: 

1131 compressed_stream.write( 

1132 SerializedPipelineGraph.serialize(self).json(exclude_defaults=True).encode("utf-8") 

1133 ) 

1134 

1135 def _write_uri(self, uri: ResourcePathExpression) -> None: 

1136 """Write the pipeline to a file given a URI. 

1137 

1138 Parameters 

1139 ---------- 

1140 uri : convertible to `lsst.resources.ResourcePath` 

1141 URI to write to . May have ``.json.gz`` or no extension (which 

1142 will cause a ``.json.gz`` extension to be added). 

1143 

1144 Notes 

1145 ----- 

1146 `PipelineGraph` serialization is currently experimental and may be 

1147 removed or significantly changed in the future, with no deprecation 

1148 period. 

1149 

1150 The file format is gzipped JSON, and is intended to be human-readable, 

1151 but it should not be considered a stable public interface for outside 

1152 code, which should always use `PipelineGraph` methods (or at least the 

1153 `io.SerializedPipelineGraph` class) to read these files. 

1154 """ 

1155 uri = ResourcePath(uri) 

1156 extension = uri.getExtension() 

1157 if not extension: 

1158 uri = uri.updatedExtension(".json.gz") 

1159 elif extension != ".json.gz": 

1160 raise ValueError("Expanded pipeline files should always have a .json.gz extension.") 

1161 with uri.open(mode="wb") as stream: 

1162 self._write_stream(cast(BinaryIO, stream)) 

1163 

1164 def _import_and_configure( 

1165 self, import_mode: TaskImportMode = TaskImportMode.REQUIRE_CONSISTENT_EDGES 

1166 ) -> None: 

1167 """Import the `PipelineTask` classes referenced by all task nodes and 

1168 update those nodes accordingly. 

1169 

1170 Parameters 

1171 ---------- 

1172 import_mode : `TaskImportMode`, optional 

1173 Whether to import tasks, and how to reconcile any differences 

1174 between the imported task's connections and the those that were 

1175 persisted with the graph. Default is to check that they are the 

1176 same. This method does nothing if this is 

1177 `TaskImportMode.DO_NOT_IMPORT`. 

1178 

1179 Raises 

1180 ------ 

1181 EdgesChangedError 

1182 Raised if ``import_mode`` is 

1183 `TaskImportMode.REQUIRED_CONSISTENT_EDGES` and the edges of a task 

1184 did change after import and reconfiguration. 

1185 

1186 Notes 

1187 ----- 

1188 This method shouldn't need to be called unless the graph was 

1189 deserialized without importing and configuring immediately, which is 

1190 not the default behavior (but it can greatly speed up deserialization). 

1191 If all tasks have already been imported this does nothing. 

1192 

1193 Importing and configuring a task can change its 

1194 `~TaskNode.task_class_name` or `~TaskClass.get_config_str` output, 

1195 usually because the software used to read a serialized graph is newer 

1196 than the software used to write it (e.g. a new config option has been 

1197 added, or the task was moved to a new module with a forwarding alias 

1198 left behind). These changes are allowed by 

1199 `TaskImportMode.REQUIRE_CONSISTENT_EDGES`. 

1200 

1201 If importing and configuring a task causes its edges to change, any 

1202 dataset type nodes linked to those edges will be reset to the 

1203 unresolved state. 

1204 """ 

1205 if import_mode is TaskImportMode.DO_NOT_IMPORT: 

1206 return 

1207 rebuild = ( 

1208 import_mode is TaskImportMode.REQUIRE_CONSISTENT_EDGES 

1209 or import_mode is TaskImportMode.OVERRIDE_EDGES 

1210 ) 

1211 updates: dict[str, TaskNode] = {} 

1212 node_key: NodeKey 

1213 for node_key, node_state in self._xgraph.nodes.items(): 

1214 if node_key.node_type is NodeType.TASK: 

1215 task_node: TaskNode = node_state["instance"] 

1216 new_task_node = task_node._imported_and_configured(rebuild) 

1217 if new_task_node is not task_node: 

1218 updates[task_node.label] = new_task_node 

1219 self._replace_task_nodes( 

1220 updates, 

1221 check_edges_unchanged=(import_mode is TaskImportMode.REQUIRE_CONSISTENT_EDGES), 

1222 assume_edges_unchanged=(import_mode is TaskImportMode.ASSUME_CONSISTENT_EDGES), 

1223 message_header=( 

1224 "In task with label {task_label!r}, persisted edges (A)" 

1225 "differ from imported and configured edges (B):" 

1226 ), 

1227 ) 

1228 

1229 ########################################################################### 

1230 # 

1231 # Advanced PipelineGraph Inspection Interface: 

1232 # 

1233 # - methods to iterate over all nodes and edges, utilizing NodeKeys; 

1234 # 

1235 # - methods to find overall inputs and group nodes by their dimensions, 

1236 # which are important operations for QuantumGraph generation. 

1237 # 

1238 ########################################################################### 

1239 

1240 def iter_edges(self, init: bool = False) -> Iterator[Edge]: 

1241 """Iterate over edges in the graph. 

1242 

1243 Parameters 

1244 ---------- 

1245 init : `bool`, optional 

1246 If `True` (`False` is default) iterate over the edges between task 

1247 initialization node and init input/output dataset types, instead of 

1248 the runtime task nodes and regular input/output/prerequisite 

1249 dataset types. 

1250 

1251 Returns 

1252 ------- 

1253 edges : `~collections.abc.Iterator` [ `Edge` ] 

1254 A lazy iterator over `Edge` (`WriteEdge` or `ReadEdge`) instances. 

1255 

1256 Notes 

1257 ----- 

1258 This method always returns _either_ init edges or runtime edges, never 

1259 both. The full (internal) graph that contains both also includes a 

1260 special edge that connects each task init node to its runtime node; 

1261 that is also never returned by this method, since it is never a part of 

1262 the init-only or runtime-only subgraphs. 

1263 """ 

1264 edge: Edge 

1265 for _, _, edge in self._xgraph.edges(data="instance"): 

1266 if edge is not None and edge.is_init == init: 

1267 yield edge 

1268 

1269 def iter_nodes( 

1270 self, 

1271 ) -> Iterator[ 

1272 tuple[Literal[NodeType.TASK_INIT], str, TaskInitNode] 

1273 | tuple[Literal[NodeType.TASK], str, TaskInitNode] 

1274 | tuple[Literal[NodeType.DATASET_TYPE], str, DatasetTypeNode | None] 

1275 ]: 

1276 """Iterate over nodes in the graph. 

1277 

1278 Returns 

1279 ------- 

1280 nodes : `~collections.abc.Iterator` [ `tuple` ] 

1281 A lazy iterator over all of the nodes in the graph. Each yielded 

1282 element is a tuple of: 

1283 

1284 - the node type enum value (`NodeType`); 

1285 - the string name for the node (task label or parent dataset type 

1286 name); 

1287 - the node value (`TaskNode`, `TaskInitNode`, `DatasetTypeNode`, 

1288 or `None` for dataset type nodes that have not been resolved). 

1289 """ 

1290 key: NodeKey 

1291 if self._sorted_keys is not None: 

1292 for key in self._sorted_keys: 

1293 yield key.node_type, key.name, self._xgraph.nodes[key]["instance"] # type: ignore 

1294 else: 

1295 for key, node in self._xgraph.nodes(data="instance"): 

1296 yield key.node_type, key.name, node # type: ignore 

1297 

1298 def iter_overall_inputs(self) -> Iterator[tuple[str, DatasetTypeNode | None]]: 

1299 """Iterate over all of the dataset types that are consumed but not 

1300 produced by the graph. 

1301 

1302 Returns 

1303 ------- 

1304 dataset_types : `~collections.abc.Iterator` [ `tuple` ] 

1305 A lazy iterator over the overall-input dataset types (including 

1306 overall init inputs and prerequisites). Each yielded element is a 

1307 tuple of: 

1308 

1309 - the parent dataset type name; 

1310 - the resolved `DatasetTypeNode`, or `None` if the dataset type has 

1311 - not been resolved. 
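
        Examples
        --------
        A sketch that collects the names of all overall-input dataset types::

            input_names = [name for name, _ in graph.iter_overall_inputs()]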

1312 """ 

1313 for generation in networkx.algorithms.dag.topological_generations(self._xgraph): 

1314 key: NodeKey 

1315 for key in generation: 

1316 # While we expect all tasks to have at least one input and 

1317 # hence never appear in the first topological generation, that 

1318 # is not true of task init nodes. 

1319 if key.node_type is NodeType.DATASET_TYPE: 

1320 yield key.name, self._xgraph.nodes[key]["instance"] 

1321 return 

1322 

1323 def group_by_dimensions( 

1324 self, prerequisites: bool = False 

1325 ) -> dict[DimensionGraph, tuple[dict[str, TaskNode], dict[str, DatasetTypeNode]]]: 

1326 """Group this graph's tasks and dataset types by their dimensions. 

1327 

1328 Parameters 

1329 ---------- 

1330 prerequisites : `bool`, optional 

1331 If `True`, include prerequisite dataset types as well as regular 

1332 input and output datasets (including intermediates). 

1333 

1334 Returns 

1335 ------- 

1336 groups : `dict` [ `DimensionGraph`, `tuple` ] 

1337 A dictionary of groups keyed by `DimensionGraph`, in which each 

1338 value is a tuple of: 

1339 

1340 - a `dict` of `TaskNode` instances, keyed by task label 

1341 - a `dict` of `DatasetTypeNode` instances, keyed by 

1342 dataset type name. 

1343 

1344 that have those dimensions. 

1345 

1346 Notes 

1347 ----- 

1348 Init inputs and outputs are always included, but always have empty 

1349 dimensions and are hence are all grouped together. 
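
        Examples
        --------
        A sketch; on an unresolved graph this raises `UnresolvedGraphError`
        instead::

            for dimensions, (tasks, dataset_types) in graph.group_by_dimensions().items():
                print(dimensions, sorted(tasks), sorted(dataset_types))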

1350 """ 

1351 result: dict[DimensionGraph, tuple[dict[str, TaskNode], dict[str, DatasetTypeNode]]] = {} 

1352 next_new_value: tuple[dict[str, TaskNode], dict[str, DatasetTypeNode]] = ({}, {}) 

1353 for task_label, task_node in self.tasks.items(): 

1354 if task_node.dimensions is None: 

1355 raise UnresolvedGraphError(f"Task with label {task_label!r} has not been resolved.") 

1356 if (group := result.setdefault(task_node.dimensions, next_new_value)) is next_new_value: 

1357 next_new_value = ({}, {}) # make new lists for next time 

1358 group[0][task_node.label] = task_node 

1359 for dataset_type_name, dataset_type_node in self.dataset_types.items(): 

1360 if dataset_type_node is None: 

1361 raise UnresolvedGraphError(f"Dataset type {dataset_type_name!r} has not been resolved.") 

1362 if not dataset_type_node.is_prerequisite or prerequisites: 

1363 if ( 

1364 group := result.setdefault(dataset_type_node.dataset_type.dimensions, next_new_value) 

1365 ) is next_new_value: 

1366 next_new_value = ({}, {}) # make new lists for next time 

1367 group[1][dataset_type_node.name] = dataset_type_node 

1368 return result 

1369 

1370 def split_independent(self) -> Iterable[PipelineGraph]: 

1371 """Iterate over independent subgraphs that together comprise this 

1372 pipeline graph. 

1373 

1374 Returns 

1375 ------- 

1376 subgraphs : `Iterable` [ `PipelineGraph` ] 

1377 An iterable over component subgraphs that could be run 

1378 independently (they have only overall inputs in common). May be a 

1379 lazy iterator. 

1380 

1381 Notes 

1382 ----- 

1383 All resolved dataset type nodes will be preserved. 

1384 

1385 If there is only one component, ``self`` may be returned as the only 

1386 element in the iterable. 

1387 

1388 If `has_been_sorted`, all subgraphs will be sorted as well. 
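
        Examples
        --------
        A sketch that handles each independent component separately::

            for component in graph.split_independent():
                print(component.description, list(component.tasks))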

1389 """ 

1390 # Having an overall input in common isn't enough to make subgraphs 

1391 # dependent on each other, so we want to look for connected component 

1392 # subgraphs of the task-only projected graph. 

1393 bipartite_xgraph = self._make_bipartite_xgraph_internal(init=False) 

1394 task_keys = { 

1395 key 

1396 for key, bipartite in bipartite_xgraph.nodes(data="bipartite") 

1397 if bipartite == NodeType.TASK.bipartite 

1398 } 

1399 task_xgraph = networkx.algorithms.bipartite.projected_graph( 

1400 networkx.DiGraph(bipartite_xgraph), task_keys 

1401 ) 

1402 # "Weakly" connected means connected in only one direction, which is 

1403 # the only kind of "connected" a DAG can ever be. 

1404 for component_task_keys in networkx.algorithms.weakly_connected_components(task_xgraph): 

1405 if component_task_keys == task_keys: 

1406 yield self 

1407 return 

1408 else: 

1409 component_subgraph = PipelineGraph(universe=self._universe) 

1410 component_subgraph.add_task_nodes( 

1411 [self._xgraph.nodes[key]["instance"] for key in component_task_keys], parent=self 

1412 ) 

1413 if self.has_been_sorted: 

1414 component_subgraph.sort() 

1415 yield component_subgraph 

1416 

1417 ########################################################################### 

1418 # 

1419 # Class- and Package-Private Methods. 

1420 # 

1421 ########################################################################### 

1422 

1423 def _iter_task_defs(self) -> Iterator[TaskDef]: 

1424 """Iterate over this pipeline as a sequence of `TaskDef` instances. 

1425 

1426 Notes 

1427 ----- 

1428 This is a package-private method intended to aid the transition to a

1429 codebase more fully integrated with the `PipelineGraph` class. In that

1430 codebase, both `TaskDef` and `PipelineDatasetTypes` are expected to go

1431 away, and much of the functionality on the `Pipeline` class is expected

1432 to move to `PipelineGraph` as well.

1433 

1434 Raises 

1435 ------ 

1436 TaskNotImportedError 

1437 Raised if `TaskNode.is_imported` is `False` for any task. 

1438 """ 

1439 from ..pipeline import TaskDef 

1440 

1441 for node in self._tasks.values(): 

1442 yield TaskDef( 

1443 config=node.config, 

1444 taskClass=node.task_class, 

1445 label=node.label, 

1446 connections=node._get_imported_data().connections, 

1447 ) 

1448 
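# Package-internal usage sketch for ``_iter_task_defs`` (illustrative only):
#
#     task_defs = list(graph._iter_task_defs())
#     # Raises TaskNotImportedError if any task has not been imported.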

1449 def _init_from_args( 

1450 self, 

1451 xgraph: networkx.MultiDiGraph | None, 

1452 sorted_keys: Sequence[NodeKey] | None, 

1453 task_subsets: dict[str, TaskSubset] | None, 

1454 description: str, 

1455 universe: DimensionUniverse | None, 

1456 data_id: DataId | None, 

1457 ) -> None: 

1458 """Initialize the graph with possibly-nontrivial arguments. 

1459 

1460 Parameters 

1461 ---------- 

1462 xgraph : `networkx.MultiDiGraph` or `None` 

1463 The backing networkx graph, or `None` to create an empty one. 

1464 This graph has `NodeKey` instances for nodes and the same structure 

1465 as the graph exported by `make_xgraph`, but its nodes and edges 

1466 have a single ``instance`` attribute that holds a `TaskNode`, 

1467 `TaskInitNode`, `DatasetTypeNode` (or `None`), `ReadEdge`, or 

1468 `WriteEdge` instance. 

1469 sorted_keys : `Sequence` [ `NodeKey` ] or `None` 

1470 Topologically sorted sequence of node keys, or `None` if the graph 

1471 is not sorted. 

1472 task_subsets : `dict` [ `str`, `TaskSubset` ] or `None`

1473 Labeled subsets of tasks, or `None` for no subsets. Values must be

1474 constructed with ``xgraph`` as their parent graph.

1475 description : `str` 

1476 String description for this pipeline. 

1477 universe : `lsst.daf.butler.DimensionUniverse` or `None` 

1478 Definitions of all dimensions. 

1479 data_id : `lsst.daf.butler.DataCoordinate`, other data ID mapping, or `None`

1480 Data ID that represents a constraint on all quanta generated from

1481 this pipeline, or `None` for no constraint.

1482 

1483 Notes 

1484 ----- 

1485 Only empty `PipelineGraph` instances should be constructed directly by

1486 users; that constraint is what sets the signature of ``__init__``

1487 itself. Methods on `PipelineGraph` and its helper classes still need to

1488 create instances with nontrivial state; they can call this method after

1489 calling ``__new__`` manually, skipping ``__init__``.

1490 """ 

1491 self._xgraph = xgraph if xgraph is not None else networkx.MultiDiGraph() 

1492 self._sorted_keys: Sequence[NodeKey] | None = None 

1493 self._task_subsets = task_subsets if task_subsets is not None else {} 

1494 self._description = description 

1495 self._tasks = TaskMappingView(self._xgraph) 

1496 self._dataset_types = DatasetTypeMappingView(self._xgraph) 

1497 self._raw_data_id: dict[str, Any] 

1498 if isinstance(data_id, DataCoordinate): 

1499 if universe is None: 

1500 universe = data_id.universe 

1501 else: 

1502 assert universe is data_id.universe, "data_id.universe and given universe differ" 

1503 self._raw_data_id = data_id.byName() 

1504 elif data_id is None: 

1505 self._raw_data_id = {} 

1506 else: 

1507 self._raw_data_id = dict(data_id) 

1508 self._universe = universe 

1509 if sorted_keys is not None: 

1510 self._reorder(sorted_keys) 

1511 
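# Sketch of the ``__new__`` + ``_init_from_args`` pattern the Notes section
# describes (illustrative only; mirrors how methods on this class construct
# instances with state):
#
#     result = object.__new__(PipelineGraph)
#     result._init_from_args(
#         xgraph=None,
#         sorted_keys=None,
#         task_subsets=None,
#         description="",
#         universe=None,
#         data_id=None,
#     )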

1512 def _make_bipartite_xgraph_internal(self, init: bool) -> networkx.MultiDiGraph: 

1513 """Make a bipartite init-only or runtime-only internal subgraph. 

1514 

1515 See `make_bipartite_xgraph` for parameters and return values. 

1516 

1517 Notes 

1518 ----- 

1519 This method returns a view of the `PipelineGraph` object's internal 

1520 backing graph, and hence should only be called in methods that copy the 

1521 result either explicitly or by running a copying algorithm before 

1522 returning it to the user. 

1523 """ 

1524 return self._xgraph.edge_subgraph([edge.key for edge in self.iter_edges(init)]) 

1525 
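# Sketch of the copy-before-returning pattern the Notes section requires of
# callers (illustrative only; ``copy`` makes the view safe to hand to users):
#
#     xgraph = self._make_bipartite_xgraph_internal(init=False).copy()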

1526 def _transform_xgraph_state(self, xgraph: _G, skip_edges: bool) -> _G: 

1527 """Transform networkx graph attributes in-place from the internal 

1528 "instance" attributes to the documented exported attributes. 

1529 

1530 Parameters 

1531 ---------- 

1532 xgraph : `networkx.DiGraph` or `networkx.MultiDiGraph` 

1533 Graph whose state should be transformed. 

1534 skip_edges : `bool` 

1535 If `True`, do not transform edge state. 

1536 

1537 Returns 

1538 ------- 

1539 xgraph : `networkx.DiGraph` or `networkx.MultiDiGraph` 

1540 The same object passed in, after modification. 

1541 

1542 Notes 

1543 ----- 

1544 This should be called after making a copy of the internal graph but 

1545 before any projection down to just task or dataset type nodes, since 

1546 it assumes stateful edges. 

1547 """ 

1548 state: dict[str, Any] 

1549 for state in xgraph.nodes.values(): 

1550 node_value: TaskInitNode | TaskNode | DatasetTypeNode | None = state.pop("instance") 

1551 if node_value is not None: 

1552 state.update(node_value._to_xgraph_state()) 

1553 if not skip_edges: 

1554 for _, _, state in xgraph.edges(data=True): 

1555 edge: Edge | None = state.pop("instance", None) 

1556 if edge is not None: 

1557 state.update(edge._to_xgraph_state()) 

1558 return xgraph 

1559 
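# Sketch of the intended call order for ``_transform_xgraph_state``: copy the
# internal graph first, then transform, then (optionally) project
# (illustrative only):
#
#     exported = self._transform_xgraph_state(
#         self._xgraph.copy(), skip_edges=False
#     )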

1560 def _replace_task_nodes( 

1561 self, 

1562 updates: Mapping[str, TaskNode], 

1563 check_edges_unchanged: bool, 

1564 assume_edges_unchanged: bool, 

1565 message_header: str, 

1566 ) -> None: 

1567 """Replace task nodes and update edges and dataset type nodes 

1568 accordingly. 

1569 

1570 Parameters 

1571 ---------- 

1572 updates : `Mapping` [ `str`, `TaskNode` ] 

1573 New task nodes with task label keys. All keys must be task labels 

1574 that are already present in the graph. 

1575 check_edges_unchanged : `bool`, optional 

1576 If `True`, require the edges (connections) of the modified tasks to 

1577 remain unchanged after importing and configuring each task, and 

1578 verify that this is the case. 

1579 assume_edges_unchanged : `bool`, optional 

1580 If `True`, the caller declares that the edges (connections) of the

1581 modified tasks will remain unchanged after importing and configuring

1582 each task, and that it is unnecessary to check this.

1583 message_header : `str` 

1584 Template for `str.format` with a single ``task_label`` placeholder 

1585 to use as the first line in `EdgesChangedError` messages that show 

1586 the differences between new task edges and old task edges. Should 

1587 include the fact that the rest of the message will refer to the old 

1588 task as "A" and the new task as "B", and end with a colon. 

1589 

1590 Raises 

1591 ------ 

1592 ValueError 

1593 Raised if ``assume_edges_unchanged`` and ``check_edges_unchanged`` 

1594 are both `True`, or if a full config is provided for a task after 

1595 another full config or an override has already been provided. 

1596 EdgesChangedError 

1597 Raised if ``check_edges_unchanged=True`` and the edges of a task do 

1598 change. 

1599 """ 

1600 deep: dict[str, TaskNode] = {} 

1601 shallow: dict[str, TaskNode] = {} 

1602 if assume_edges_unchanged: 

1603 if check_edges_unchanged: 

1604 raise ValueError("Cannot simultaneously assume and check that edges have not changed.") 

1605 shallow.update(updates) 

1606 else: 

1607 for task_label, new_task_node in updates.items(): 

1608 old_task_node = self.tasks[task_label] 

1609 messages = old_task_node.diff_edges(new_task_node) 

1610 if messages: 

1611 if check_edges_unchanged: 

1612 messages.insert(0, message_header.format(task_label=task_label)) 

1613 raise EdgesChangedError("\n".join(messages)) 

1614 else: 

1615 deep[task_label] = new_task_node 

1616 else: 

1617 shallow[task_label] = new_task_node 

1618 try: 

1619 if deep: 

1620 removed = self.remove_tasks(deep.keys(), drop_from_subsets=True) 

1621 self.add_task_nodes(deep.values()) 

1622 for replaced_task_node, referencing_subsets in removed: 

1623 for subset_label in referencing_subsets: 

1624 self._task_subsets[subset_label].add(replaced_task_node.label) 

1625 for task_node in shallow.values(): 

1626 self._xgraph.nodes[task_node.key]["instance"] = task_node 

1627 self._xgraph.nodes[task_node.init.key]["instance"] = task_node.init 

1628 except PipelineGraphExceptionSafetyError: # pragma: no cover 

1629 raise 

1630 except Exception as err: # pragma: no cover 

1631 # There's no known way to get here, but we want to make it clear 

1632 # it's a big problem if we do. 

1633 raise PipelineGraphExceptionSafetyError( 

1634 "Error while replacing tasks has left the graph in an inconsistent state." 

1635 ) from err 

1636 
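# A hypothetical ``message_header`` matching the format documented above
# (single ``task_label`` placeholder for `str.format`, mentions the A/B
# naming, and ends with a colon):
#
#     message_header = (
#         "Edges of task {task_label!r} changed (old task is 'A', new task"
#         " is 'B'):"
#     )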

1637 def _append_graph_data_from_edge( 

1638 self, 

1639 node_data: list[tuple[NodeKey, dict[str, Any]]], 

1640 edge_data: list[tuple[NodeKey, NodeKey, str, dict[str, Any]]], 

1641 edge: Edge, 

1642 parent: PipelineGraph | None, 

1643 ) -> None: 

1644 """Append networkx state dictionaries for an edge and the corresponding 

1645 dataset type node. 

1646 

1647 Parameters 

1648 ---------- 

1649 node_data : `list` 

1650 List of node keys and state dictionaries. A node is appended if 

1651 one does not already exist for this dataset type. 

1652 edge_data : `list` 

1653 List of node key pairs, connection names, and state dictionaries 

1654 for edges. 

1655 edge : `Edge` 

1656 New edge being processed. 

1657 parent : `PipelineGraph` or `None` 

1658 Another pipeline graph whose dataset type nodes should be used 

1659 when present. 

1660 """ 

1661 new_dataset_type_node = None 

1662 if parent is not None: 

1663 new_dataset_type_node = parent._xgraph.nodes[edge.dataset_type_key].get("instance") 

1664 if (existing_dataset_type_state := self._xgraph.nodes.get(edge.dataset_type_key)) is not None: 

1665 existing_dataset_type_state["instance"] = new_dataset_type_node 

1666 else: 

1667 node_data.append( 

1668 ( 

1669 edge.dataset_type_key, 

1670 { 

1671 "instance": new_dataset_type_node, 

1672 "bipartite": NodeType.DATASET_TYPE.bipartite, 

1673 }, 

1674 ) 

1675 ) 

1676 edge_data.append( 

1677 edge.nodes 

1678 + ( 

1679 edge.connection_name, 

1680 {"instance": edge}, 

1681 ) 

1682 ) 

1683 
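# Shapes of the entries appended above (illustrative only):
#
#     node_data item: (edge.dataset_type_key,
#                      {"instance": ..., "bipartite": NodeType.DATASET_TYPE.bipartite})
#     edge_data item: edge.nodes + (edge.connection_name, {"instance": edge})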

1684 def _reorder(self, sorted_keys: Sequence[NodeKey]) -> None: 

1685 """Set the order of all views of this graph from the given

1686 topologically sorted sequence of node keys.

1687 """ 

1688 self._sorted_keys = sorted_keys 

1689 self._tasks._reorder(sorted_keys) 

1690 self._dataset_types._reorder(sorted_keys) 

1691 

1692 def _reset(self) -> None: 

1693 """Reset all views of this graph following a modification that

1694 might invalidate them. 

1695 """ 

1696 self._sorted_keys = None 

1697 self._tasks._reset() 

1698 self._dataset_types._reset() 

1699 
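# Instance-attribute type declarations; values are assigned in
# ``_init_from_args``.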

1700 _xgraph: networkx.MultiDiGraph 

1701 _sorted_keys: Sequence[NodeKey] | None 

1702 _task_subsets: dict[str, TaskSubset] 

1703 _description: str 

1704 _tasks: TaskMappingView 

1705 _dataset_types: DatasetTypeMappingView 

1706 _raw_data_id: dict[str, Any] 

1707 _universe: DimensionUniverse | None