Coverage for python/lsst/pipe/base/pipeline_graph/_pipeline_graph.py: 19%

408 statements  

coverage.py v7.5.1, created at 2024-05-11 10:50 +0000

1 # This file is part of pipe_base.

2 #

3 # Developed for the LSST Data Management System.

4 # This product includes software developed by the LSST Project

5 # (http://www.lsst.org).

6 # See the COPYRIGHT file at the top-level directory of this distribution

7 # for details of code ownership.

8 #

9 # This software is dual licensed under the GNU General Public License and also

10 # under a 3-clause BSD license. Recipients may choose which of these licenses

11 # to use; please see the files gpl-3.0.txt and/or bsd_license.txt,

12 # respectively. If you choose the GPL option then the following text applies

13 # (but note that there is still no warranty even if you opt for BSD instead):

14 #

15 # This program is free software: you can redistribute it and/or modify

16 # it under the terms of the GNU General Public License as published by

17 # the Free Software Foundation, either version 3 of the License, or

18 # (at your option) any later version.

19 #

20 # This program is distributed in the hope that it will be useful,

21 # but WITHOUT ANY WARRANTY; without even the implied warranty of

22 # MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the

23 # GNU General Public License for more details.

24 #

25 # You should have received a copy of the GNU General Public License

26 # along with this program. If not, see <http://www.gnu.org/licenses/>.

27 from __future__ import annotations

28

29 __all__ = ("PipelineGraph",)

30

31 import gzip

32 import itertools

33 import json

34 from collections.abc import Callable, Iterable, Iterator, Mapping, Sequence, Set

35 from typing import TYPE_CHECKING, Any, BinaryIO, Literal, TypeVar, cast

36

37 import networkx

38 import networkx.algorithms.bipartite

39 import networkx.algorithms.dag

40 from lsst.daf.butler import DataCoordinate, DataId, DatasetType, DimensionGroup, DimensionUniverse, Registry

41 from lsst.daf.butler.registry import MissingDatasetTypeError

42 from lsst.resources import ResourcePath, ResourcePathExpression

43

44 from ._dataset_types import DatasetTypeNode

45 from ._edges import Edge, ReadEdge, WriteEdge

46 from ._exceptions import (

47 DuplicateOutputError,

48 EdgesChangedError,

49 PipelineDataCycleError,

50 PipelineGraphError,

51 PipelineGraphExceptionSafetyError,

52 UnresolvedGraphError,

53 )

54 from ._mapping_views import DatasetTypeMappingView, TaskMappingView

55 from ._nodes import NodeKey, NodeType

56 from ._task_subsets import TaskSubset

57 from ._tasks import TaskImportMode, TaskInitNode, TaskNode, _TaskNodeImportedData

58

59 if TYPE_CHECKING:

60 from ..config import PipelineTaskConfig

61 from ..connections import PipelineTaskConnections

62 from ..pipeline import TaskDef

63 from ..pipelineTask import PipelineTask

64

65

66 _G = TypeVar("_G", bound=networkx.DiGraph | networkx.MultiDiGraph)

67

68

69 class PipelineGraph:

70 """A graph representation of fully-configured pipeline. 

71 

72 `PipelineGraph` instances are typically constructed by calling 

73 `.Pipeline.to_graph`, but in rare cases constructing and then populating an 

74 empty one may be preferable. 

75 

76 Parameters 

77 ---------- 

78 description : `str`, optional 

79 String description for this pipeline. 

80 universe : `lsst.daf.butler.DimensionUniverse`, optional 

81 Definitions for all butler dimensions. If not provided, some 

82 attributes will not be available until `resolve` is called. 

83 data_id : `lsst.daf.butler.DataCoordinate` or other data ID, optional 

84 Data ID that represents a constraint on all quanta generated by this 

85 pipeline. This typically just holds the instrument constraint included 

86 in the pipeline definition, if there was one. 
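
Examples
--------
A minimal sketch of building a graph by hand; ``MyTask`` stands in
for a real `PipelineTask` subclass::

    graph = PipelineGraph(description="demo")
    graph.add_task("my_task", MyTask)
    graph.sort()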

87 """ 

88 

89 ########################################################################### 

90 # 

91 # Simple Pipeline Graph Inspection Interface: 

92 # 

93 # - for inspecting graph structure, not modifying it (except to sort and

94 # resolve); 

95 # 

96 # - no NodeKey objects, just string dataset type name and task label keys; 

97 # 

98 # - graph structure is represented as a pair of mappings, with methods to 

99 # find neighbors and edges of nodes. 

100 # 

101 ########################################################################### 

102 

103 def __init__( 

104 self, 

105 *, 

106 description: str = "", 

107 universe: DimensionUniverse | None = None, 

108 data_id: DataId | None = None, 

109 ) -> None: 

110 self._init_from_args( 

111 xgraph=None, 

112 sorted_keys=None, 

113 task_subsets=None, 

114 description=description, 

115 universe=universe, 

116 data_id=data_id, 

117 ) 

118 

119 def __repr__(self) -> str: 

120 return f"{type(self).__name__}({self.description!r}, tasks={self.tasks!s})" 

121 

122 @property 

123 def description(self) -> str: 

124 """String description for this pipeline.""" 

125 return self._description 

126 

127 @description.setter 

128 def description(self, value: str) -> None: 

129 # Docstring in setter. 

130 self._description = value 

131 

132 @property 

133 def universe(self) -> DimensionUniverse | None: 

134 """Definitions for all butler dimensions.""" 

135 return self._universe 

136 

137 @property 

138 def data_id(self) -> DataCoordinate: 

139 """Data ID that represents a constraint on all quanta generated from 

140 this pipeline. 

141 

142 This may not be available unless `universe` is not `None`.

143 """ 

144 return DataCoordinate.standardize(self._raw_data_id, universe=self.universe) 

145 

146 @property 

147 def tasks(self) -> TaskMappingView: 

148 """A mapping view of the tasks in the graph. 

149 

150 This mapping has `str` task label keys and `TaskNode` values. Iteration 

151 is topologically and deterministically ordered if and only if `sort` 

152 has been called since the last modification to the graph. 

153 """ 

154 return self._tasks 

155 

156 @property 

157 def dataset_types(self) -> DatasetTypeMappingView: 

158 """A mapping view of the dataset types in the graph. 

159 

160 This mapping has `str` parent dataset type name keys, but only provides 

161 access to its `DatasetTypeNode` values if `resolve` has been called 

162 since the last modification involving a task that uses a dataset type. 

163 See `DatasetTypeMappingView` for details. 

164 """ 

165 return self._dataset_types 

166 

167 @property 

168 def task_subsets(self) -> Mapping[str, TaskSubset]: 

169 """A mapping of all labeled subsets of tasks. 

170 

171 Keys are subset labels, values are sets of task labels. See 

172 `TaskSubset` for more information. 

173 

174 Use `add_task_subset` to add a new subset. The subsets themselves may 

175 be modified in-place. 

176 """ 

177 return self._task_subsets 

178 

179 @property 

180 def is_fully_resolved(self) -> bool: 

181 """Whether all of this graph's nodes are resolved.""" 

182 return self._universe is not None and all( 

183 self.dataset_types.is_resolved(k) for k in self.dataset_types 

184 ) 

185 

186 @property 

187 def is_sorted(self) -> bool: 

188 """Whether this graph's tasks and dataset types are topologically 

189 sorted with the exact same deterministic tiebreakers that `sort` would 

190 apply. 

191 

192 This may perform (and then discard) a full sort if `has_been_sorted` is 

193 `False`. If the goal is to obtain a sorted graph, it is better to just 

194 call `sort` without guarding that with an ``if not graph.is_sorted`` 

195 check. 

196 """ 

197 if self._sorted_keys is not None: 

198 return True 

199 return all( 

200 sorted == unsorted 

201 for sorted, unsorted in zip( 

202 networkx.lexicographical_topological_sort(self._xgraph), self._xgraph, strict=True 

203 ) 

204 ) 

205 

206 @property 

207 def has_been_sorted(self) -> bool: 

208 """Whether this graph's tasks and dataset types have been 

209 topologically sorted (with unspecified but deterministic tiebreakers) 

210 since the last modification to the graph. 

211 

212 This may return `False` if the graph *happens* to be sorted but `sort` 

213 was never called, but it is potentially much faster than `is_sorted`, 

214 which may attempt (and then discard) a full sort if `has_been_sorted` 

215 is `False`. 

216 """ 

217 return self._sorted_keys is not None 

218 

219 def sort(self) -> None: 

220 """Sort this graph's nodes topologically with deterministic (but 

221 unspecified) tiebreakers. 

222 

223 This does nothing if the graph is already known to be sorted. 

224 """ 

225 if self._sorted_keys is None: 

226 try: 

227 sorted_keys: Sequence[NodeKey] = list(networkx.lexicographical_topological_sort(self._xgraph)) 

228 except networkx.NetworkXUnfeasible as err: # pragma: no cover 

229 # Shouldn't be possible to get here, because we check for cycles

230 # when adding tasks, but we guard against it anyway. 

231 cycle = networkx.find_cycle(self._xgraph) 

232 raise PipelineDataCycleError( 

233 f"Cycle detected while attempting to sort graph: {cycle}." 

234 ) from err 

235 self._reorder(sorted_keys) 

236 

237 def copy(self) -> PipelineGraph: 

238 """Return a copy of this graph that copies all mutable state.""" 

239 xgraph = self._xgraph.copy() 

240 result = PipelineGraph.__new__(PipelineGraph) 

241 result._init_from_args( 

242 xgraph, 

243 self._sorted_keys, 

244 task_subsets={ 

245 k: TaskSubset(xgraph, v.label, set(v._members), v.description) 

246 for k, v in self._task_subsets.items() 

247 }, 

248 description=self._description, 

249 universe=self.universe, 

250 data_id=self._raw_data_id, 

251 ) 

252 return result 

253 

254 def __copy__(self) -> PipelineGraph: 

255 # Fully shallow copies are dangerous; we don't want shared mutable 

256 # state to lead to broken class invariants. 

257 return self.copy() 

258 

259 def __deepcopy__(self, memo: dict) -> PipelineGraph: 

260 # Genuine deep copies are unnecessary, since we should only ever care 

261 # that mutable state is copied. 

262 return self.copy() 

263 

264 def diff_tasks(self, other: PipelineGraph) -> list[str]: 

265 """Compare two pipeline graphs. 

266 

267 This only compares graph structure and task classes (including their 

268 edges). It does *not* compare full configuration (which is subject to 

269 spurious differences due to import-cache state), dataset type 

270 resolutions, or sort state. 

271 

272 Parameters 

273 ---------- 

274 other : `PipelineGraph` 

275 Graph to compare to. 

276 

277 Returns 

278 ------- 

279 differences : `list` [ `str` ] 

280 List of string messages describing differences between the 

281 pipelines. If empty, the graphs have the same tasks and 

282 connections. 

283 """ 

284 messages: list[str] = [] 

285 common_labels: Set[str] 

286 if self.tasks.keys() != other.tasks.keys(): 

287 common_labels = self.tasks.keys() & other.tasks.keys() 

288 messages.append( 

289 f"Pipelines have different tasks: A & ~B = {list(self.tasks.keys() - common_labels)}, " 

290 f"B & ~A = {list(other.tasks.keys() - common_labels)}." 

291 ) 

292 else: 

293 common_labels = self.tasks.keys() 

294 for label in common_labels: 

295 a = self.tasks[label] 

296 b = other.tasks[label] 

297 if a.task_class != b.task_class: 

298 messages.append( 

299 f"Task {label!r} has class {a.task_class_name} in A, " f"but {b.task_class_name} in B." 

300 ) 

301 messages.extend(a.diff_edges(b)) 

302 return messages 

303 

304 def producing_edge_of(self, dataset_type_name: str) -> WriteEdge | None: 

305 """Return the `WriteEdge` that links the producing task to the named 

306 dataset type. 

307 

308 Parameters 

309 ---------- 

310 dataset_type_name : `str` 

311 Dataset type name. Must not be a component. 

312 

313 Returns 

314 ------- 

315 edge : `WriteEdge` or `None` 

316 Producing edge or `None` if there isn't one in this graph. 

317 

318 Raises 

319 ------ 

320 DuplicateOutputError 

321 Raised if there are multiple tasks defined to produce this dataset 

322 type. This is only possible if the graph's dataset types are not 

323 resolved. 

324 

325 Notes 

326 ----- 

327 On resolved graphs, it may be slightly more efficient to use:: 

328 

329 graph.dataset_types[dataset_type_name].producing_edge 

330 

331 but this method works on graphs with unresolved dataset types as well. 

332 """ 

333 producer: str | None = None 

334 producing_edge: WriteEdge | None = None 

335 for _, _, producing_edge in self._xgraph.in_edges( 

336 NodeKey(NodeType.DATASET_TYPE, dataset_type_name), data="instance" 

337 ): 

338 assert producing_edge is not None, "Should only be None if we never loop." 

339 if producer is not None: 

340 raise DuplicateOutputError( 

341 f"Dataset type {dataset_type_name!r} is produced by both {producing_edge.task_label!r} " 

342 f"and {producer!r}." 

343 ) 

344 producer = producing_edge.task_label

345 return producing_edge

346 def consuming_edges_of(self, dataset_type_name: str) -> list[ReadEdge]: 

347 """Return the `ReadEdge` objects that link the named dataset type to 

348 the tasks that consume it. 

349 

350 Parameters 

351 ---------- 

352 dataset_type_name : `str` 

353 Dataset type name. Must not be a component. 

354 

355 Returns 

356 ------- 

357 edges : `list` [ `ReadEdge` ] 

358 Edges that connect this dataset type to the tasks that consume it. 

359 

360 Notes 

361 ----- 

362 On resolved graphs, it may be slightly more efficient to use:: 

363 

364 graph.dataset_types[dataset_type_name].consuming_edges

365 

366 but this method works on graphs with unresolved dataset types as well. 

367 """ 

368 return [ 

369 edge 

370 for _, _, edge in self._xgraph.out_edges( 

371 NodeKey(NodeType.DATASET_TYPE, dataset_type_name), data="instance" 

372 ) 

373 ] 

374 

375 def producer_of(self, dataset_type_name: str) -> TaskNode | TaskInitNode | None: 

376 """Return the `TaskNode` or `TaskInitNode` that writes the given 

377 dataset type. 

378 

379 Parameters 

380 ---------- 

381 dataset_type_name : `str` 

382 Dataset type name. Must not be a component. 

383 

384 Returns 

385 ------- 

386 node : `TaskNode`, `TaskInitNode`, or `None`

387 Producing node or `None` if there isn't one in this graph. 

388 

389 Raises 

390 ------ 

391 DuplicateOutputError 

392 Raised if there are multiple tasks defined to produce this dataset 

393 type. This is only possible if the graph's dataset types are not 

394 resolved. 

395 """ 

396 if (producing_edge := self.producing_edge_of(dataset_type_name)) is not None: 

397 return self._xgraph.nodes[producing_edge.task_key]["instance"] 

398 return None 

399 

400 def consumers_of(self, dataset_type_name: str) -> list[TaskNode | TaskInitNode]: 

401 """Return the `TaskNode` and/or `TaskInitNode` objects that read 

402 the given dataset type. 

403 

404 Parameters 

405 ---------- 

406 dataset_type_name : `str` 

407 Dataset type name. Must not be a component. 

408 

409 Returns 

410 ------- 

411 nodes : `list` [ `TaskNode` or `TaskInitNode` ]

412 Nodes for the tasks that consume the given dataset type.

413 

414 Notes 

415 ----- 

416 On resolved graphs, it may be slightly more efficient to use:: 

417 

418 graph.dataset_types[dataset_type_name].consuming_edges

419 

420 but this method works on graphs with unresolved dataset types as well. 

421 """ 

422 return [ 

423 self._xgraph.nodes[consuming_edge.task_key]["instance"] 

424 for consuming_edge in self.consuming_edges_of(dataset_type_name) 

425 ] 

426 

427 def inputs_of(self, task_label: str, init: bool = False) -> dict[str, DatasetTypeNode | None]: 

428 """Return the dataset types that are inputs to a task. 

429 

430 Parameters 

431 ---------- 

432 task_label : `str` 

433 Label for the task in the pipeline. 

434 init : `bool`, optional 

435 If `True`, return init-input dataset types instead of runtime 

436 (including prerequisite) inputs. 

437 

438 Returns 

439 ------- 

440 inputs : `dict` [ `str`, `DatasetTypeNode` or `None` ] 

441 Dictionary with parent dataset type name keys and either

442 `DatasetTypeNode` values (if the dataset type has been resolved) 

443 or `None` values. 

444 

445 Notes 

446 ----- 

447 To get the input edges of a task or task init node (which provide 

448 information about storage class overrides and components) use::

449 

450 graph.tasks[task_label].iter_all_inputs() 

451 

452 or ::

453 

454 graph.tasks[task_label].init.iter_all_inputs() 

455 

456 or the various mapping attributes of the `TaskNode` and `TaskInitNode` 

457 class. 

458 """ 

459 node: TaskNode | TaskInitNode = self.tasks[task_label] if not init else self.tasks[task_label].init 

460 return { 

461 edge.parent_dataset_type_name: self._xgraph.nodes[edge.dataset_type_key]["instance"] 

462 for edge in node.iter_all_inputs() 

463 } 

464 

465 def outputs_of( 

466 self, task_label: str, init: bool = False, include_automatic_connections: bool = True 

467 ) -> dict[str, DatasetTypeNode | None]: 

468 """Return the dataset types that are outputs of a task. 

469 

470 Parameters 

471 ---------- 

472 task_label : `str` 

473 Label for the task in the pipeline. 

474 init : `bool`, optional 

475 If `True`, return init-output dataset types instead of runtime 

476 outputs. 

477 include_automatic_connections : `bool`, optional 

478 Whether to include automatic connections such as configs, metadata, 

479 and logs. 

480 

481 Returns 

482 ------- 

483 outputs : `dict` [ `str`, `DatasetTypeNode` or `None` ] 

484 Dictionary with parent dataset type name keys and either

485 `DatasetTypeNode` values (if the dataset type has been resolved) 

486 or `None` values. 

487 

488 Notes 

489 ----- 

490 To get the output edges of a task or task init node (which provide

491 information about storage class overrides and components) use::

492 

493 graph.tasks[task_label].iter_all_outputs() 

494 

495 or ::

496 

497 graph.tasks[task_label].init.iter_all_outputs() 

498 

499 or the various mapping attributes of the `TaskNode` and `TaskInitNode` 

500 class. 

501 """ 

502 node: TaskNode | TaskInitNode = self.tasks[task_label] if not init else self.tasks[task_label].init 

503 iterable = node.iter_all_outputs() if include_automatic_connections else node.outputs.values() 

504 return { 

505 edge.parent_dataset_type_name: self._xgraph.nodes[edge.dataset_type_key]["instance"] 

506 for edge in iterable 

507 } 

508 

509 def resolve( 

510 self, 

511 registry: Registry | None = None, 

512 dimensions: DimensionUniverse | None = None, 

513 dataset_types: Mapping[str, DatasetType] | None = None, 

514 ) -> None: 

515 """Resolve all dimensions and dataset types and check them for 

516 consistency. 

517 

518 Resolving a graph also causes it to be sorted. 

519 

520 Parameters 

521 ---------- 

522 registry : `lsst.daf.butler.Registry`, optional 

523 Client for the data repository to resolve against. If not 

524 provided, both ``dimensions`` and ``dataset_types`` must be. 

525 dimensions : `lsst.daf.butler.DimensionUniverse`, optional 

526 Definitions for all dimensions. 

527 dataset_types : `~collections.abc.Mapping` [ `str`, \

528 `~lsst.daf.butler.DatasetType` ], optional 

529 Mapping of dataset types to consider registered. 

530 

531 Notes 

532 ----- 

533 The `universe` attribute is set to ``dimensions`` and used to set all 

534 `TaskNode.dimensions` attributes. Dataset type nodes are resolved by 

535 first looking for a registry definition, then using the producing 

536 task's definition, then looking for consistency between all consuming 

537 task definitions. 

538 

539 Raises 

540 ------ 

541 ConnectionTypeConsistencyError 

542 Raised if a prerequisite input for one task appears as a different 

543 kind of connection in any other task. 

544 DuplicateOutputError 

545 Raised if multiple tasks have the same dataset type as an output. 

546 IncompatibleDatasetTypeError 

547 Raised if different tasks have different definitions of a dataset 

548 type. Different but compatible storage classes are permitted. 

549 MissingDatasetTypeError 

550 Raised if a dataset type definition is required to exist in the 

551 data repository but none was found. This should only occur for 

552 dataset types that are not produced by a task in the pipeline and 

553 are consumed with different storage classes or as components by 

554 tasks in the pipeline. 
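
Examples
--------
A sketch, assuming ``butler`` is an existing
`lsst.daf.butler.Butler` client::

    graph.resolve(registry=butler.registry)
    assert graph.is_fully_resolved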


558 """ 

559 if registry is None and (dimensions is None or dataset_types is None): 

560 raise PipelineGraphError( 

561 "Either 'registry' or both 'dimensions' and 'dataset_types' " 

562 "must be passed to PipelineGraph.resolve." 

563 ) 

564 

565 get_registered: Callable[[str], DatasetType | None] 

566 if dataset_types is not None: 

567 # Ruff seems confused about whether this is used below; it is! 

568 get_registered = dataset_types.get 

569 else: 

570 assert registry is not None 

571 

572 def get_registered(name: str) -> DatasetType | None: 

573 try: 

574 return registry.getDatasetType(name) 

575 except MissingDatasetTypeError: 

576 return None 

577 

578 if dimensions is None: 

579 assert registry is not None 

580 dimensions = registry.dimensions 

581 

582 node_key: NodeKey 

583 updates: dict[NodeKey, TaskNode | DatasetTypeNode] = {} 

584 for node_key, node_state in self._xgraph.nodes.items(): 

585 match node_key.node_type: 

586 case NodeType.TASK: 

587 task_node: TaskNode = node_state["instance"] 

588 new_task_node = task_node._resolved(dimensions) 

589 if new_task_node is not task_node: 

590 updates[node_key] = new_task_node 

591 case NodeType.DATASET_TYPE: 

592 dataset_type_node: DatasetTypeNode | None = node_state["instance"] 

593 new_dataset_type_node = DatasetTypeNode._from_edges( 

594 node_key, self._xgraph, get_registered, dimensions, previous=dataset_type_node 

595 ) 

596 # Usage of ``is`` here is intentional; `_from_edges` returns

597 # `previous=dataset_type_node` if it can determine that it 

598 # doesn't need to change. 

599 if new_dataset_type_node is not dataset_type_node: 

600 updates[node_key] = new_dataset_type_node 

601 try: 

602 for node_key, node_value in updates.items(): 

603 self._xgraph.nodes[node_key]["instance"] = node_value 

604 except Exception as err: # pragma: no cover 

605 # There's no known way to get here, but we want to make it 

606 # clear it's a big problem if we do. 

607 raise PipelineGraphExceptionSafetyError( 

608 "Error during dataset type resolution has left the graph in an inconsistent state." 

609 ) from err 

610 self.sort() 

611 self._universe = dimensions 

612 

613 ########################################################################### 

614 # 

615 # Graph Modification Interface: 

616 # 

617 # - methods to add, remove, and replace tasks; 

618 # 

619 # - methods to add and remove task subsets. 

620 # 

621 # These are all things that are usually done in a Pipeline before making a 

622 # graph at all, but there may be cases where we want to modify the graph 

623 # instead. (These are also the methods used to make a graph from a 

624 # Pipeline, or make a graph from another graph.) 

625 # 

626 ########################################################################### 

627 

628 def add_task( 

629 self, 

630 label: str | None, 

631 task_class: type[PipelineTask], 

632 config: PipelineTaskConfig | None = None, 

633 connections: PipelineTaskConnections | None = None, 

634 ) -> TaskNode: 

635 """Add a new task to the graph. 

636 

637 Parameters 

638 ---------- 

639 label : `str` or `None` 

640 Label for the task in the pipeline. If `None`, `Task._DefaultName` 

641 is used. 

642 task_class : `type` [ `PipelineTask` ] 

643 Class object for the task. 

644 config : `PipelineTaskConfig`, optional 

645 Configuration for the task. If not provided, a default-constructed 

646 instance of ``task_class.ConfigClass`` is used. 

647 connections : `PipelineTaskConnections`, optional 

648 Object that describes the dataset types used by the task. If not 

649 provided, one will be constructed from the given configuration. If 

650 provided, it is assumed that ``config`` has already been validated 

651 and frozen. 

652 

653 Returns 

654 ------- 

655 node : `TaskNode` 

656 The new task node added to the graph. 

657 

658 Raises 

659 ------ 

660 ValueError 

661 Raised if configuration validation failed when constructing 

662 ``connections``. 

663 PipelineDataCycleError 

664 Raised if the graph is cyclic after this addition. 

665 RuntimeError 

666 Raised if an unexpected exception (which will be chained) occurred 

667 at a stage that may have left the graph in an inconsistent state. 

668 Other exceptions should leave the graph unchanged. 

669 

670 Notes 

671 ----- 

672 Checks for dataset type consistency and multiple producers do not occur 

673 until `resolve` is called, since the resolution depends on both the 

674 state of the data repository and all contributing tasks. 

675 

676 Adding new tasks removes any existing resolutions of all dataset types 

677 they reference and marks the graph as unsorted. It is most efficient

678 to add all tasks up front and only then resolve and/or sort the graph. 

679 """ 

680 if label is None: 

681 label = task_class._DefaultName 

682 if config is None: 

683 config = task_class.ConfigClass() 

684 task_node = TaskNode._from_imported_data( 

685 key=NodeKey(NodeType.TASK, label), 

686 init_key=NodeKey(NodeType.TASK_INIT, label), 

687 data=_TaskNodeImportedData.configure(label, task_class, config, connections), 

688 universe=self.universe, 

689 ) 

690 self.add_task_nodes([task_node]) 

691 return task_node 

692 

693 def add_task_nodes(self, nodes: Iterable[TaskNode], parent: PipelineGraph | None = None) -> None: 

694 """Add one or more existing task nodes to the graph. 

695 

696 Parameters 

697 ---------- 

698 nodes : `~collections.abc.Iterable` [ `TaskNode` ] 

699 Iterable of task nodes to add. If any tasks have resolved 

700 dimensions, they must have the same dimension universe as the rest 

701 of the graph. 

702 parent : `PipelineGraph`, optional 

703 If provided, another `PipelineGraph` from which these nodes were 

704 obtained. Any dataset type nodes already present in ``parent`` 

705 that are referenced by the given tasks will be used in this graph 

706 if they are not already present, preserving any dataset type 

707 resolutions present in the parent graph. Adding nodes from a 

708 parent graph after the graph has its own nodes (e.g. from 

709 `add_task`) or nodes from a third graph may result in invalid 

710 dataset type resolutions. It is safest to only use this argument 

711 when populating an empty graph for the first time. 

712 

713 Raises 

714 ------ 

715 PipelineDataCycleError 

716 Raised if the graph is cyclic after this addition. 

717 

718 Notes 

719 ----- 

720 Checks for dataset type consistency and multiple producers do not occur 

721 until `resolve` is called, since the resolution depends on both the 

722 state of the data repository and all contributing tasks. 

723 

724 Adding new tasks removes any existing resolutions of all dataset types 

725 they reference (unless ``parent is not None``) and marks the graph as

726 unsorted. It is most efficient to add all tasks up front and only then 

727 resolve and/or sort the graph. 

728 """ 

729 node_data: list[tuple[NodeKey, dict[str, Any]]] = [] 

730 edge_data: list[tuple[NodeKey, NodeKey, str, dict[str, Any]]] = [] 

731 for task_node in nodes: 

732 task_node = task_node._resolved(self._universe) 

733 node_data.append( 

734 (task_node.key, {"instance": task_node, "bipartite": task_node.key.node_type.bipartite}) 

735 ) 

736 node_data.append( 

737 ( 

738 task_node.init.key, 

739 {"instance": task_node.init, "bipartite": task_node.init.key.node_type.bipartite}, 

740 ) 

741 ) 

742 # Convert the edge objects attached to the task node to networkx. 

743 for read_edge in task_node.init.iter_all_inputs(): 

744 self._append_graph_data_from_edge(node_data, edge_data, read_edge, parent=parent) 

745 for write_edge in task_node.init.iter_all_outputs(): 

746 self._append_graph_data_from_edge(node_data, edge_data, write_edge, parent=parent) 

747 for read_edge in task_node.iter_all_inputs(): 

748 self._append_graph_data_from_edge(node_data, edge_data, read_edge, parent=parent) 

749 for write_edge in task_node.iter_all_outputs(): 

750 self._append_graph_data_from_edge(node_data, edge_data, write_edge, parent=parent) 

751 # Add a special edge (with no Edge instance) that connects the 

752 # TaskInitNode to the runtime TaskNode. 

753 edge_data.append((task_node.init.key, task_node.key, Edge.INIT_TO_TASK_NAME, {"instance": None})) 

754 if not node_data and not edge_data: 

755 return 

756 # Checks and preparation complete; time to start the actual 

757 # modification, during which it's hard to provide strong exception 

758 # safety. Start by resetting the sort ordering, if there is one. 

759 self._reset() 

760 try: 

761 self._xgraph.add_nodes_from(node_data) 

762 self._xgraph.add_edges_from(edge_data) 

763 if not networkx.algorithms.dag.is_directed_acyclic_graph(self._xgraph): 

764 cycle = networkx.find_cycle(self._xgraph) 

765 raise PipelineDataCycleError(f"Cycle detected while adding tasks: {cycle}.") 

766 except Exception: 

767 # First try to roll back our changes. 

768 try: 

769 self._xgraph.remove_edges_from(edge_data) 

770 self._xgraph.remove_nodes_from(key for key, _ in node_data) 

771 except Exception as err: # pragma: no cover 

772 # There's no known way to get here, but we want to make it 

773 # clear it's a big problem if we do. 

774 raise PipelineGraphExceptionSafetyError( 

775 "Error while attempting to revert PipelineGraph modification has left the graph in " 

776 "an inconsistent state." 

777 ) from err 

778 # Successfully rolled back; raise the original exception. 

779 raise 

780 

781 def reconfigure_tasks( 

782 self, 

783 *args: tuple[str, PipelineTaskConfig], 

784 check_edges_unchanged: bool = False, 

785 assume_edges_unchanged: bool = False, 

786 **kwargs: PipelineTaskConfig, 

787 ) -> None: 

788 """Update the configuration for one or more tasks. 

789 

790 Parameters 

791 ---------- 

792 *args : `tuple` [ `str`, `.PipelineTaskConfig` ] 

793 Positional arguments are each a 2-tuple of task label and new 

794 config object. Note that the same arguments may also be passed as 

795 ``**kwargs``, which is usually more readable, but task labels in 

796 ``*args`` are not required to be valid Python identifiers. 

797 check_edges_unchanged : `bool`, optional 

798 If `True`, require the edges (connections) of the modified tasks to 

799 remain unchanged after the configuration updates, and verify that 

800 this is the case. 

801 assume_edges_unchanged : `bool`, optional 

802 If `True`, the caller declares that the edges (connections) of the 

803 modified tasks will remain unchanged after the configuration 

804 updates, and that it is unnecessary to check this. 

805 **kwargs : `.PipelineTaskConfig` 

806 New config objects or overrides to apply to copies of the current 

807 config objects, with task labels as the keywords. 

808 

809 Raises 

810 ------ 

811 ValueError 

812 Raised if ``assume_edges_unchanged`` and ``check_edges_unchanged`` 

813 are both `True`, or if the same task appears twice. 

814 EdgesChangedError 

815 Raised if ``check_edges_unchanged=True`` and the edges of a task do 

816 change. 

817 

818 Notes 

819 ----- 

820 If reconfiguring a task causes its edges to change, any dataset type 

821 nodes connected to that task (not just those whose edges have changed!) 

822 will be unresolved. 
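
Examples
--------
A sketch; ``"my_task"`` is a hypothetical label and ``new_config`` a
validated config instance for that task::

    graph.reconfigure_tasks(my_task=new_config)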

823 """ 

824 new_configs: dict[str, PipelineTaskConfig] = {} 

825 for task_label, config_update in itertools.chain(args, kwargs.items()): 

826 if new_configs.setdefault(task_label, config_update) is not config_update: 

827 raise ValueError(f"Config for {task_label!r} provided more than once.") 

828 updates = { 

829 task_label: self.tasks[task_label]._reconfigured(config, rebuild=not assume_edges_unchanged) 

830 for task_label, config in new_configs.items() 

831 } 

832 self._replace_task_nodes( 

833 updates, 

834 check_edges_unchanged=check_edges_unchanged, 

835 assume_edges_unchanged=assume_edges_unchanged, 

836 message_header=( 

837 "Unexpected change in edges for task {task_label!r} from original config (A) to " 

838 "new configs (B):" 

839 ), 

840 ) 

841 

842 def remove_tasks( 

843 self, labels: Iterable[str], drop_from_subsets: bool = True 

844 ) -> list[tuple[TaskNode, set[str]]]: 

845 """Remove one or more tasks from the graph. 

846 

847 Parameters 

848 ---------- 

849 labels : `~collections.abc.Iterable` [ `str` ] 

850 Iterable of the labels of the tasks to remove. 

851 drop_from_subsets : `bool`, optional 

852 If `True`, drop each removed task from any subset in which it 

853 currently appears. If `False`, raise `PipelineGraphError` if any 

854 such subsets exist. 

855 

856 Returns 

857 ------- 

858 nodes_and_subsets : `list` [ `tuple` [ `TaskNode`, `set` [ `str` ] ] ] 

859 List of nodes removed and the labels of task subsets that 

860 referenced them. 

861 

862 Raises 

863 ------ 

864 PipelineGraphError 

865 Raised if ``drop_from_subsets`` is `False` and the task is still 

866 part of one or more subsets. 

867 

868 Notes 

869 ----- 

870 Removing a task will cause dataset nodes with no other referencing 

871 tasks to be removed. Any other dataset type nodes referenced by a 

872 removed task will be reset to an "unresolved" state. 
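
Examples
--------
A sketch using a hypothetical task label::

    for task_node, subset_labels in graph.remove_tasks(["my_task"]):
        print(f"Removed {task_node.label!r} from subsets {subset_labels}.")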

873 """ 

874 task_nodes_and_subsets = [] 

875 dataset_types: set[NodeKey] = set() 

876 nodes_to_remove = set() 

877 for label in labels: 

878 task_node: TaskNode = self._xgraph.nodes[NodeKey(NodeType.TASK, label)]["instance"] 

879 # Find task subsets that reference this task. 

880 referencing_subsets = { 

881 subset_label 

882 for subset_label, task_subset in self.task_subsets.items() 

883 if label in task_subset 

884 } 

885 if not drop_from_subsets and referencing_subsets: 

886 raise PipelineGraphError( 

887 f"Task {label!r} is still referenced by subset(s) {referencing_subsets}." 

888 ) 

889 task_nodes_and_subsets.append((task_node, referencing_subsets)) 

890 # Find dataset types referenced by this task. 

891 dataset_types.update(self._xgraph.predecessors(task_node.key)) 

892 dataset_types.update(self._xgraph.successors(task_node.key)) 

893 dataset_types.update(self._xgraph.predecessors(task_node.init.key)) 

894 dataset_types.update(self._xgraph.successors(task_node.init.key)) 

895 # Since there's an edge between the task and its init node, we'll 

896 # have added those two nodes here, too, and we don't want that. 

897 dataset_types.remove(task_node.init.key) 

898 dataset_types.remove(task_node.key) 

899 # Mark the task node and its init node for removal from the graph. 

900 nodes_to_remove.add(task_node.key) 

901 nodes_to_remove.add(task_node.init.key) 

902 # Process the referenced datasets to see which ones are orphaned and 

903 # need to be removed vs. just unresolved. 

904 nodes_to_unresolve = [] 

905 for dataset_type_key in dataset_types: 

906 related_tasks = set() 

907 related_tasks.update(self._xgraph.predecessors(dataset_type_key)) 

908 related_tasks.update(self._xgraph.successors(dataset_type_key)) 

909 related_tasks.difference_update(nodes_to_remove) 

910 if not related_tasks: 

911 nodes_to_remove.add(dataset_type_key) 

912 else: 

913 nodes_to_unresolve.append(dataset_type_key) 

914 # Checks and preparation complete; time to start the actual 

915 # modification, during which it's hard to provide strong exception 

916 # safety. Start by resetting the sort ordering. 

917 self._reset() 

918 try: 

919 for dataset_type_key in nodes_to_unresolve: 

920 self._xgraph.nodes[dataset_type_key]["instance"] = None 

921 for task_node, referencing_subsets in task_nodes_and_subsets: 

922 for subset_label in referencing_subsets: 

923 self._task_subsets[subset_label].remove(task_node.label) 

924 self._xgraph.remove_nodes_from(nodes_to_remove) 

925 except Exception as err: # pragma: no cover 

926 # There's no known way to get here, but we want to make it 

927 # clear it's a big problem if we do. 

928 raise PipelineGraphExceptionSafetyError( 

929 "Error during task removal has left the graph in an inconsistent state." 

930 ) from err 

931 return task_nodes_and_subsets 

932 

933 def add_task_subset(self, subset_label: str, task_labels: Iterable[str], description: str = "") -> None: 

934 """Add a label for a set of tasks that are already in the pipeline. 

935 

936 Parameters 

937 ---------- 

938 subset_label : `str` 

939 Label for this set of tasks. 

940 task_labels : `~collections.abc.Iterable` [ `str` ] 

941 Labels of the tasks to include in the set. All must already be 

942 included in the graph. 

943 description : `str`, optional 

944 String description to associate with this label. 
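
Examples
--------
A sketch with hypothetical task labels::

    graph.add_task_subset("step1", ["task_a", "task_b"], "First step.")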

945 """ 

946 subset = TaskSubset(self._xgraph, subset_label, set(task_labels), description) 

947 self._task_subsets[subset_label] = subset 

948 

949 def remove_task_subset(self, subset_label: str) -> None: 

950 """Remove a labeled set of tasks. 

951 

952 Parameters 

953 ---------- 

954 subset_label : `str` 

955 Label for this set of tasks. 

956 """ 

957 del self._task_subsets[subset_label] 

958 

959 ########################################################################### 

960 # 

961 # NetworkX Export Interface: 

962 # 

963 # - methods to export the PipelineGraph's content (or various subsets 

964 # thereof) as NetworkX objects. 

965 # 

966 # These are particularly useful when writing tools to visualize the graph, 

967 # while providing options for which aspects of the graph (tasks, dataset 

968 # types, or both) to include, since all exported graphs have similar 

969 # attributes regardless of their structure. 

970 # 

971 ########################################################################### 

972 

973 def make_xgraph(self) -> networkx.MultiDiGraph: 

974 """Export a networkx representation of the full pipeline graph, 

975 including both init and runtime edges. 

976 

977 Returns 

978 ------- 

979 xgraph : `networkx.MultiDiGraph` 

980 Directed acyclic graph with parallel edges. 

981 

982 Notes 

983 ----- 

984 The returned graph uses `NodeKey` instances for nodes. Parallel edges 

985 represent the same dataset type appearing in multiple connections for 

986 the same task, and are hence rare. The connection name is used as the 

987 edge key to disambiguate those parallel edges. 

988 

989 Almost all edges connect dataset type nodes to task or task init nodes 

990 or vice versa, but there is also a special edge that connects each task 

991 init node to its runtime node. The existence of these edges makes the 

992 graph not quite bipartite, though its init-only and runtime-only 

993 subgraphs are bipartite. 

994 

995 See `TaskNode`, `TaskInitNode`, `DatasetTypeNode`, `ReadEdge`, and 

996 `WriteEdge` for the descriptive node and edge attributes added. 
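
Examples
--------
A sketch of a quick structural summary of the exported graph::

    xgraph = graph.make_xgraph()
    print(xgraph.number_of_nodes(), xgraph.number_of_edges())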

997 """ 

998 return self._transform_xgraph_state(self._xgraph.copy(), skip_edges=False) 

999 

1000 def make_bipartite_xgraph(self, init: bool = False) -> networkx.MultiDiGraph: 

1001 """Return a bipartite networkx representation of just the runtime or 

1002 init-time pipeline graph. 

1003 

1004 Parameters 

1005 ---------- 

1006 init : `bool`, optional 

1007 If `True` (`False` is default) return the graph of task 

1008 initialization nodes and init input/output dataset types, instead 

1009 of the graph of runtime task nodes and regular 

1010 input/output/prerequisite dataset types. 

1011 

1012 Returns 

1013 ------- 

1014 xgraph : `networkx.MultiDiGraph` 

1015 Directed acyclic graph with parallel edges. 

1016 

1017 Notes 

1018 ----- 

1019 The returned graph uses `NodeKey` instances for nodes. Parallel edges 

1020 represent the same dataset type appearing in multiple connections for 

1021 the same task, and are hence rare. The connection name is used as the 

1022 edge key to disambiguate those parallel edges. 

1023 

1024 This graph is bipartite because each dataset type node only has edges 

1025 that connect it to a task [init] node, and vice versa. 

1026 

1027 See `TaskNode`, `TaskInitNode`, `DatasetTypeNode`, `ReadEdge`, and 

1028 `WriteEdge` for the descriptive node and edge attributes added. 

1029 """ 

1030 return self._transform_xgraph_state( 

1031 self._make_bipartite_xgraph_internal(init).copy(), skip_edges=False 

1032 ) 

1033 

1034 def make_task_xgraph(self, init: bool = False) -> networkx.DiGraph: 

1035 """Return a networkx representation of just the tasks in the pipeline. 

1036 

1037 Parameters 

1038 ---------- 

1039 init : `bool`, optional 

1040 If `True` (`False` is default) return the graph of task 

1041 initialization nodes, instead of the graph of runtime task nodes. 

1042 

1043 Returns 

1044 ------- 

1045 xgraph : `networkx.DiGraph` 

1046 Directed acyclic graph with no parallel edges. 

1047 

1048 Notes 

1049 ----- 

1050 The returned graph uses `NodeKey` instances for nodes. The dataset 

1051 types that link these tasks are not represented at all; edges have no 

1052 attributes, and there are no parallel edges. 

1053 

1054 See `TaskNode` and `TaskInitNode` for the descriptive node

1055 attributes added.

1056 """ 

1057 bipartite_xgraph = self._make_bipartite_xgraph_internal(init) 

1058 task_keys = [ 

1059 key 

1060 for key, bipartite in bipartite_xgraph.nodes(data="bipartite") 

1061 if bipartite == NodeType.TASK.bipartite 

1062 ] 

1063 return self._transform_xgraph_state( 

1064 networkx.algorithms.bipartite.projected_graph(networkx.DiGraph(bipartite_xgraph), task_keys), 

1065 skip_edges=True, 

1066 ) 

1067 

1068 def make_dataset_type_xgraph(self, init: bool = False) -> networkx.DiGraph: 

1069 """Return a networkx representation of just the dataset types in the 

1070 pipeline. 

1071 

1072 Parameters 

1073 ---------- 

1074 init : `bool`, optional 

1075 If `True` (`False` is default) return the graph of init input and 

1076 output dataset types, instead of the graph of runtime (input, 

1077 output, prerequisite input) dataset types. 

1078 

1079 Returns 

1080 ------- 

1081 xgraph : `networkx.DiGraph` 

1082 Directed acyclic graph with no parallel edges. 

1083 

1084 Notes 

1085 ----- 

1086 The returned graph uses `NodeKey` instances for nodes. The tasks that 

1087 link these dataset types are not represented at all; edges have no attributes,

1088 and there are no parallel edges. 

1089 

1090 See `DatasetTypeNode` for the descriptive node attributes added.

1091 """ 

1092 bipartite_xgraph = self._make_bipartite_xgraph_internal(init) 

1093 dataset_type_keys = [ 

1094 key 

1095 for key, bipartite in bipartite_xgraph.nodes(data="bipartite") 

1096 if bipartite == NodeType.DATASET_TYPE.bipartite 

1097 ] 

1098 return self._transform_xgraph_state( 

1099 networkx.algorithms.bipartite.projected_graph( 

1100 networkx.DiGraph(bipartite_xgraph), dataset_type_keys 

1101 ), 

1102 skip_edges=True, 

1103 ) 

1104 

1105 ########################################################################### 

1106 # 

1107 # Serialization Interface. 

1108 # 

1109 # Serialization of PipelineGraphs is currently experimental and may not be 

1110 # retained in the future. All serialization methods are 

1111 # underscore-prefixed to ensure nobody mistakes them for a stable interface 

1112 (let alone a stable file format).

1113 # 

1114 ########################################################################### 

1115 

1116 @classmethod 

1117 def _read_stream( 

1118 cls, stream: BinaryIO, import_mode: TaskImportMode = TaskImportMode.REQUIRE_CONSISTENT_EDGES 

1119 ) -> PipelineGraph: 

1120 """Read a serialized `PipelineGraph` from a file-like object. 

1121 

1122 Parameters 

1123 ---------- 

1124 stream : `BinaryIO` 

1125 File-like object opened for binary reading, containing 

1126 gzip-compressed JSON. 

1127 import_mode : `TaskImportMode`, optional 

1128 Whether to import tasks, and how to reconcile any differences 

1129 between the imported task's connections and those that were

1130 persisted with the graph. Default is to check that they are the 

1131 same. 

1132 

1133 Returns 

1134 ------- 

1135 graph : `PipelineGraph` 

1136 Deserialized pipeline graph. 

1137 

1138 Raises 

1139 ------ 

1140 PipelineGraphReadError 

1141 Raised if the serialized `PipelineGraph` is not self-consistent. 

1142 EdgesChangedError 

1143 Raised if ``import_mode`` is 

1144 `TaskImportMode.REQUIRE_CONSISTENT_EDGES` and the edges of a task

1145 did change after import and reconfiguration. 

1146 

1147 Notes 

1148 ----- 

1149 `PipelineGraph` serialization is currently experimental and may be 

1150 removed or significantly changed in the future, with no deprecation 

1151 period. 

1152 """ 

1153 from .io import SerializedPipelineGraph 

1154 

1155 with gzip.open(stream, "rb") as uncompressed_stream: 

1156 data = json.load(uncompressed_stream) 

1157 serialized_graph = SerializedPipelineGraph.model_validate(data) 

1158 return serialized_graph.deserialize(import_mode) 

1159 

1160 @classmethod 

1161 def _read_uri( 

1162 cls, 

1163 uri: ResourcePathExpression, 

1164 import_mode: TaskImportMode = TaskImportMode.REQUIRE_CONSISTENT_EDGES, 

1165 ) -> PipelineGraph: 

1166 """Read a serialized `PipelineGraph` from a file at a URI. 

1167 

1168 Parameters 

1169 ---------- 

1170 uri : convertible to `lsst.resources.ResourcePath` 

1171 URI to a gzip-compressed JSON file containing a serialized pipeline 

1172 graph. 

1173 import_mode : `TaskImportMode`, optional 

1174 Whether to import tasks, and how to reconcile any differences 

1175 between the imported task's connections and those that were

1176 persisted with the graph. Default is to check that they are the 

1177 same. 

1178 

1179 Returns 

1180 ------- 

1181 graph : `PipelineGraph` 

1182 Deserialized pipeline graph. 

1183 

1184 Raises 

1185 ------ 

1186 PipelineGraphReadError 

1187 Raised if the serialized `PipelineGraph` is not self-consistent. 

1188 EdgesChangedError 

1189 Raised if ``import_mode`` is 

1190 `TaskImportMode.REQUIRE_CONSISTENT_EDGES` and the edges of a task

1191 did change after import and reconfiguration. 

1192 

1193 Notes 

1194 ----- 

1195 `PipelineGraph` serialization is currently experimental and may be 

1196 removed or significantly changed in the future, with no deprecation 

1197 period. 

1198 """ 

1199 uri = ResourcePath(uri) 

1200 with uri.open("rb") as stream: 

1201 return cls._read_stream(cast(BinaryIO, stream), import_mode=import_mode) 

1202 

1203 def _write_stream(self, stream: BinaryIO) -> None: 

1204 """Write the pipeline to a file-like object. 

1205 

1206 Parameters 

1207 ---------- 

1208 stream : `BinaryIO`

1209 File-like object opened for binary writing. 

1210 

1211 Notes 

1212 ----- 

1213 `PipelineGraph` serialization is currently experimental and may be 

1214 removed or significantly changed in the future, with no deprecation 

1215 period. 

1216 

1217 The file format is gzipped JSON, and is intended to be human-readable, 

1218 but it should not be considered a stable public interface for outside 

1219 code, which should always use `PipelineGraph` methods (or at least the 

1220 `io.SerializedPipelineGraph` class) to read these files. 

1221 """ 

1222 from .io import SerializedPipelineGraph 

1223 

1224 with gzip.open(stream, mode="wb") as compressed_stream: 

1225 compressed_stream.write( 

1226 SerializedPipelineGraph.serialize(self).model_dump_json(exclude_defaults=True).encode("utf-8") 

1227 ) 

1228 

1229 def _write_uri(self, uri: ResourcePathExpression) -> None: 

1230 """Write the pipeline to a file given a URI. 

1231 

1232 Parameters 

1233 ---------- 

1234 uri : convertible to `lsst.resources.ResourcePath` 

1235 URI to write to. May have ``.json.gz`` or no extension (which

1236 will cause a ``.json.gz`` extension to be added). 

1237 

1238 Notes 

1239 ----- 

1240 `PipelineGraph` serialization is currently experimental and may be 

1241 removed or significantly changed in the future, with no deprecation 

1242 period. 

1243 

1244 The file format is gzipped JSON, and is intended to be human-readable, 

1245 but it should not be considered a stable public interface for outside 

1246 code, which should always use `PipelineGraph` methods (or at least the 

1247 `io.SerializedPipelineGraph` class) to read these files. 
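
Examples
--------
A sketch of a save/load round trip; the path is illustrative and the
underscore prefix marks this as an experimental interface::

    graph._write_uri("/tmp/pipeline_graph.json.gz")
    restored = PipelineGraph._read_uri("/tmp/pipeline_graph.json.gz")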

1248 """ 

1249 uri = ResourcePath(uri) 

1250 extension = uri.getExtension() 

1251 if not extension: 

1252 uri = uri.updatedExtension(".json.gz") 

1253 elif extension != ".json.gz": 

1254 raise ValueError("Expanded pipeline files should always have a .json.gz extension.") 

1255 with uri.open(mode="wb") as stream: 

1256 self._write_stream(cast(BinaryIO, stream)) 

1257 

1258 def _import_and_configure( 

1259 self, import_mode: TaskImportMode = TaskImportMode.REQUIRE_CONSISTENT_EDGES 

1260 ) -> None: 

1261 """Import the `PipelineTask` classes referenced by all task nodes and 

1262 update those nodes accordingly. 

1263 

1264 Parameters 

1265 ---------- 

1266 import_mode : `TaskImportMode`, optional 

1267 Whether to import tasks, and how to reconcile any differences 

1268 between the imported task's connections and those that were

1269 persisted with the graph. Default is to check that they are the 

1270 same. This method does nothing if this is 

1271 `TaskImportMode.DO_NOT_IMPORT`. 

1272 

1273 Raises 

1274 ------ 

1275 EdgesChangedError 

1276 Raised if ``import_mode`` is 

1277 `TaskImportMode.REQUIRE_CONSISTENT_EDGES` and the edges of a task

1278 did change after import and reconfiguration. 

1279 

1280 Notes 

1281 ----- 

1282 This method shouldn't need to be called unless the graph was 

1283 deserialized without importing and configuring immediately, which is 

1284 not the default behavior (but it can greatly speed up deserialization). 

1285 If all tasks have already been imported this does nothing. 

1286 

1287 Importing and configuring a task can change its 

1288 `~TaskNode.task_class_name` or `~TaskNode.get_config_str` output,

1289 usually because the software used to read a serialized graph is newer 

1290 than the software used to write it (e.g. a new config option has been 

1291 added, or the task was moved to a new module with a forwarding alias 

1292 left behind). These changes are allowed by 

1293 `TaskImportMode.REQUIRE_CONSISTENT_EDGES`. 

1294 

1295 If importing and configuring a task causes its edges to change, any 

1296 dataset type nodes linked to those edges will be reset to the 

1297 unresolved state. 

1298 """ 

1299 if import_mode is TaskImportMode.DO_NOT_IMPORT: 

1300 return 

1301 rebuild = ( 

1302 import_mode is TaskImportMode.REQUIRE_CONSISTENT_EDGES 

1303 or import_mode is TaskImportMode.OVERRIDE_EDGES 

1304 ) 

1305 updates: dict[str, TaskNode] = {} 

1306 node_key: NodeKey 

1307 for node_key, node_state in self._xgraph.nodes.items(): 

1308 if node_key.node_type is NodeType.TASK: 

1309 task_node: TaskNode = node_state["instance"] 

1310 new_task_node = task_node._imported_and_configured(rebuild) 

1311 if new_task_node is not task_node: 

1312 updates[task_node.label] = new_task_node 

1313 self._replace_task_nodes( 

1314 updates, 

1315 check_edges_unchanged=(import_mode is TaskImportMode.REQUIRE_CONSISTENT_EDGES), 

1316 assume_edges_unchanged=(import_mode is TaskImportMode.ASSUME_CONSISTENT_EDGES), 

1317 message_header=( 

1318 "In task with label {task_label!r}, persisted edges (A)" 

1319 "differ from imported and configured edges (B):" 

1320 ), 

1321 ) 

1322 

1323 ########################################################################### 

1324 # 

1325 # Advanced PipelineGraph Inspection Interface: 

1326 # 

1327 # - methods to iterate over all nodes and edges, utilizing NodeKeys; 

1328 # 

1329 # - methods to find overall inputs and group nodes by their dimensions, 

1330 # which are important operations for QuantumGraph generation. 

1331 # 

1332 ########################################################################### 

1333 

1334 def iter_edges(self, init: bool = False) -> Iterator[Edge]: 

1335 """Iterate over edges in the graph. 

1336 

1337 Parameters 

1338 ---------- 

1339 init : `bool`, optional 

1340 If `True` (`False` is default) iterate over the edges between task 

1341 initialization node and init input/output dataset types, instead of 

1342 the runtime task nodes and regular input/output/prerequisite 

1343 dataset types. 

1344 

1345 Returns 

1346 ------- 

1347 edges : `~collections.abc.Iterator` [ `Edge` ] 

1348 A lazy iterator over `Edge` (`WriteEdge` or `ReadEdge`) instances. 

1349 

1350 Notes 

1351 ----- 

1352 This method always returns *either* init edges or runtime edges, never

1353 both. The full (internal) graph that contains both also includes a 

1354 special edge that connects each task init node to its runtime node; 

1355 that is also never returned by this method, since it is never a part of 

1356 the init-only or runtime-only subgraphs. 

1357 """ 

1358 edge: Edge 

1359 for _, _, edge in self._xgraph.edges(data="instance"): 

1360 if edge is not None and edge.is_init == init: 

1361 yield edge 

1362 

1363 def iter_nodes( 

1364 self, 

1365 ) -> Iterator[ 

1366 tuple[Literal[NodeType.TASK_INIT], str, TaskInitNode] 

1367 | tuple[Literal[NodeType.TASK], str, TaskNode]

1368 | tuple[Literal[NodeType.DATASET_TYPE], str, DatasetTypeNode | None] 

1369 ]: 

1370 """Iterate over nodes in the graph. 

1371 

1372 Returns 

1373 ------- 

1374 nodes : `~collections.abc.Iterator` [ `tuple` ] 

1375 A lazy iterator over all of the nodes in the graph. Each yielded 

1376 element is a tuple of: 

1377 

1378 - the node type enum value (`NodeType`); 

1379 - the string name for the node (task label or parent dataset type 

1380 name); 

1381 - the node value (`TaskNode`, `TaskInitNode`, `DatasetTypeNode`, 

1382 or `None` for dataset type nodes that have not been resolved). 

1383 """ 

1384 key: NodeKey 

1385 if self._sorted_keys is not None: 

1386 for key in self._sorted_keys: 

1387 yield key.node_type, key.name, self._xgraph.nodes[key]["instance"] # type: ignore 

1388 else: 

1389 for key, node in self._xgraph.nodes(data="instance"): 

1390 yield key.node_type, key.name, node # type: ignore 

1391 

1392 def iter_overall_inputs(self) -> Iterator[tuple[str, DatasetTypeNode | None]]: 

1393 """Iterate over all of the dataset types that are consumed but not 

1394 produced by the graph. 

1395 

1396 Returns 

1397 ------- 

1398 dataset_types : `~collections.abc.Iterator` [ `tuple` ] 

1399 A lazy iterator over the overall-input dataset types (including 

1400 overall init inputs and prerequisites). Each yielded element is a 

1401 tuple of: 

1402 

1403 - the parent dataset type name; 

1404 - the resolved `DatasetTypeNode`, or `None` if the dataset type has 

1405 not been resolved.
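
Examples
--------
A sketch that reports the resolution state of each overall input::

    for name, node in graph.iter_overall_inputs():
        print(name, "resolved" if node is not None else "unresolved")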

1406 """ 

1407 for generation in networkx.algorithms.dag.topological_generations(self._xgraph): 

1408 key: NodeKey 

1409 for key in generation: 

1410 # While we expect all tasks to have at least one input and 

1411 # hence never appear in the first topological generation, that 

1412 # is not true of task init nodes. 

1413 if key.node_type is NodeType.DATASET_TYPE: 

1414 yield key.name, self._xgraph.nodes[key]["instance"] 

1415 return 

1416 

1417 def group_by_dimensions( 

1418 self, prerequisites: bool = False 

1419 ) -> dict[DimensionGroup, tuple[dict[str, TaskNode], dict[str, DatasetTypeNode]]]: 

1420 """Group this graph's tasks and dataset types by their dimensions. 

1421 

1422 Parameters 

1423 ---------- 

1424 prerequisites : `bool`, optional 

1425 If `True`, include prerequisite dataset types as well as regular 

1426 input and output datasets (including intermediates). 

1427 

1428 Returns 

1429 ------- 

1430 groups : `dict` [ `DimensionGroup`, `tuple` ] 

1431 A dictionary of groups keyed by `DimensionGroup`, in which each 

1432 value is a tuple of: 

1433 

1434 - a `dict` of `TaskNode` instances, keyed by task label 

1435 - a `dict` of `DatasetTypeNode` instances, keyed by 

1436 dataset type name. 

1437 

1438 that have those dimensions. 

1439 

1440 Notes 

1441 ----- 

1442 Init inputs and outputs are always included, but always have empty 

1443 dimensions and are hence all grouped together. 

1444 """ 

1445 result: dict[DimensionGroup, tuple[dict[str, TaskNode], dict[str, DatasetTypeNode]]] = {} 

1446 next_new_value: tuple[dict[str, TaskNode], dict[str, DatasetTypeNode]] = ({}, {}) 

1447 for task_label, task_node in self.tasks.items(): 

1448 if task_node.dimensions is None: 

1449 raise UnresolvedGraphError(f"Task with label {task_label!r} has not been resolved.") 

1450 if (group := result.setdefault(task_node.dimensions, next_new_value)) is next_new_value: 

1451 next_new_value = ({}, {}) # make new dicts for next time 

1452 group[0][task_node.label] = task_node 

1453 for dataset_type_name, dataset_type_node in self.dataset_types.items(): 

1454 if dataset_type_node is None: 

1455 raise UnresolvedGraphError(f"Dataset type {dataset_type_name!r} has not been resolved.") 

1456 if not dataset_type_node.is_prerequisite or prerequisites: 

1457 if ( 

1458 group := result.setdefault( 

1459 dataset_type_node.dataset_type.dimensions.as_group(), next_new_value 

1460 ) 

1461 ) is next_new_value: 

1462 next_new_value = ({}, {}) # make new dicts for next time 

1463 group[1][dataset_type_node.name] = dataset_type_node 

1464 return result 

1465 
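# The ``setdefault`` sentinel idiom used above, in isolation (sketch): the
# pre-built fresh value is only replaced when ``setdefault`` actually
# inserted it, so we avoid constructing and discarding a value per key.
def _example_setdefault_sentinel() -> None:
    result: dict[str, list[int]] = {}
    fresh: list[int] = []
    for key, item in [("a", 1), ("a", 2), ("b", 3)]:
        if (bucket := result.setdefault(key, fresh)) is fresh:
            fresh = []  # only "spend" the sentinel when it was inserted
        bucket.append(item)
    print(result)  # {'a': [1, 2], 'b': [3]}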

1466 def split_independent(self) -> Iterable[PipelineGraph]: 

1467 """Iterate over independent subgraphs that together comprise this 

1468 pipeline graph. 

1469 

1470 Returns 

1471 ------- 

1472 subgraphs : `Iterable` [ `PipelineGraph` ] 

1473 An iterable over component subgraphs that could be run 

1474 independently (they have only overall inputs in common). May be a 

1475 lazy iterator. 

1476 

1477 Notes 

1478 ----- 

1479 All resolved dataset type nodes will be preserved. 

1480 

1481 If there is only one component, ``self`` may be returned as the only 

1482 element in the iterable. 

1483 

1484 If `has_been_sorted` is `True`, all subgraphs will be sorted as well. 

1485 """ 

1486 # Having an overall input in common isn't enough to make subgraphs 

1487 # dependent on each other, so we want to look for connected component 

1488 # subgraphs of the task-only projected graph. 

1489 bipartite_xgraph = self._make_bipartite_xgraph_internal(init=False) 

1490 task_keys = { 

1491 key 

1492 for key, bipartite in bipartite_xgraph.nodes(data="bipartite") 

1493 if bipartite == NodeType.TASK.bipartite 

1494 } 

1495 task_xgraph = networkx.algorithms.bipartite.projected_graph( 

1496 networkx.DiGraph(bipartite_xgraph), task_keys 

1497 ) 

1498 # "Weakly" connected means connected in only one direction, which is 

1499 # the only kind of "connected" a DAG can ever be. 

1500 for component_task_keys in networkx.algorithms.weakly_connected_components(task_xgraph): 

1501 if component_task_keys == task_keys: 

1502 yield self 

1503 return 

1504 else: 

1505 component_subgraph = PipelineGraph(universe=self._universe) 

1506 component_subgraph.add_task_nodes( 

1507 [self._xgraph.nodes[key]["instance"] for key in component_task_keys], parent=self 

1508 ) 

1509 if self.has_been_sorted: 

1510 component_subgraph.sort() 

1511 yield component_subgraph 

1512 
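# Weak connectivity in miniature (sketch): two disjoint task chains form two
# components, which is why sharing only overall inputs (not task-to-task
# edges) never merges subgraphs here.
def _example_weak_components() -> None:
    toy = networkx.DiGraph([("a1", "a2"), ("b1", "b2")])
    components = list(networkx.algorithms.weakly_connected_components(toy))
    print(len(components))  # 2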

1513 ########################################################################### 

1514 # 

1515 # Class- and Package-Private Methods. 

1516 # 

1517 ########################################################################### 

1518 

1519 def _iter_task_defs(self) -> Iterator[TaskDef]: 

1520 """Iterate over this pipeline as a sequence of `TaskDef` instances. 

1521 

1522 Notes 

1523 ----- 

1524 This is a package-private method intended to aid in the transition to a 

1525 codebase more fully integrated with the `PipelineGraph` class, in which 

1526 both `TaskDef` and `PipelineDatasetTypes` are expected to go away, and 

1527 much of the functionality on the `Pipeline` class will be moved to 

1528 `PipelineGraph` as well. 

1529 

1530 Raises 

1531 ------ 

1532 TaskNotImportedError 

1533 Raised if `TaskNode.is_imported` is `False` for any task. 

1534 """ 

1535 from ..pipeline import TaskDef 

1536 

1537 for node in self._tasks.values(): 

1538 yield TaskDef( 

1539 config=node.config, 

1540 taskClass=node.task_class, 

1541 label=node.label, 

1542 connections=node.get_connections(), 

1543 ) 

1544 
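# Minimal consumption sketch for this transitional API (``graph`` is assumed
# to hold fully imported tasks; otherwise TaskNotImportedError propagates):
def _example_task_defs(graph: PipelineGraph) -> list[TaskDef]:
    return list(graph._iter_task_defs())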

1545 def _init_from_args( 

1546 self, 

1547 xgraph: networkx.MultiDiGraph | None, 

1548 sorted_keys: Sequence[NodeKey] | None, 

1549 task_subsets: dict[str, TaskSubset] | None, 

1550 description: str, 

1551 universe: DimensionUniverse | None, 

1552 data_id: DataId | None, 

1553 ) -> None: 

1554 """Initialize the graph with possibly-nontrivial arguments. 

1555 

1556 Parameters 

1557 ---------- 

1558 xgraph : `networkx.MultiDiGraph` or `None` 

1559 The backing networkx graph, or `None` to create an empty one. 

1560 This graph has `NodeKey` instances for nodes and the same structure 

1561 as the graph exported by `make_xgraph`, but its nodes and edges 

1562 have a single ``instance`` attribute that holds a `TaskNode`, 

1563 `TaskInitNode`, `DatasetTypeNode` (or `None`), `ReadEdge`, or 

1564 `WriteEdge` instance. 

1565 sorted_keys : `Sequence` [ `NodeKey` ] or `None` 

1566 Topologically sorted sequence of node keys, or `None` if the graph 

1567 is not sorted. 

1568 task_subsets : `dict` [ `str`, `TaskSubset` ] or `None` 

1569 Labeled subsets of tasks. Values must be constructed with 

1570 ``xgraph`` as their parent graph. 

1571 description : `str` 

1572 String description for this pipeline. 

1573 universe : `lsst.daf.butler.DimensionUniverse` or `None` 

1574 Definitions of all dimensions. 

1575 data_id : `lsst.daf.butler.DataCoordinate`, other data ID mapping, or `None` 

1576 Data ID that represents a constraint on all quanta generated from 

1577 this pipeline. 

1578 

1579 Notes 

1580 ----- 

1581 Only empty `PipelineGraph` instances should be constructed directly by 

1582 users (which is why ``__init__`` itself accepts no graph state), but 

1583 methods on `PipelineGraph` and its helper classes need to be able to 

1584 create instances with state. Those methods can call this after calling ``__new__`` 

1585 manually, skipping ``__init__``. 

1586 """ 

1587 self._xgraph = xgraph if xgraph is not None else networkx.MultiDiGraph() 

1588 self._sorted_keys: Sequence[NodeKey] | None = None 

1589 self._task_subsets = task_subsets if task_subsets is not None else {} 

1590 self._description = description 

1591 self._tasks = TaskMappingView(self._xgraph) 

1592 self._dataset_types = DatasetTypeMappingView(self._xgraph) 

1593 self._raw_data_id: dict[str, Any] 

1594 if isinstance(data_id, DataCoordinate): 

1595 if universe is None: 

1596 universe = data_id.universe 

1597 else: 

1598 assert universe is data_id.universe, "data_id.universe and given universe differ" 

1599 self._raw_data_id = dict(data_id.required) 

1600 elif data_id is None: 

1601 self._raw_data_id = {} 

1602 else: 

1603 self._raw_data_id = dict(data_id) 

1604 self._universe = universe 

1605 if sorted_keys is not None: 

1606 self._reorder(sorted_keys) 

1607 
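# The data ID normalization above, in isolation (sketch): a DataCoordinate
# carries its own universe and a ``required`` mapping, while plain mappings
# are copied and ``None`` becomes an empty dict.
def _example_normalize_data_id(data_id: DataId | None) -> dict[str, Any]:
    if isinstance(data_id, DataCoordinate):
        return dict(data_id.required)
    return {} if data_id is None else dict(data_id)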

1608 def _make_bipartite_xgraph_internal(self, init: bool) -> networkx.MultiDiGraph: 

1609 """Make a bipartite init-only or runtime-only internal subgraph. 

1610 

1611 See `make_bipartite_xgraph` for parameters and return values. 

1612 

1613 Notes 

1614 ----- 

1615 This method returns a view of the `PipelineGraph` object's internal 

1616 backing graph, and hence should only be called in methods that copy the 

1617 result either explicitly or by running a copying algorithm before 

1618 returning it to the user. 

1619 """ 

1620 return self._xgraph.edge_subgraph([edge.key for edge in self.iter_edges(init)]) 

1621 
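# ``edge_subgraph`` semantics in miniature (sketch): the result is a live,
# read-only view containing only the listed (u, v, key) edges and their
# incident nodes -- hence the warning above about copying before export.
def _example_edge_subgraph() -> None:
    g = networkx.MultiDiGraph()
    g.add_edge("t", "d", key="out")
    g.add_edge("t", "e", key="other")
    view = g.edge_subgraph([("t", "d", "out")])
    print(sorted(view.nodes))  # ['d', 't'] -- 'e' is filtered out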

1622 def _transform_xgraph_state(self, xgraph: _G, skip_edges: bool) -> _G: 

1623 """Transform networkx graph attributes in-place from the internal 

1624 "instance" attributes to the documented exported attributes. 

1625 

1626 Parameters 

1627 ---------- 

1628 xgraph : `networkx.DiGraph` or `networkx.MultiDiGraph` 

1629 Graph whose state should be transformed. 

1630 skip_edges : `bool` 

1631 If `True`, do not transform edge state. 

1632 

1633 Returns 

1634 ------- 

1635 xgraph : `networkx.DiGraph` or `networkx.MultiDiGraph` 

1636 The same object passed in, after modification. 

1637 

1638 Notes 

1639 ----- 

1640 This should be called after making a copy of the internal graph but 

1641 before any projection down to just task or dataset type nodes, since 

1642 it assumes stateful edges. 

1643 """ 

1644 state: dict[str, Any] 

1645 for state in xgraph.nodes.values(): 

1646 node_value: TaskInitNode | TaskNode | DatasetTypeNode | None = state.pop("instance") 

1647 if node_value is not None: 

1648 state.update(node_value._to_xgraph_state()) 

1649 else: 

1650 # This is a dataset type node that is not resolved. 

1651 state["bipartite"] = NodeType.DATASET_TYPE.bipartite 

1652 if not skip_edges: 

1653 for _, _, state in xgraph.edges(data=True): 

1654 edge: Edge | None = state.pop("instance", None) 

1655 if edge is not None: 

1656 state.update(edge._to_xgraph_state()) 

1657 return xgraph 

1658 
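# The flattening pattern above in miniature (sketch; ``ToyNode`` stands in
# for the real node classes and their private export hook):
def _example_flatten_instance_attributes() -> None:
    class ToyNode:
        def _to_xgraph_state(self) -> dict[str, Any]:
            return {"bipartite": 0, "label": "t"}

    g = networkx.DiGraph()
    g.add_node("t", instance=ToyNode())
    for state in g.nodes.values():
        node = state.pop("instance")
        if node is not None:
            state.update(node._to_xgraph_state())
    print(dict(g.nodes["t"]))  # {'bipartite': 0, 'label': 't'}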

1659 def _replace_task_nodes( 

1660 self, 

1661 updates: Mapping[str, TaskNode], 

1662 check_edges_unchanged: bool, 

1663 assume_edges_unchanged: bool, 

1664 message_header: str, 

1665 ) -> None: 

1666 """Replace task nodes and update edges and dataset type nodes 

1667 accordingly. 

1668 

1669 Parameters 

1670 ---------- 

1671 updates : `Mapping` [ `str`, `TaskNode` ] 

1672 New task nodes with task label keys. All keys must be task labels 

1673 that are already present in the graph. 

1674 check_edges_unchanged : `bool`, optional 

1675 If `True`, require the edges (connections) of the modified tasks to 

1676 remain unchanged after importing and configuring each task, and 

1677 verify that this is the case. 

1678 assume_edges_unchanged : `bool`, optional 

1679 If `True`, the caller declares that the edges (connections) of the 

1680 modified tasks will remain unchanged after importing and configuring 

1681 each task, and that it is unnecessary to check this. 

1682 message_header : `str` 

1683 Template for `str.format` with a single ``task_label`` placeholder 

1684 to use as the first line in `EdgesChangedError` messages that show 

1685 the differences between new task edges and old task edges. Should 

1686 include the fact that the rest of the message will refer to the old 

1687 task as "A" and the new task as "B", and end with a colon. 

1688 

1689 Raises 

1690 ------ 

1691 ValueError 

1692 Raised if ``assume_edges_unchanged`` and ``check_edges_unchanged`` 

1693 are both `True`, or if a full config is provided for a task after 

1694 another full config or an override has already been provided. 

1695 EdgesChangedError 

1696 Raised if ``check_edges_unchanged=True`` and the edges of a task do 

1697 change. 

1698 """ 

1699 deep: dict[str, TaskNode] = {} 

1700 shallow: dict[str, TaskNode] = {} 

1701 if assume_edges_unchanged: 

1702 if check_edges_unchanged: 

1703 raise ValueError("Cannot simultaneously assume and check that edges have not changed.") 

1704 shallow.update(updates) 

1705 else: 

1706 for task_label, new_task_node in updates.items(): 

1707 old_task_node = self.tasks[task_label] 

1708 messages = old_task_node.diff_edges(new_task_node) 

1709 if messages: 

1710 if check_edges_unchanged: 

1711 messages.insert(0, message_header.format(task_label=task_label)) 

1712 raise EdgesChangedError("\n".join(messages)) 

1713 else: 

1714 deep[task_label] = new_task_node 

1715 else: 

1716 shallow[task_label] = new_task_node 

1717 try: 

1718 if deep: 

1719 removed = self.remove_tasks(deep.keys(), drop_from_subsets=True) 

1720 self.add_task_nodes(deep.values()) 

1721 for replaced_task_node, referencing_subsets in removed: 

1722 for subset_label in referencing_subsets: 

1723 self._task_subsets[subset_label].add(replaced_task_node.label) 

1724 for task_node in shallow.values(): 

1725 self._xgraph.nodes[task_node.key]["instance"] = task_node 

1726 self._xgraph.nodes[task_node.init.key]["instance"] = task_node.init 

1727 except PipelineGraphExceptionSafetyError: # pragma: no cover 

1728 raise 

1729 except Exception as err: # pragma: no cover 

1730 # There's no known way to get here, but we want to make it clear 

1731 # it's a big problem if we do. 

1732 raise PipelineGraphExceptionSafetyError( 

1733 "Error while replacing tasks has left the graph in an inconsistent state." 

1734 ) from err 

1735 

1736 def _append_graph_data_from_edge( 

1737 self, 

1738 node_data: list[tuple[NodeKey, dict[str, Any]]], 

1739 edge_data: list[tuple[NodeKey, NodeKey, str, dict[str, Any]]], 

1740 edge: Edge, 

1741 parent: PipelineGraph | None, 

1742 ) -> None: 

1743 """Append networkx state dictionaries for an edge and the corresponding 

1744 dataset type node. 

1745 

1746 Parameters 

1747 ---------- 

1748 node_data : `list` 

1749 List of node keys and state dictionaries. A node is appended if 

1750 one does not already exist for this dataset type. 

1751 edge_data : `list` 

1752 List of node key pairs, connection names, and state dictionaries 

1753 for edges. 

1754 edge : `Edge` 

1755 New edge being processed. 

1756 parent : `PipelineGraph` or `None` 

1757 Another pipeline graph whose dataset type nodes should be used 

1758 when present. 

1759 """ 

1760 new_dataset_type_node = None 

1761 if parent is not None: 

1762 new_dataset_type_node = parent._xgraph.nodes[edge.dataset_type_key].get("instance") 

1763 if (existing_dataset_type_state := self._xgraph.nodes.get(edge.dataset_type_key)) is not None: 

1764 existing_dataset_type_state["instance"] = new_dataset_type_node 

1765 else: 

1766 node_data.append( 

1767 ( 

1768 edge.dataset_type_key, 

1769 { 

1770 "instance": new_dataset_type_node, 

1771 "bipartite": NodeType.DATASET_TYPE.bipartite, 

1772 }, 

1773 ) 

1774 ) 

1775 edge_data.append( 

1776 edge.nodes 

1777 + ( 

1778 edge.connection_name, 

1779 {"instance": edge}, 

1780 ) 

1781 ) 

1782 
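# How the accumulated tuples are meant to be consumed (sketch): networkx
# bulk-add methods accept (node, attrs) 2-tuples and, on a MultiDiGraph,
# (u, v, key, attrs) 4-tuples directly.
def _example_bulk_add(
    node_data: list[tuple[NodeKey, dict[str, Any]]],
    edge_data: list[tuple[NodeKey, NodeKey, str, dict[str, Any]]],
) -> networkx.MultiDiGraph:
    g = networkx.MultiDiGraph()
    g.add_nodes_from(node_data)
    g.add_edges_from(edge_data)
    return g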

1783 def _reorder(self, sorted_keys: Sequence[NodeKey]) -> None: 

1784 """Set the order of all views of this graph from the given sorted 

1785 sequence of task labels and dataset type names. 

1786 """ 

1787 self._sorted_keys = sorted_keys 

1788 self._tasks._reorder(sorted_keys) 

1789 self._dataset_types._reorder(sorted_keys) 

1790 

1791 def _reset(self) -> None: 

1792 """Reset the all views of this graph following a modification that 

1793 might invalidate them. 

1794 """ 

1795 self._sorted_keys = None 

1796 self._tasks._reset() 

1797 self._dataset_types._reset() 

1798 

1799 _xgraph: networkx.MultiDiGraph 

1800 _sorted_keys: Sequence[NodeKey] | None 

1801 _task_subsets: dict[str, TaskSubset] 

1802 _description: str 

1803 _tasks: TaskMappingView 

1804 _dataset_types: DatasetTypeMappingView 

1805 _raw_data_id: dict[str, Any] 

1806 _universe: DimensionUniverse | None