Coverage for python/lsst/pipe/base/pipeline_graph/_pipeline_graph.py: 19% (407 statements)

# This file is part of pipe_base.
#
# Developed for the LSST Data Management System.
# This product includes software developed by the LSST Project
# (http://www.lsst.org).
# See the COPYRIGHT file at the top-level directory of this distribution
# for details of code ownership.
#
# This software is dual licensed under the GNU General Public License and also
# under a 3-clause BSD license. Recipients may choose which of these licenses
# to use; please see the files gpl-3.0.txt and/or bsd_license.txt,
# respectively. If you choose the GPL option then the following text applies
# (but note that there is still no warranty even if you opt for BSD instead):
#
# This program is free software: you can redistribute it and/or modify
# it under the terms of the GNU General Public License as published by
# the Free Software Foundation, either version 3 of the License, or
# (at your option) any later version.
#
# This program is distributed in the hope that it will be useful,
# but WITHOUT ANY WARRANTY; without even the implied warranty of
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
# GNU General Public License for more details.
#
# You should have received a copy of the GNU General Public License
# along with this program. If not, see <http://www.gnu.org/licenses/>.

from __future__ import annotations

__all__ = ("PipelineGraph",)

import gzip
import itertools
import json
from collections.abc import Callable, Iterable, Iterator, Mapping, Sequence, Set
from typing import TYPE_CHECKING, Any, BinaryIO, Literal, TypeVar, cast

import networkx
import networkx.algorithms.bipartite
import networkx.algorithms.dag
from lsst.daf.butler import DataCoordinate, DataId, DatasetType, DimensionGroup, DimensionUniverse, Registry
from lsst.daf.butler.registry import MissingDatasetTypeError
from lsst.resources import ResourcePath, ResourcePathExpression

from ._dataset_types import DatasetTypeNode
from ._edges import Edge, ReadEdge, WriteEdge
from ._exceptions import (
    DuplicateOutputError,
    EdgesChangedError,
    PipelineDataCycleError,
    PipelineGraphError,
    PipelineGraphExceptionSafetyError,
    UnresolvedGraphError,
)
from ._mapping_views import DatasetTypeMappingView, TaskMappingView
from ._nodes import NodeKey, NodeType
from ._task_subsets import TaskSubset
from ._tasks import TaskImportMode, TaskInitNode, TaskNode, _TaskNodeImportedData

if TYPE_CHECKING:
    from ..config import PipelineTaskConfig
    from ..connections import PipelineTaskConnections
    from ..pipeline import TaskDef
    from ..pipelineTask import PipelineTask


_G = TypeVar("_G", bound=networkx.DiGraph | networkx.MultiDiGraph)


class PipelineGraph:
    """A graph representation of a fully-configured pipeline.

    `PipelineGraph` instances are typically constructed by calling
    `.Pipeline.to_graph`, but in rare cases constructing and then populating an
    empty one may be preferable.

    Parameters
    ----------
    description : `str`, optional
        String description for this pipeline.
    universe : `lsst.daf.butler.DimensionUniverse`, optional
        Definitions for all butler dimensions. If not provided, some
        attributes will not be available until `resolve` is called.
    data_id : `lsst.daf.butler.DataCoordinate` or other data ID, optional
        Data ID that represents a constraint on all quanta generated by this
        pipeline. This typically just holds the instrument constraint included
        in the pipeline definition, if there was one.
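
    Examples
    --------
    A minimal, illustrative sketch (``pipeline`` and ``butler`` are assumed
    to be a configured `.Pipeline` and an `lsst.daf.butler.Butler`; this is
    not runnable as-is)::

        graph = pipeline.to_graph()
        graph.resolve(registry=butler.registry)
        for label, task_node in graph.tasks.items():
            print(label, task_node.task_class_name)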

    """

    ###########################################################################
    #
    # Simple Pipeline Graph Inspection Interface:
    #
    # - for inspecting graph structure, not modifying it (except to sort and
    #   resolve);
    #
    # - no NodeKey objects, just string dataset type name and task label keys;
    #
    # - graph structure is represented as a pair of mappings, with methods to
    #   find neighbors and edges of nodes.
    #
    ###########################################################################

    def __init__(
        self,
        *,
        description: str = "",
        universe: DimensionUniverse | None = None,
        data_id: DataId | None = None,
    ) -> None:
        self._init_from_args(
            xgraph=None,
            sorted_keys=None,
            task_subsets=None,
            description=description,
            universe=universe,
            data_id=data_id,
        )

    def __repr__(self) -> str:
        return f"{type(self).__name__}({self.description!r}, tasks={self.tasks!s})"

    @property
    def description(self) -> str:
        """String description for this pipeline."""
        return self._description

    @description.setter
    def description(self, value: str) -> None:
        # Docstring in setter.
        self._description = value

    @property
    def universe(self) -> DimensionUniverse | None:
        """Definitions for all butler dimensions."""
        return self._universe

    @property
    def data_id(self) -> DataCoordinate:
        """Data ID that represents a constraint on all quanta generated from
        this pipeline.

        This may not be available unless `universe` is not `None`.
        """
        return DataCoordinate.standardize(self._raw_data_id, universe=self.universe)

    @property
    def tasks(self) -> TaskMappingView:
        """A mapping view of the tasks in the graph.

        This mapping has `str` task label keys and `TaskNode` values. Iteration
        is topologically and deterministically ordered if and only if `sort`
        has been called since the last modification to the graph.
        """
        return self._tasks

    @property
    def dataset_types(self) -> DatasetTypeMappingView:
        """A mapping view of the dataset types in the graph.

        This mapping has `str` parent dataset type name keys, but only provides
        access to its `DatasetTypeNode` values if `resolve` has been called
        since the last modification involving a task that uses a dataset type.
        See `DatasetTypeMappingView` for details.
        """
        return self._dataset_types

    @property
    def task_subsets(self) -> Mapping[str, TaskSubset]:
        """A mapping of all labeled subsets of tasks.

        Keys are subset labels, values are sets of task labels. See
        `TaskSubset` for more information.

        Use `add_task_subset` to add a new subset. The subsets themselves may
        be modified in-place.
        """
        return self._task_subsets

    @property
    def is_fully_resolved(self) -> bool:
        """Whether all of this graph's nodes are resolved."""
        return self._universe is not None and all(
            self.dataset_types.is_resolved(k) for k in self.dataset_types
        )

    @property
    def is_sorted(self) -> bool:
        """Whether this graph's tasks and dataset types are topologically
        sorted with the exact same deterministic tiebreakers that `sort` would
        apply.

        This may perform (and then discard) a full sort if `has_been_sorted` is
        `False`. If the goal is to obtain a sorted graph, it is better to just
        call `sort` without guarding that with an ``if not graph.is_sorted``
        check.
        """
        if self._sorted_keys is not None:
            return True
        return all(
            sorted == unsorted
            for sorted, unsorted in zip(
                networkx.lexicographical_topological_sort(self._xgraph), self._xgraph, strict=True
            )
        )

    @property
    def has_been_sorted(self) -> bool:
        """Whether this graph's tasks and dataset types have been
        topologically sorted (with unspecified but deterministic tiebreakers)
        since the last modification to the graph.

        This may return `False` if the graph *happens* to be sorted but `sort`
        was never called, but it is potentially much faster than `is_sorted`,
        which may attempt (and then discard) a full sort if `has_been_sorted`
        is `False`.
        """
        return self._sorted_keys is not None

    def sort(self) -> None:
        """Sort this graph's nodes topologically with deterministic (but
        unspecified) tiebreakers.

        This does nothing if the graph is already known to be sorted.
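
        Examples
        --------
        An illustrative sketch (``graph`` is an assumed `PipelineGraph`)::

            graph.sort()
            assert graph.has_been_sorted
            labels_in_topological_order = list(graph.tasks)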

        """
        if self._sorted_keys is None:
            try:
                sorted_keys: Sequence[NodeKey] = list(networkx.lexicographical_topological_sort(self._xgraph))
            except networkx.NetworkXUnfeasible as err:  # pragma: no cover
                # Shouldn't be possible to get here, because we check for
                # cycles when adding tasks, but we guard against it anyway.
                cycle = networkx.find_cycle(self._xgraph)
                raise PipelineDataCycleError(
                    f"Cycle detected while attempting to sort graph: {cycle}."
                ) from err
            self._reorder(sorted_keys)

    def copy(self) -> PipelineGraph:
        """Return a copy of this graph that copies all mutable state."""
        xgraph = self._xgraph.copy()
        result = PipelineGraph.__new__(PipelineGraph)
        result._init_from_args(
            xgraph,
            self._sorted_keys,
            task_subsets={
                k: TaskSubset(xgraph, v.label, set(v._members), v.description)
                for k, v in self._task_subsets.items()
            },
            description=self._description,
            universe=self.universe,
            data_id=self._raw_data_id,
        )
        return result

    def __copy__(self) -> PipelineGraph:
        # Fully shallow copies are dangerous; we don't want shared mutable
        # state to lead to broken class invariants.
        return self.copy()

    def __deepcopy__(self, memo: dict) -> PipelineGraph:
        # Genuine deep copies are unnecessary, since we should only ever care
        # that mutable state is copied.
        return self.copy()

    def diff_tasks(self, other: PipelineGraph) -> list[str]:
        """Compare two pipeline graphs.

        This only compares graph structure and task classes (including their
        edges). It does *not* compare full configuration (which is subject to
        spurious differences due to import-cache state), dataset type
        resolutions, or sort state.

        Parameters
        ----------
        other : `PipelineGraph`
            Graph to compare to.

        Returns
        -------
        differences : `list` [ `str` ]
            List of string messages describing differences between the
            pipelines. If empty, the graphs have the same tasks and
            connections.
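
        Examples
        --------
        An illustrative sketch (``graph_a`` and ``graph_b`` are assumed
        `PipelineGraph` instances)::

            for message in graph_a.diff_tasks(graph_b):
                print(message)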

        """
        messages: list[str] = []
        common_labels: Set[str]
        if self.tasks.keys() != other.tasks.keys():
            common_labels = self.tasks.keys() & other.tasks.keys()
            messages.append(
                f"Pipelines have different tasks: A & ~B = {list(self.tasks.keys() - common_labels)}, "
                f"B & ~A = {list(other.tasks.keys() - common_labels)}."
            )
        else:
            common_labels = self.tasks.keys()
        for label in common_labels:
            a = self.tasks[label]
            b = other.tasks[label]
            if a.task_class != b.task_class:
                messages.append(
                    f"Task {label!r} has class {a.task_class_name} in A, but {b.task_class_name} in B."
                )
            messages.extend(a.diff_edges(b))
        return messages

    def producing_edge_of(self, dataset_type_name: str) -> WriteEdge | None:
        """Return the `WriteEdge` that links the producing task to the named
        dataset type.

        Parameters
        ----------
        dataset_type_name : `str`
            Dataset type name. Must not be a component.

        Returns
        -------
        edge : `WriteEdge` or `None`
            Producing edge or `None` if there isn't one in this graph.

        Raises
        ------
        DuplicateOutputError
            Raised if there are multiple tasks defined to produce this dataset
            type. This is only possible if the graph's dataset types are not
            resolved.

        Notes
        -----
        On resolved graphs, it may be slightly more efficient to use::

            graph.dataset_types[dataset_type_name].producing_edge

        but this method works on graphs with unresolved dataset types as well.
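
        Examples
        --------
        An illustrative sketch (``graph`` and the dataset type name are
        assumptions)::

            edge = graph.producing_edge_of("deepCoadd")
            if edge is not None:
                print(f"Produced by task {edge.task_label!r}.")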

        """
        producer: str | None = None
        producing_edge: WriteEdge | None = None
        for _, _, producing_edge in self._xgraph.in_edges(
            NodeKey(NodeType.DATASET_TYPE, dataset_type_name), data="instance"
        ):
            assert producing_edge is not None, "Should only be None if we never loop."
            if producer is not None:
                raise DuplicateOutputError(
                    f"Dataset type {dataset_type_name!r} is produced by both {producing_edge.task_label!r} "
                    f"and {producer!r}."
                )
            # Remember this producer so a second producing edge triggers the
            # error above.
            producer = producing_edge.task_label
        return producing_edge

    def consuming_edges_of(self, dataset_type_name: str) -> list[ReadEdge]:
        """Return the `ReadEdge` objects that link the named dataset type to
        the tasks that consume it.

        Parameters
        ----------
        dataset_type_name : `str`
            Dataset type name. Must not be a component.

        Returns
        -------
        edges : `list` [ `ReadEdge` ]
            Edges that connect this dataset type to the tasks that consume it.

        Notes
        -----
        On resolved graphs, it may be slightly more efficient to use::

            graph.dataset_types[dataset_type_name].consuming_edges

        but this method works on graphs with unresolved dataset types as well.
        """
        return [
            edge
            for _, _, edge in self._xgraph.out_edges(
                NodeKey(NodeType.DATASET_TYPE, dataset_type_name), data="instance"
            )
        ]

    def producer_of(self, dataset_type_name: str) -> TaskNode | TaskInitNode | None:
        """Return the `TaskNode` or `TaskInitNode` that writes the given
        dataset type.

        Parameters
        ----------
        dataset_type_name : `str`
            Dataset type name. Must not be a component.

        Returns
        -------
        node : `TaskNode`, `TaskInitNode`, or `None`
            Producing node or `None` if there isn't one in this graph.

        Raises
        ------
        DuplicateOutputError
            Raised if there are multiple tasks defined to produce this dataset
            type. This is only possible if the graph's dataset types are not
            resolved.
        """
        if (producing_edge := self.producing_edge_of(dataset_type_name)) is not None:
            return self._xgraph.nodes[producing_edge.task_key]["instance"]
        return None

    def consumers_of(self, dataset_type_name: str) -> list[TaskNode | TaskInitNode]:
        """Return the `TaskNode` and/or `TaskInitNode` objects that read
        the given dataset type.

        Parameters
        ----------
        dataset_type_name : `str`
            Dataset type name. Must not be a component.

        Returns
        -------
        nodes : `list` [ `TaskNode` or `TaskInitNode` ]
            Nodes for the tasks that consume this dataset type.

        Notes
        -----
        On resolved graphs, it may be slightly more efficient to use::

            graph.dataset_types[dataset_type_name].consuming_edges

        but this method works on graphs with unresolved dataset types as well.
        """
        return [
            self._xgraph.nodes[consuming_edge.task_key]["instance"]
            for consuming_edge in self.consuming_edges_of(dataset_type_name)
        ]

    def inputs_of(self, task_label: str, init: bool = False) -> dict[str, DatasetTypeNode | None]:
        """Return the dataset types that are inputs to a task.

        Parameters
        ----------
        task_label : `str`
            Label for the task in the pipeline.
        init : `bool`, optional
            If `True`, return init-input dataset types instead of runtime
            (including prerequisite) inputs.

        Returns
        -------
        inputs : `dict` [ `str`, `DatasetTypeNode` or `None` ]
            Dictionary with parent dataset type name keys and either
            `DatasetTypeNode` values (if the dataset type has been resolved)
            or `None` values.

        Notes
        -----
        To get the input edges of a task or task init node (which provide
        information about storage class overrides and components) use::

            graph.tasks[task_label].iter_all_inputs()

        or

            graph.tasks[task_label].init.iter_all_inputs()

        or the various mapping attributes of the `TaskNode` and `TaskInitNode`
        classes.
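
        Examples
        --------
        An illustrative sketch (``graph`` and the task label are
        assumptions)::

            for name, node in graph.inputs_of("makeWarp").items():
                status = "resolved" if node is not None else "unresolved"
                print(name, status)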

458 """ 

459 node: TaskNode | TaskInitNode = self.tasks[task_label] if not init else self.tasks[task_label].init 

460 return { 

461 edge.parent_dataset_type_name: self._xgraph.nodes[edge.dataset_type_key]["instance"] 

462 for edge in node.iter_all_inputs() 

463 } 

    def outputs_of(
        self, task_label: str, init: bool = False, include_automatic_connections: bool = True
    ) -> dict[str, DatasetTypeNode | None]:
        """Return the dataset types that are outputs of a task.

        Parameters
        ----------
        task_label : `str`
            Label for the task in the pipeline.
        init : `bool`, optional
            If `True`, return init-output dataset types instead of runtime
            outputs.
        include_automatic_connections : `bool`, optional
            Whether to include automatic connections such as configs, metadata,
            and logs.

        Returns
        -------
        outputs : `dict` [ `str`, `DatasetTypeNode` or `None` ]
            Dictionary with parent dataset type name keys and either
            `DatasetTypeNode` values (if the dataset type has been resolved)
            or `None` values.

        Notes
        -----
        To get the output edges of a task or task init node (which provide
        information about storage class overrides and components) use::

            graph.tasks[task_label].iter_all_outputs()

        or

            graph.tasks[task_label].init.iter_all_outputs()

        or the various mapping attributes of the `TaskNode` and `TaskInitNode`
        classes.
        """
        node: TaskNode | TaskInitNode = self.tasks[task_label] if not init else self.tasks[task_label].init
        iterable = node.iter_all_outputs() if include_automatic_connections else node.outputs.values()
        return {
            edge.parent_dataset_type_name: self._xgraph.nodes[edge.dataset_type_key]["instance"]
            for edge in iterable
        }

    def resolve(
        self,
        registry: Registry | None = None,
        dimensions: DimensionUniverse | None = None,
        dataset_types: Mapping[str, DatasetType] | None = None,
    ) -> None:
        """Resolve all dimensions and dataset types and check them for
        consistency.

        Resolving a graph also causes it to be sorted.

        Parameters
        ----------
        registry : `lsst.daf.butler.Registry`, optional
            Client for the data repository to resolve against. If not
            provided, both ``dimensions`` and ``dataset_types`` must be.
        dimensions : `lsst.daf.butler.DimensionUniverse`, optional
            Definitions for all dimensions.
        dataset_types : `~collections.abc.Mapping` [ `str`, \
                `~lsst.daf.butler.DatasetType` ], optional
            Mapping of dataset types to consider registered.

        Notes
        -----
        The `universe` attribute is set to ``dimensions`` and used to set all
        `TaskNode.dimensions` attributes. Dataset type nodes are resolved by
        first looking for a registry definition, then using the producing
        task's definition, then looking for consistency between all consuming
        task definitions.

        Raises
        ------
        ConnectionTypeConsistencyError
            Raised if a prerequisite input for one task appears as a different
            kind of connection in any other task.
        DuplicateOutputError
            Raised if multiple tasks have the same dataset type as an output.
        IncompatibleDatasetTypeError
            Raised if different tasks have different definitions of a dataset
            type. Different but compatible storage classes are permitted.
        MissingDatasetTypeError
            Raised if a dataset type definition is required to exist in the
            data repository but none was found. This should only occur for
            dataset types that are not produced by a task in the pipeline and
            are consumed with different storage classes or as components by
            tasks in the pipeline.
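
        Examples
        --------
        An illustrative sketch (``butler`` is an assumed
        `lsst.daf.butler.Butler` client)::

            graph.resolve(registry=butler.registry)
            assert graph.is_fully_resolved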

        """
        get_registered: Callable[[str], DatasetType | None]
        if registry is None:
            if dimensions is None or dataset_types is None:
                raise PipelineGraphError(
                    "Either 'registry' or both 'dimensions' and 'dataset_types' "
                    "must be passed to PipelineGraph.resolve."
                )

        else:
            if dimensions is None:
                dimensions = registry.dimensions

            def get_registered(name: str) -> DatasetType | None:
                try:
                    return registry.getDatasetType(name)
                except MissingDatasetTypeError:
                    return None

        if dataset_types is not None:
            # Ruff seems confused about whether this is used below; it is!
            get_registered = dataset_types.get
        node_key: NodeKey
        updates: dict[NodeKey, TaskNode | DatasetTypeNode] = {}
        for node_key, node_state in self._xgraph.nodes.items():
            match node_key.node_type:
                case NodeType.TASK:
                    task_node: TaskNode = node_state["instance"]
                    new_task_node = task_node._resolved(dimensions)
                    if new_task_node is not task_node:
                        updates[node_key] = new_task_node
                case NodeType.DATASET_TYPE:
                    dataset_type_node: DatasetTypeNode | None = node_state["instance"]
                    new_dataset_type_node = DatasetTypeNode._from_edges(
                        node_key, self._xgraph, get_registered, dimensions, previous=dataset_type_node
                    )
                    # Usage of ``is`` here is intentional; `_from_edges`
                    # returns `previous=dataset_type_node` if it can determine
                    # that it doesn't need to change.
                    if new_dataset_type_node is not dataset_type_node:
                        updates[node_key] = new_dataset_type_node
        try:
            for node_key, node_value in updates.items():
                self._xgraph.nodes[node_key]["instance"] = node_value
        except Exception as err:  # pragma: no cover
            # There's no known way to get here, but we want to make it
            # clear it's a big problem if we do.
            raise PipelineGraphExceptionSafetyError(
                "Error during dataset type resolution has left the graph in an inconsistent state."
            ) from err
        self.sort()
        self._universe = dimensions

    ###########################################################################
    #
    # Graph Modification Interface:
    #
    # - methods to add, remove, and replace tasks;
    #
    # - methods to add and remove task subsets.
    #
    # These are all things that are usually done in a Pipeline before making a
    # graph at all, but there may be cases where we want to modify the graph
    # instead. (These are also the methods used to make a graph from a
    # Pipeline, or make a graph from another graph.)
    #
    ###########################################################################

    def add_task(
        self,
        label: str | None,
        task_class: type[PipelineTask],
        config: PipelineTaskConfig | None = None,
        connections: PipelineTaskConnections | None = None,
    ) -> TaskNode:
        """Add a new task to the graph.

        Parameters
        ----------
        label : `str` or `None`
            Label for the task in the pipeline. If `None`, `Task._DefaultName`
            is used.
        task_class : `type` [ `PipelineTask` ]
            Class object for the task.
        config : `PipelineTaskConfig`, optional
            Configuration for the task. If not provided, a default-constructed
            instance of ``task_class.ConfigClass`` is used.
        connections : `PipelineTaskConnections`, optional
            Object that describes the dataset types used by the task. If not
            provided, one will be constructed from the given configuration. If
            provided, it is assumed that ``config`` has already been validated
            and frozen.

        Returns
        -------
        node : `TaskNode`
            The new task node added to the graph.

        Raises
        ------
        ValueError
            Raised if configuration validation failed when constructing
            ``connections``.
        PipelineDataCycleError
            Raised if the graph is cyclic after this addition.
        RuntimeError
            Raised if an unexpected exception (which will be chained) occurred
            at a stage that may have left the graph in an inconsistent state.
            Other exceptions should leave the graph unchanged.

        Notes
        -----
        Checks for dataset type consistency and multiple producers do not occur
        until `resolve` is called, since the resolution depends on both the
        state of the data repository and all contributing tasks.

        Adding new tasks removes any existing resolutions of all dataset types
        it references and marks the graph as unsorted. It is most efficient
        to add all tasks up front and only then resolve and/or sort the graph.
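
        Examples
        --------
        An illustrative sketch (``MyTask`` is an assumed `PipelineTask`
        subclass, not part of this module)::

            graph = PipelineGraph()
            node = graph.add_task("myTask", MyTask)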

        """
        if label is None:
            label = task_class._DefaultName
        if config is None:
            config = task_class.ConfigClass()
        task_node = TaskNode._from_imported_data(
            key=NodeKey(NodeType.TASK, label),
            init_key=NodeKey(NodeType.TASK_INIT, label),
            data=_TaskNodeImportedData.configure(label, task_class, config, connections),
            universe=self.universe,
        )
        self.add_task_nodes([task_node])
        return task_node

    def add_task_nodes(self, nodes: Iterable[TaskNode], parent: PipelineGraph | None = None) -> None:
        """Add one or more existing task nodes to the graph.

        Parameters
        ----------
        nodes : `~collections.abc.Iterable` [ `TaskNode` ]
            Iterable of task nodes to add. If any tasks have resolved
            dimensions, they must have the same dimension universe as the rest
            of the graph.
        parent : `PipelineGraph`, optional
            If provided, another `PipelineGraph` from which these nodes were
            obtained. Any dataset type nodes already present in ``parent``
            that are referenced by the given tasks will be used in this graph
            if they are not already present, preserving any dataset type
            resolutions present in the parent graph. Adding nodes from a
            parent graph after the graph has its own nodes (e.g. from
            `add_task`) or nodes from a third graph may result in invalid
            dataset type resolutions. It is safest to only use this argument
            when populating an empty graph for the first time.

        Raises
        ------
        PipelineDataCycleError
            Raised if the graph is cyclic after this addition.

        Notes
        -----
        Checks for dataset type consistency and multiple producers do not occur
        until `resolve` is called, since the resolution depends on both the
        state of the data repository and all contributing tasks.

        Adding new tasks removes any existing resolutions of all dataset types
        it references (unless ``parent is not None``) and marks the graph as
        unsorted. It is most efficient to add all tasks up front and only then
        resolve and/or sort the graph.
        """
        node_data: list[tuple[NodeKey, dict[str, Any]]] = []
        edge_data: list[tuple[NodeKey, NodeKey, str, dict[str, Any]]] = []
        for task_node in nodes:
            task_node = task_node._resolved(self._universe)
            node_data.append(
                (task_node.key, {"instance": task_node, "bipartite": task_node.key.node_type.bipartite})
            )
            node_data.append(
                (
                    task_node.init.key,
                    {"instance": task_node.init, "bipartite": task_node.init.key.node_type.bipartite},
                )
            )
            # Convert the edge objects attached to the task node to networkx.
            for read_edge in task_node.init.iter_all_inputs():
                self._append_graph_data_from_edge(node_data, edge_data, read_edge, parent=parent)
            for write_edge in task_node.init.iter_all_outputs():
                self._append_graph_data_from_edge(node_data, edge_data, write_edge, parent=parent)
            for read_edge in task_node.iter_all_inputs():
                self._append_graph_data_from_edge(node_data, edge_data, read_edge, parent=parent)
            for write_edge in task_node.iter_all_outputs():
                self._append_graph_data_from_edge(node_data, edge_data, write_edge, parent=parent)
            # Add a special edge (with no Edge instance) that connects the
            # TaskInitNode to the runtime TaskNode.
            edge_data.append((task_node.init.key, task_node.key, Edge.INIT_TO_TASK_NAME, {"instance": None}))
        if not node_data and not edge_data:
            return
        # Checks and preparation complete; time to start the actual
        # modification, during which it's hard to provide strong exception
        # safety. Start by resetting the sort ordering, if there is one.
        self._reset()
        try:
            self._xgraph.add_nodes_from(node_data)
            self._xgraph.add_edges_from(edge_data)
            if not networkx.algorithms.dag.is_directed_acyclic_graph(self._xgraph):
                cycle = networkx.find_cycle(self._xgraph)
                raise PipelineDataCycleError(f"Cycle detected while adding tasks: {cycle}.")
        except Exception:
            # First try to roll back our changes.
            try:
                self._xgraph.remove_edges_from(edge_data)
                self._xgraph.remove_nodes_from(key for key, _ in node_data)
            except Exception as err:  # pragma: no cover
                # There's no known way to get here, but we want to make it
                # clear it's a big problem if we do.
                raise PipelineGraphExceptionSafetyError(
                    "Error while attempting to revert PipelineGraph modification has left the graph in "
                    "an inconsistent state."
                ) from err
            # Successfully rolled back; raise the original exception.
            raise

    def reconfigure_tasks(
        self,
        *args: tuple[str, PipelineTaskConfig],
        check_edges_unchanged: bool = False,
        assume_edges_unchanged: bool = False,
        **kwargs: PipelineTaskConfig,
    ) -> None:
        """Update the configuration for one or more tasks.

        Parameters
        ----------
        *args : `tuple` [ `str`, `.PipelineTaskConfig` ]
            Positional arguments are each a 2-tuple of task label and new
            config object. Note that the same arguments may also be passed as
            ``**kwargs``, which is usually more readable, but task labels in
            ``*args`` are not required to be valid Python identifiers.
        check_edges_unchanged : `bool`, optional
            If `True`, require the edges (connections) of the modified tasks to
            remain unchanged after the configuration updates, and verify that
            this is the case.
        assume_edges_unchanged : `bool`, optional
            If `True`, the caller declares that the edges (connections) of the
            modified tasks will remain unchanged after the configuration
            updates, and that it is unnecessary to check this.
        **kwargs : `.PipelineTaskConfig`
            New config objects or overrides to apply to copies of the current
            config objects, with task labels as the keywords.

        Raises
        ------
        ValueError
            Raised if ``assume_edges_unchanged`` and ``check_edges_unchanged``
            are both `True`, or if the same task appears twice.
        EdgesChangedError
            Raised if ``check_edges_unchanged=True`` and the edges of a task do
            change.

        Notes
        -----
        If reconfiguring a task causes its edges to change, any dataset type
        nodes connected to that task (not just those whose edges have changed!)
        will be unresolved.

820 will be unresolved. 

821 """ 

822 new_configs: dict[str, PipelineTaskConfig] = {} 

823 for task_label, config_update in itertools.chain(args, kwargs.items()): 

824 if new_configs.setdefault(task_label, config_update) is not config_update: 

825 raise ValueError(f"Config for {task_label!r} provided more than once.") 

826 updates = { 

827 task_label: self.tasks[task_label]._reconfigured(config, rebuild=not assume_edges_unchanged) 

828 for task_label, config in new_configs.items() 

829 } 

830 self._replace_task_nodes( 

831 updates, 

832 check_edges_unchanged=check_edges_unchanged, 

833 assume_edges_unchanged=assume_edges_unchanged, 

834 message_header=( 

835 "Unexpected change in edges for task {task_label!r} from original config (A) to " 

836 "new configs (B):" 

837 ), 

838 ) 

    def remove_tasks(
        self, labels: Iterable[str], drop_from_subsets: bool = True
    ) -> list[tuple[TaskNode, set[str]]]:
        """Remove one or more tasks from the graph.

        Parameters
        ----------
        labels : `~collections.abc.Iterable` [ `str` ]
            Iterable of the labels of the tasks to remove.
        drop_from_subsets : `bool`, optional
            If `True`, drop each removed task from any subset in which it
            currently appears. If `False`, raise `PipelineGraphError` if any
            such subsets exist.

        Returns
        -------
        nodes_and_subsets : `list` [ `tuple` [ `TaskNode`, `set` [ `str` ] ] ]
            List of nodes removed and the labels of task subsets that
            referenced them.

        Raises
        ------
        PipelineGraphError
            Raised if ``drop_from_subsets`` is `False` and the task is still
            part of one or more subsets.

        Notes
        -----
        Removing a task will cause dataset type nodes with no other referencing
        tasks to be removed. Any other dataset type nodes referenced by a
        removed task will be reset to an "unresolved" state.
        """
        task_nodes_and_subsets = []
        dataset_types: set[NodeKey] = set()
        nodes_to_remove = set()
        for label in labels:
            task_node: TaskNode = self._xgraph.nodes[NodeKey(NodeType.TASK, label)]["instance"]
            # Find task subsets that reference this task.
            referencing_subsets = {
                subset_label
                for subset_label, task_subset in self.task_subsets.items()
                if label in task_subset
            }
            if not drop_from_subsets and referencing_subsets:
                raise PipelineGraphError(
                    f"Task {label!r} is still referenced by subset(s) {referencing_subsets}."
                )
            task_nodes_and_subsets.append((task_node, referencing_subsets))
            # Find dataset types referenced by this task.
            dataset_types.update(self._xgraph.predecessors(task_node.key))
            dataset_types.update(self._xgraph.successors(task_node.key))
            dataset_types.update(self._xgraph.predecessors(task_node.init.key))
            dataset_types.update(self._xgraph.successors(task_node.init.key))
            # Since there's an edge between the task and its init node, we'll
            # have added those two nodes here, too, and we don't want that.
            dataset_types.remove(task_node.init.key)
            dataset_types.remove(task_node.key)
            # Mark the task node and its init node for removal from the graph.
            nodes_to_remove.add(task_node.key)
            nodes_to_remove.add(task_node.init.key)
        # Process the referenced datasets to see which ones are orphaned and
        # need to be removed vs. just unresolved.
        nodes_to_unresolve = []
        for dataset_type_key in dataset_types:
            related_tasks = set()
            related_tasks.update(self._xgraph.predecessors(dataset_type_key))
            related_tasks.update(self._xgraph.successors(dataset_type_key))
            related_tasks.difference_update(nodes_to_remove)
            if not related_tasks:
                nodes_to_remove.add(dataset_type_key)
            else:
                nodes_to_unresolve.append(dataset_type_key)
        # Checks and preparation complete; time to start the actual
        # modification, during which it's hard to provide strong exception
        # safety. Start by resetting the sort ordering.
        self._reset()
        try:
            for dataset_type_key in nodes_to_unresolve:
                self._xgraph.nodes[dataset_type_key]["instance"] = None
            for task_node, referencing_subsets in task_nodes_and_subsets:
                for subset_label in referencing_subsets:
                    self._task_subsets[subset_label].remove(task_node.label)
            self._xgraph.remove_nodes_from(nodes_to_remove)
        except Exception as err:  # pragma: no cover
            # There's no known way to get here, but we want to make it
            # clear it's a big problem if we do.
            raise PipelineGraphExceptionSafetyError(
                "Error during task removal has left the graph in an inconsistent state."
            ) from err
        return task_nodes_and_subsets

    def add_task_subset(self, subset_label: str, task_labels: Iterable[str], description: str = "") -> None:
        """Add a label for a set of tasks that are already in the pipeline.

        Parameters
        ----------
        subset_label : `str`
            Label for this set of tasks.
        task_labels : `~collections.abc.Iterable` [ `str` ]
            Labels of the tasks to include in the set. All must already be
            included in the graph.
        description : `str`, optional
            String description to associate with this label.
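
        Examples
        --------
        An illustrative sketch (the task labels are assumptions)::

            graph.add_task_subset("step1", ["isr", "characterizeImage"])
            assert "isr" in graph.task_subsets["step1"]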

        """
        subset = TaskSubset(self._xgraph, subset_label, set(task_labels), description)
        self._task_subsets[subset_label] = subset

    def remove_task_subset(self, subset_label: str) -> None:
        """Remove a labeled set of tasks.

        Parameters
        ----------
        subset_label : `str`
            Label for this set of tasks.
        """
        del self._task_subsets[subset_label]

    ###########################################################################
    #
    # NetworkX Export Interface:
    #
    # - methods to export the PipelineGraph's content (or various subsets
    #   thereof) as NetworkX objects.
    #
    # These are particularly useful when writing tools to visualize the graph,
    # while providing options for which aspects of the graph (tasks, dataset
    # types, or both) to include, since all exported graphs have similar
    # attributes regardless of their structure.
    #
    ###########################################################################

    def make_xgraph(self) -> networkx.MultiDiGraph:
        """Export a networkx representation of the full pipeline graph,
        including both init and runtime edges.

        Returns
        -------
        xgraph : `networkx.MultiDiGraph`
            Directed acyclic graph with parallel edges.

        Notes
        -----
        The returned graph uses `NodeKey` instances for nodes. Parallel edges
        represent the same dataset type appearing in multiple connections for
        the same task, and are hence rare. The connection name is used as the
        edge key to disambiguate those parallel edges.

        Almost all edges connect dataset type nodes to task or task init nodes
        or vice versa, but there is also a special edge that connects each task
        init node to its runtime node. The existence of these edges makes the
        graph not quite bipartite, though its init-only and runtime-only
        subgraphs are bipartite.

        See `TaskNode`, `TaskInitNode`, `DatasetTypeNode`, `ReadEdge`, and
        `WriteEdge` for the descriptive node and edge attributes added.
        """
        return self._transform_xgraph_state(self._xgraph.copy(), skip_edges=False)

    def make_bipartite_xgraph(self, init: bool = False) -> networkx.MultiDiGraph:
        """Return a bipartite networkx representation of just the runtime or
        init-time pipeline graph.

        Parameters
        ----------
        init : `bool`, optional
            If `True` (`False` is default) return the graph of task
            initialization nodes and init input/output dataset types, instead
            of the graph of runtime task nodes and regular
            input/output/prerequisite dataset types.

        Returns
        -------
        xgraph : `networkx.MultiDiGraph`
            Directed acyclic graph with parallel edges.

        Notes
        -----
        The returned graph uses `NodeKey` instances for nodes. Parallel edges
        represent the same dataset type appearing in multiple connections for
        the same task, and are hence rare. The connection name is used as the
        edge key to disambiguate those parallel edges.

        This graph is bipartite because each dataset type node only has edges
        that connect it to a task [init] node, and vice versa.

        See `TaskNode`, `TaskInitNode`, `DatasetTypeNode`, `ReadEdge`, and
        `WriteEdge` for the descriptive node and edge attributes added.
        """
        return self._transform_xgraph_state(
            self._make_bipartite_xgraph_internal(init).copy(), skip_edges=False
        )

    def make_task_xgraph(self, init: bool = False) -> networkx.DiGraph:
        """Return a networkx representation of just the tasks in the pipeline.

        Parameters
        ----------
        init : `bool`, optional
            If `True` (`False` is default) return the graph of task
            initialization nodes, instead of the graph of runtime task nodes.

        Returns
        -------
        xgraph : `networkx.DiGraph`
            Directed acyclic graph with no parallel edges.

        Notes
        -----
        The returned graph uses `NodeKey` instances for nodes. The dataset
        types that link these tasks are not represented at all; edges have no
        attributes, and there are no parallel edges.

        See `TaskNode` and `TaskInitNode` for the descriptive node attributes
        added.
        """
        bipartite_xgraph = self._make_bipartite_xgraph_internal(init)
        task_keys = [
            key
            for key, bipartite in bipartite_xgraph.nodes(data="bipartite")
            if bipartite == NodeType.TASK.bipartite
        ]
        return self._transform_xgraph_state(
            networkx.algorithms.bipartite.projected_graph(networkx.DiGraph(bipartite_xgraph), task_keys),
            skip_edges=True,
        )

    def make_dataset_type_xgraph(self, init: bool = False) -> networkx.DiGraph:
        """Return a networkx representation of just the dataset types in the
        pipeline.

        Parameters
        ----------
        init : `bool`, optional
            If `True` (`False` is default) return the graph of init input and
            output dataset types, instead of the graph of runtime (input,
            output, prerequisite input) dataset types.

        Returns
        -------
        xgraph : `networkx.DiGraph`
            Directed acyclic graph with no parallel edges.

        Notes
        -----
        The returned graph uses `NodeKey` instances for nodes. The tasks that
        link these dataset types are not represented at all; edges have no
        attributes, and there are no parallel edges.

        See `DatasetTypeNode` for the descriptive node attributes added.
        """
        bipartite_xgraph = self._make_bipartite_xgraph_internal(init)
        dataset_type_keys = [
            key
            for key, bipartite in bipartite_xgraph.nodes(data="bipartite")
            if bipartite == NodeType.DATASET_TYPE.bipartite
        ]
        return self._transform_xgraph_state(
            networkx.algorithms.bipartite.projected_graph(
                networkx.DiGraph(bipartite_xgraph), dataset_type_keys
            ),
            skip_edges=True,
        )

    ###########################################################################
    #
    # Serialization Interface.
    #
    # Serialization of PipelineGraphs is currently experimental and may not be
    # retained in the future. All serialization methods are
    # underscore-prefixed to ensure nobody mistakes them for a stable interface
    # (let alone a stable file format).
    #
    ###########################################################################

    @classmethod
    def _read_stream(
        cls, stream: BinaryIO, import_mode: TaskImportMode = TaskImportMode.REQUIRE_CONSISTENT_EDGES
    ) -> PipelineGraph:
        """Read a serialized `PipelineGraph` from a file-like object.

        Parameters
        ----------
        stream : `BinaryIO`
            File-like object opened for binary reading, containing
            gzip-compressed JSON.
        import_mode : `TaskImportMode`, optional
            Whether to import tasks, and how to reconcile any differences
            between the imported task's connections and those that were
            persisted with the graph. Default is to check that they are the
            same.

        Returns
        -------
        graph : `PipelineGraph`
            Deserialized pipeline graph.

        Raises
        ------
        PipelineGraphReadError
            Raised if the serialized `PipelineGraph` is not self-consistent.
        EdgesChangedError
            Raised if ``import_mode`` is
            `TaskImportMode.REQUIRE_CONSISTENT_EDGES` and the edges of a task
            did change after import and reconfiguration.

        Notes
        -----
        `PipelineGraph` serialization is currently experimental and may be
        removed or significantly changed in the future, with no deprecation
        period.
        """
        from .io import SerializedPipelineGraph

        with gzip.open(stream, "rb") as uncompressed_stream:
            data = json.load(uncompressed_stream)
        serialized_graph = SerializedPipelineGraph.model_validate(data)
        return serialized_graph.deserialize(import_mode)

    @classmethod
    def _read_uri(
        cls,
        uri: ResourcePathExpression,
        import_mode: TaskImportMode = TaskImportMode.REQUIRE_CONSISTENT_EDGES,
    ) -> PipelineGraph:
        """Read a serialized `PipelineGraph` from a file at a URI.

        Parameters
        ----------
        uri : convertible to `lsst.resources.ResourcePath`
            URI to a gzip-compressed JSON file containing a serialized pipeline
            graph.
        import_mode : `TaskImportMode`, optional
            Whether to import tasks, and how to reconcile any differences
            between the imported task's connections and those that were
            persisted with the graph. Default is to check that they are the
            same.

        Returns
        -------
        graph : `PipelineGraph`
            Deserialized pipeline graph.

        Raises
        ------
        PipelineGraphReadError
            Raised if the serialized `PipelineGraph` is not self-consistent.
        EdgesChangedError
            Raised if ``import_mode`` is
            `TaskImportMode.REQUIRE_CONSISTENT_EDGES` and the edges of a task
            did change after import and reconfiguration.

        Notes
        -----
        `PipelineGraph` serialization is currently experimental and may be
        removed or significantly changed in the future, with no deprecation
        period.
        """
        uri = ResourcePath(uri)
        with uri.open("rb") as stream:
            return cls._read_stream(cast(BinaryIO, stream), import_mode=import_mode)

    def _write_stream(self, stream: BinaryIO) -> None:
        """Write the pipeline graph to a file-like object.

        Parameters
        ----------
        stream : `BinaryIO`
            File-like object opened for binary writing.

        Notes
        -----
        `PipelineGraph` serialization is currently experimental and may be
        removed or significantly changed in the future, with no deprecation
        period.

        The file format is gzipped JSON, and is intended to be human-readable,
        but it should not be considered a stable public interface for outside
        code, which should always use `PipelineGraph` methods (or at least the
        `io.SerializedPipelineGraph` class) to read these files.
        """
        from .io import SerializedPipelineGraph

        with gzip.open(stream, mode="wb") as compressed_stream:
            compressed_stream.write(
                SerializedPipelineGraph.serialize(self).model_dump_json(exclude_defaults=True).encode("utf-8")
            )

    def _write_uri(self, uri: ResourcePathExpression) -> None:
        """Write the pipeline graph to a file given a URI.

        Parameters
        ----------
        uri : convertible to `lsst.resources.ResourcePath`
            URI to write to. May have a ``.json.gz`` extension or no extension
            (in which case a ``.json.gz`` extension will be added).

        Notes
        -----
        `PipelineGraph` serialization is currently experimental and may be
        removed or significantly changed in the future, with no deprecation
        period.

        The file format is gzipped JSON, and is intended to be human-readable,
        but it should not be considered a stable public interface for outside
        code, which should always use `PipelineGraph` methods (or at least the
        `io.SerializedPipelineGraph` class) to read these files.
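
        Examples
        --------
        An illustrative round-trip sketch (the path is an assumption)::

            graph._write_uri("/tmp/pipeline.json.gz")
            roundtripped = PipelineGraph._read_uri("/tmp/pipeline.json.gz")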

        """
        uri = ResourcePath(uri)
        extension = uri.getExtension()
        if not extension:
            uri = uri.updatedExtension(".json.gz")
        elif extension != ".json.gz":
            raise ValueError("Expanded pipeline files should always have a .json.gz extension.")
        with uri.open(mode="wb") as stream:
            self._write_stream(cast(BinaryIO, stream))

    def _import_and_configure(
        self, import_mode: TaskImportMode = TaskImportMode.REQUIRE_CONSISTENT_EDGES
    ) -> None:
        """Import the `PipelineTask` classes referenced by all task nodes and
        update those nodes accordingly.

        Parameters
        ----------
        import_mode : `TaskImportMode`, optional
            Whether to import tasks, and how to reconcile any differences
            between the imported task's connections and those that were
            persisted with the graph. Default is to check that they are the
            same. This method does nothing if this is
            `TaskImportMode.DO_NOT_IMPORT`.

        Raises
        ------
        EdgesChangedError
            Raised if ``import_mode`` is
            `TaskImportMode.REQUIRE_CONSISTENT_EDGES` and the edges of a task
            did change after import and reconfiguration.

        Notes
        -----
        This method shouldn't need to be called unless the graph was
        deserialized without importing and configuring immediately, which is
        not the default behavior (but it can greatly speed up deserialization).
        If all tasks have already been imported this does nothing.

        Importing and configuring a task can change its
        `~TaskNode.task_class_name` or `~TaskNode.get_config_str` output,
        usually because the software used to read a serialized graph is newer
        than the software used to write it (e.g. a new config option has been
        added, or the task was moved to a new module with a forwarding alias
        left behind). These changes are allowed by
        `TaskImportMode.REQUIRE_CONSISTENT_EDGES`.

        If importing and configuring a task causes its edges to change, any
        dataset type nodes linked to those edges will be reset to the
        unresolved state.
        """
        if import_mode is TaskImportMode.DO_NOT_IMPORT:
            return
        rebuild = (
            import_mode is TaskImportMode.REQUIRE_CONSISTENT_EDGES
            or import_mode is TaskImportMode.OVERRIDE_EDGES
        )
        updates: dict[str, TaskNode] = {}
        node_key: NodeKey
        for node_key, node_state in self._xgraph.nodes.items():
            if node_key.node_type is NodeType.TASK:
                task_node: TaskNode = node_state["instance"]
                new_task_node = task_node._imported_and_configured(rebuild)
                if new_task_node is not task_node:
                    updates[task_node.label] = new_task_node
        self._replace_task_nodes(
            updates,
            check_edges_unchanged=(import_mode is TaskImportMode.REQUIRE_CONSISTENT_EDGES),
            assume_edges_unchanged=(import_mode is TaskImportMode.ASSUME_CONSISTENT_EDGES),
            message_header=(
                "In task with label {task_label!r}, persisted edges (A) "
                "differ from imported and configured edges (B):"
            ),
        )

    ###########################################################################
    #
    # Advanced PipelineGraph Inspection Interface:
    #
    # - methods to iterate over all nodes and edges, utilizing NodeKeys;
    #
    # - methods to find overall inputs and group nodes by their dimensions,
    #   which are important operations for QuantumGraph generation.
    #
    ###########################################################################

    def iter_edges(self, init: bool = False) -> Iterator[Edge]:
        """Iterate over edges in the graph.

        Parameters
        ----------
        init : `bool`, optional
            If `True` (`False` is default) iterate over the edges between task
            initialization nodes and init input/output dataset types, instead
            of the runtime task nodes and regular input/output/prerequisite
            dataset types.

        Returns
        -------
        edges : `~collections.abc.Iterator` [ `Edge` ]
            A lazy iterator over `Edge` (`WriteEdge` or `ReadEdge`) instances.

        Notes
        -----
        This method always returns *either* init edges or runtime edges, never
        both. The full (internal) graph that contains both also includes a
        special edge that connects each task init node to its runtime node;
        that is also never returned by this method, since it is never a part of
        the init-only or runtime-only subgraphs.
        """
        edge: Edge
        for _, _, edge in self._xgraph.edges(data="instance"):
            if edge is not None and edge.is_init == init:
                yield edge

    def iter_nodes(
        self,
    ) -> Iterator[
        tuple[Literal[NodeType.TASK_INIT], str, TaskInitNode]
        | tuple[Literal[NodeType.TASK], str, TaskNode]
        | tuple[Literal[NodeType.DATASET_TYPE], str, DatasetTypeNode | None]
    ]:
        """Iterate over nodes in the graph.

        Returns
        -------
        nodes : `~collections.abc.Iterator` [ `tuple` ]
            A lazy iterator over all of the nodes in the graph. Each yielded
            element is a tuple of:

            - the node type enum value (`NodeType`);
            - the string name for the node (task label or parent dataset type
              name);
            - the node value (`TaskNode`, `TaskInitNode`, `DatasetTypeNode`,
              or `None` for dataset type nodes that have not been resolved).
        """
        key: NodeKey
        if self._sorted_keys is not None:
            for key in self._sorted_keys:
                yield key.node_type, key.name, self._xgraph.nodes[key]["instance"]  # type: ignore
        else:
            for key, node in self._xgraph.nodes(data="instance"):
                yield key.node_type, key.name, node  # type: ignore

    def iter_overall_inputs(self) -> Iterator[tuple[str, DatasetTypeNode | None]]:
        """Iterate over all of the dataset types that are consumed but not
        produced by the graph.

        Returns
        -------
        dataset_types : `~collections.abc.Iterator` [ `tuple` ]
            A lazy iterator over the overall-input dataset types (including
            overall init inputs and prerequisites). Each yielded element is a
            tuple of:

            - the parent dataset type name;
            - the resolved `DatasetTypeNode`, or `None` if the dataset type
              has not been resolved.
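
        Examples
        --------
        An illustrative sketch (``graph`` is an assumed `PipelineGraph`)::

            for name, node in graph.iter_overall_inputs():
                print(f"overall input: {name}")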

        """
        for generation in networkx.algorithms.dag.topological_generations(self._xgraph):
            key: NodeKey
            for key in generation:
                # While we expect all tasks to have at least one input and
                # hence never appear in the first topological generation, that
                # is not true of task init nodes.
                if key.node_type is NodeType.DATASET_TYPE:
                    yield key.name, self._xgraph.nodes[key]["instance"]
            return

    def group_by_dimensions(
        self, prerequisites: bool = False
    ) -> dict[DimensionGroup, tuple[dict[str, TaskNode], dict[str, DatasetTypeNode]]]:
        """Group this graph's tasks and dataset types by their dimensions.

        Parameters
        ----------
        prerequisites : `bool`, optional
            If `True`, include prerequisite dataset types as well as regular
            input and output datasets (including intermediates).

        Returns
        -------
        groups : `dict` [ `DimensionGroup`, `tuple` ]
            A dictionary of groups keyed by `DimensionGroup`, in which each
            value is a tuple of:

            - a `dict` of `TaskNode` instances, keyed by task label;
            - a `dict` of `DatasetTypeNode` instances, keyed by dataset type
              name;

            that have those dimensions.

        Notes
        -----
        Init inputs and outputs are always included, but always have empty
        dimensions and hence are all grouped together.

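
        Examples
        --------
        A minimal sketch (``graph`` is a stand-in for a fully resolved
        `PipelineGraph`) that prints which task labels share each set of
        dimensions:

        >>> for dims, (tasks, _) in graph.group_by_dimensions().items():  # doctest: +SKIP
        ...     print(dims, sorted(tasks))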
1442 """ 

1443 result: dict[DimensionGroup, tuple[dict[str, TaskNode], dict[str, DatasetTypeNode]]] = {} 

1444 next_new_value: tuple[dict[str, TaskNode], dict[str, DatasetTypeNode]] = ({}, {}) 

1445 for task_label, task_node in self.tasks.items(): 

1446 if task_node.dimensions is None: 

1447 raise UnresolvedGraphError(f"Task with label {task_label!r} has not been resolved.") 

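            # `setdefault` returns the existing group when there is one; if
            # it returns the `next_new_value` sentinel instead, that pair was
            # just inserted into `result` and fresh empty dicts must be
            # prepared for the next insertion.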
            if (group := result.setdefault(task_node.dimensions, next_new_value)) is next_new_value:
                next_new_value = ({}, {})  # make new dicts for next time
            group[0][task_node.label] = task_node
        for dataset_type_name, dataset_type_node in self.dataset_types.items():
            if dataset_type_node is None:
                raise UnresolvedGraphError(f"Dataset type {dataset_type_name!r} has not been resolved.")
            if not dataset_type_node.is_prerequisite or prerequisites:
                if (
                    group := result.setdefault(
                        dataset_type_node.dataset_type.dimensions.as_group(), next_new_value
                    )
                ) is next_new_value:
                    next_new_value = ({}, {})  # make new dicts for next time
                group[1][dataset_type_node.name] = dataset_type_node
        return result

    def split_independent(self) -> Iterable[PipelineGraph]:
        """Iterate over independent subgraphs that together comprise this
        pipeline graph.

        Returns
        -------
        subgraphs : `Iterable` [ `PipelineGraph` ]
            An iterable over component subgraphs that could be run
            independently (they have only overall inputs in common). May be a
            lazy iterator.

        Notes
        -----
        All resolved dataset type nodes will be preserved.

        If there is only one component, ``self`` may be returned as the only
        element in the iterable.

        If `has_been_sorted`, all subgraphs will be sorted as well.
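
        Examples
        --------
        A minimal sketch (``graph`` is a stand-in for an existing
        `PipelineGraph`; ``process`` is a hypothetical callable, not part of
        this package) that handles each independent component separately:

        >>> for subgraph in graph.split_independent():  # doctest: +SKIP
        ...     process(subgraph)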
1483 """ 

1484 # Having an overall input in common isn't enough to make subgraphs 

1485 # dependent on each other, so we want to look for connected component 

1486 # subgraphs of the task-only projected graph. 

1487 bipartite_xgraph = self._make_bipartite_xgraph_internal(init=False) 

1488 task_keys = { 

1489 key 

1490 for key, bipartite in bipartite_xgraph.nodes(data="bipartite") 

1491 if bipartite == NodeType.TASK.bipartite 

1492 } 

1493 task_xgraph = networkx.algorithms.bipartite.projected_graph( 

1494 networkx.DiGraph(bipartite_xgraph), task_keys 

1495 ) 

1496 # "Weakly" connected means connected in only one direction, which is 

1497 # the only kind of "connected" a DAG can ever be. 

1498 for component_task_keys in networkx.algorithms.weakly_connected_components(task_xgraph): 

1499 if component_task_keys == task_keys: 

1500 yield self 

1501 return 

1502 else: 

1503 component_subgraph = PipelineGraph(universe=self._universe) 

1504 component_subgraph.add_task_nodes( 

1505 [self._xgraph.nodes[key]["instance"] for key in component_task_keys], parent=self 

1506 ) 

1507 if self.has_been_sorted: 

1508 component_subgraph.sort() 

1509 yield component_subgraph 

1510 

    ###########################################################################
    #
    # Class- and Package-Private Methods.
    #
    ###########################################################################

    def _iter_task_defs(self) -> Iterator[TaskDef]:
        """Iterate over this pipeline as a sequence of `TaskDef` instances.

        Notes
        -----
        This is a package-private method intended to aid in the transition to
        a codebase more fully integrated with the `PipelineGraph` class, in
        which both `TaskDef` and `PipelineDatasetTypes` are expected to go
        away, and much of the functionality on the `Pipeline` class will be
        moved to `PipelineGraph` as well.

        Raises
        ------
        TaskNotImportedError
            Raised if `TaskNode.is_imported` is `False` for any task.
        """
        from ..pipeline import TaskDef

        for node in self._tasks.values():
            yield TaskDef(
                config=node.config,
                taskClass=node.task_class,
                label=node.label,
                connections=node.get_connections(),
            )

    def _init_from_args(
        self,
        xgraph: networkx.MultiDiGraph | None,
        sorted_keys: Sequence[NodeKey] | None,
        task_subsets: dict[str, TaskSubset] | None,
        description: str,
        universe: DimensionUniverse | None,
        data_id: DataId | None,
    ) -> None:
        """Initialize the graph with possibly-nontrivial arguments.

        Parameters
        ----------
        xgraph : `networkx.MultiDiGraph` or `None`
            The backing networkx graph, or `None` to create an empty one.
            This graph has `NodeKey` instances for nodes and the same
            structure as the graph exported by `make_xgraph`, but its nodes
            and edges have a single ``instance`` attribute that holds a
            `TaskNode`, `TaskInitNode`, `DatasetTypeNode` (or `None`),
            `ReadEdge`, or `WriteEdge` instance.
        sorted_keys : `Sequence` [ `NodeKey` ] or `None`
            Topologically sorted sequence of node keys, or `None` if the
            graph is not sorted.
        task_subsets : `dict` [ `str`, `TaskSubset` ] or `None`
            Labeled subsets of tasks. Values must be constructed with
            ``xgraph`` as their parent graph.
        description : `str`
            String description for this pipeline.
        universe : `lsst.daf.butler.DimensionUniverse` or `None`
            Definitions of all dimensions.
        data_id : `lsst.daf.butler.DataCoordinate` or other data ID mapping
            Data ID that represents a constraint on all quanta generated from
            this pipeline.

        Notes
        -----
        Only empty `PipelineGraph` instances should be constructed directly
        by users, which is what sets the signature of ``__init__`` itself,
        but methods on `PipelineGraph` and its helper classes need to be able
        to create them with state. Those methods can call this after calling
        ``__new__`` manually, skipping ``__init__``.
        """
        self._xgraph = xgraph if xgraph is not None else networkx.MultiDiGraph()
        self._sorted_keys: Sequence[NodeKey] | None = None
        self._task_subsets = task_subsets if task_subsets is not None else {}
        self._description = description
        self._tasks = TaskMappingView(self._xgraph)
        self._dataset_types = DatasetTypeMappingView(self._xgraph)
        self._raw_data_id: dict[str, Any]
        if isinstance(data_id, DataCoordinate):
            if universe is None:
                universe = data_id.universe
            else:
                assert universe is data_id.universe, "data_id.universe and given universe differ"
            self._raw_data_id = dict(data_id.required)
        elif data_id is None:
            self._raw_data_id = {}
        else:
            self._raw_data_id = dict(data_id)
        self._universe = universe
        if sorted_keys is not None:
            self._reorder(sorted_keys)

    def _make_bipartite_xgraph_internal(self, init: bool) -> networkx.MultiDiGraph:
        """Make a bipartite init-only or runtime-only internal subgraph.

        See `make_bipartite_xgraph` for parameters and return values.

        Notes
        -----
        This method returns a view of the `PipelineGraph` object's internal
        backing graph, and hence should only be called in methods that copy
        the result either explicitly or by running a copying algorithm before
        returning it to the user.
        """
        return self._xgraph.edge_subgraph([edge.key for edge in self.iter_edges(init)])

    def _transform_xgraph_state(self, xgraph: _G, skip_edges: bool) -> _G:
        """Transform networkx graph attributes in-place from the internal
        "instance" attributes to the documented exported attributes.

        Parameters
        ----------
        xgraph : `networkx.DiGraph` or `networkx.MultiDiGraph`
            Graph whose state should be transformed.
        skip_edges : `bool`
            If `True`, do not transform edge state.

        Returns
        -------
        xgraph : `networkx.DiGraph` or `networkx.MultiDiGraph`
            The same object passed in, after modification.

        Notes
        -----
        This should be called after making a copy of the internal graph but
        before any projection down to just task or dataset type nodes, since
        it assumes stateful edges.
        """
        state: dict[str, Any]
        for state in xgraph.nodes.values():
            node_value: TaskInitNode | TaskNode | DatasetTypeNode | None = state.pop("instance")
            if node_value is not None:
                state.update(node_value._to_xgraph_state())
            else:
                # This is a dataset type node that is not resolved.
                state["bipartite"] = NodeType.DATASET_TYPE.bipartite
        if not skip_edges:
            for _, _, state in xgraph.edges(data=True):
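                # Edges without an `Edge` instance (the special edges that
                # connect task init nodes to their runtime nodes) are left
                # unchanged.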
                edge: Edge | None = state.pop("instance", None)
                if edge is not None:
                    state.update(edge._to_xgraph_state())
        return xgraph

    def _replace_task_nodes(
        self,
        updates: Mapping[str, TaskNode],
        check_edges_unchanged: bool,
        assume_edges_unchanged: bool,
        message_header: str,
    ) -> None:
        """Replace task nodes and update edges and dataset type nodes
        accordingly.

        Parameters
        ----------
        updates : `Mapping` [ `str`, `TaskNode` ]
            New task nodes with task label keys. All keys must be task labels
            that are already present in the graph.
        check_edges_unchanged : `bool`, optional
            If `True`, require the edges (connections) of the modified tasks
            to remain unchanged after importing and configuring each task,
            and verify that this is the case.
        assume_edges_unchanged : `bool`, optional
            If `True`, the caller declares that the edges (connections) of
            the modified tasks will remain unchanged after importing and
            configuring each task, and that it is unnecessary to check this.
        message_header : `str`
            Template for `str.format` with a single ``task_label``
            placeholder to use as the first line in `EdgesChangedError`
            messages that show the differences between new task edges and old
            task edges. Should include the fact that the rest of the message
            will refer to the old task as "A" and the new task as "B", and
            end with a colon.

        Raises
        ------
        ValueError
            Raised if ``assume_edges_unchanged`` and ``check_edges_unchanged``
            are both `True`, or if a full config is provided for a task after
            another full config or an override has already been provided.
        EdgesChangedError
            Raised if ``check_edges_unchanged=True`` and the edges of a task
            do change.
        """
        deep: dict[str, TaskNode] = {}
        shallow: dict[str, TaskNode] = {}
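        # Tasks whose edges may have changed must be removed and re-added
        # ("deep" replacement), while tasks whose edges are known to be
        # unchanged can have their node instances swapped in place
        # ("shallow" replacement).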
        if assume_edges_unchanged:
            if check_edges_unchanged:
                raise ValueError("Cannot simultaneously assume and check that edges have not changed.")
            shallow.update(updates)
        else:
            for task_label, new_task_node in updates.items():
                old_task_node = self.tasks[task_label]
                messages = old_task_node.diff_edges(new_task_node)
                if messages:
                    if check_edges_unchanged:
                        messages.insert(0, message_header.format(task_label=task_label))
                        raise EdgesChangedError("\n".join(messages))
                    else:
                        deep[task_label] = new_task_node
                else:
                    shallow[task_label] = new_task_node
        try:
            if deep:
                removed = self.remove_tasks(deep.keys(), drop_from_subsets=True)
                self.add_task_nodes(deep.values())
                for replaced_task_node, referencing_subsets in removed:
                    for subset_label in referencing_subsets:
                        self._task_subsets[subset_label].add(replaced_task_node.label)
            for task_node in shallow.values():
                self._xgraph.nodes[task_node.key]["instance"] = task_node
                self._xgraph.nodes[task_node.init.key]["instance"] = task_node.init
        except PipelineGraphExceptionSafetyError:  # pragma: no cover
            raise
        except Exception as err:  # pragma: no cover
            # There's no known way to get here, but we want to make it clear
            # it's a big problem if we do.
            raise PipelineGraphExceptionSafetyError(
                "Error while replacing tasks has left the graph in an inconsistent state."
            ) from err

    def _append_graph_data_from_edge(
        self,
        node_data: list[tuple[NodeKey, dict[str, Any]]],
        edge_data: list[tuple[NodeKey, NodeKey, str, dict[str, Any]]],
        edge: Edge,
        parent: PipelineGraph | None,
    ) -> None:
        """Append networkx state dictionaries for an edge and the
        corresponding dataset type node.

        Parameters
        ----------
        node_data : `list`
            List of node keys and state dictionaries. A node is appended if
            one does not already exist for this dataset type.
        edge_data : `list`
            List of node key pairs, connection names, and state dictionaries
            for edges.
        edge : `Edge`
            New edge being processed.
        parent : `PipelineGraph` or `None`
            Another pipeline graph whose dataset type nodes should be used
            when present.
        """
        new_dataset_type_node = None
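        # Prefer the parent graph's (possibly resolved) dataset type node
        # when the parent has one for this dataset type.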
        if parent is not None:
            new_dataset_type_node = parent._xgraph.nodes[edge.dataset_type_key].get("instance")
        if (existing_dataset_type_state := self._xgraph.nodes.get(edge.dataset_type_key)) is not None:
            existing_dataset_type_state["instance"] = new_dataset_type_node
        else:
            node_data.append(
                (
                    edge.dataset_type_key,
                    {
                        "instance": new_dataset_type_node,
                        "bipartite": NodeType.DATASET_TYPE.bipartite,
                    },
                )
            )
        edge_data.append(
            edge.nodes
            + (
                edge.connection_name,
                {"instance": edge},
            )
        )

    def _reorder(self, sorted_keys: Sequence[NodeKey]) -> None:
        """Set the order of all views of this graph from the given sorted
        sequence of task labels and dataset type names.
        """
        self._sorted_keys = sorted_keys
        self._tasks._reorder(sorted_keys)
        self._dataset_types._reorder(sorted_keys)

    def _reset(self) -> None:
        """Reset all views of this graph following a modification that
        might invalidate them.
        """
        self._sorted_keys = None
        self._tasks._reset()
        self._dataset_types._reset()

    _xgraph: networkx.MultiDiGraph
    _sorted_keys: Sequence[NodeKey] | None
    _task_subsets: dict[str, TaskSubset]
    _description: str
    _tasks: TaskMappingView
    _dataset_types: DatasetTypeMappingView
    _raw_data_id: dict[str, Any]
    _universe: DimensionUniverse | None