# This file is part of pipe_base.
#
# Developed for the LSST Data Management System.
# This product includes software developed by the LSST Project
# (http://www.lsst.org).
# See the COPYRIGHT file at the top-level directory of this distribution
# for details of code ownership.
#
# This software is dual licensed under the GNU General Public License and also
# under a 3-clause BSD license. Recipients may choose which of these licenses
# to use; please see the files gpl-3.0.txt and/or bsd_license.txt,
# respectively. If you choose the GPL option then the following text applies
# (but note that there is still no warranty even if you opt for BSD instead):
#
# This program is free software: you can redistribute it and/or modify
# it under the terms of the GNU General Public License as published by
# the Free Software Foundation, either version 3 of the License, or
# (at your option) any later version.
#
# This program is distributed in the hope that it will be useful,
# but WITHOUT ANY WARRANTY; without even the implied warranty of
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
# GNU General Public License for more details.
#
# You should have received a copy of the GNU General Public License
# along with this program. If not, see <http://www.gnu.org/licenses/>.
from __future__ import annotations

__all__ = ("Edge", "ReadEdge", "WriteEdge")

from abc import ABC, abstractmethod
from collections.abc import Callable, Mapping, Sequence
from typing import Any, ClassVar, Self, TypeVar

from lsst.daf.butler import DatasetRef, DatasetType, DimensionUniverse
from lsst.daf.butler.registry import MissingDatasetTypeError
from lsst.utils.classes import immutable

from ..connectionTypes import BaseConnection
from ._exceptions import ConnectionTypeConsistencyError, IncompatibleDatasetTypeError
from ._nodes import NodeKey, NodeType

_S = TypeVar("_S", bound="Edge")


@immutable
class Edge(ABC):
    """Base class for edges in a pipeline graph.

    This represents the link between a task node and an input or output
    dataset type.

    Parameters
    ----------
    task_key : `NodeKey`
        Key for the task node this edge is connected to.
    dataset_type_key : `NodeKey`
        Key for the dataset type node this edge is connected to.
    storage_class_name : `str`
        Name of the dataset type's storage class as seen by the task.
    connection_name : `str`
        Internal name for the connection as seen by the task.
    is_calibration : `bool`
        Whether this dataset type can be included in
        `~lsst.daf.butler.CollectionType.CALIBRATION` collections.
    raw_dimensions : `frozenset` [ `str` ]
        Raw dimensions from the connection definition.
    """

    def __init__(
        self,
        *,
        task_key: NodeKey,
        dataset_type_key: NodeKey,
        storage_class_name: str,
        connection_name: str,
        is_calibration: bool,
        raw_dimensions: frozenset[str],
    ):
        self.task_key = task_key
        self.dataset_type_key = dataset_type_key
        self.connection_name = connection_name
        self.storage_class_name = storage_class_name
        self.is_calibration = is_calibration
        self.raw_dimensions = raw_dimensions

    INIT_TO_TASK_NAME: ClassVar[str] = "INIT"
    """Edge key for the special edge that connects a task init node to the
    task node itself (for regular edges, this would be the connection name).
    """

    task_key: NodeKey
    """Task part of the key for this edge in networkx graphs."""

    dataset_type_key: NodeKey
    """Dataset type part of the key for this edge in networkx graphs."""

    connection_name: str
    """Name used by the task to refer to this dataset type."""

    storage_class_name: str
    """Storage class expected by this task.

    If `ReadEdge.component` is not `None`, this is the component storage
    class, not the parent storage class.
    """

    is_calibration: bool
    """Whether this dataset type can be included in
    `~lsst.daf.butler.CollectionType.CALIBRATION` collections.
    """

    raw_dimensions: frozenset[str]
    """Raw dimensions in the task declaration.

    This can only be used safely for partial comparisons: two edges with the
    same ``raw_dimensions`` (and the same parent dataset type name) always
    have the same resolved dimensions, but edges with different
    ``raw_dimensions`` may also have the same resolved dimensions.
    """

    @property
    def is_init(self) -> bool:
        """Whether this dataset is read or written when the task is
        constructed, not when it is run.
        """
        return self.task_key.node_type is NodeType.TASK_INIT

    @property
    def task_label(self) -> str:
        """Label of the task."""
        return str(self.task_key)

    @property
    def parent_dataset_type_name(self) -> str:
        """Name of the parent dataset type.

        All dataset type nodes in a pipeline graph are for parent dataset
        types; components are represented by additional `ReadEdge` state.
        """
        return str(self.dataset_type_key)

    @property
    @abstractmethod
    def nodes(self) -> tuple[NodeKey, NodeKey]:
        """The directed pair of `NodeKey` instances this edge connects.

        This tuple is ordered in the same direction as the pipeline flow:
        `task_key` precedes `dataset_type_key` for writes, and the reverse is
        true for reads.
        """
        raise NotImplementedError()

    @property
    def key(self) -> tuple[NodeKey, NodeKey, str]:
        """Ordered tuple of node keys and connection name that uniquely
        identifies this edge in a pipeline graph.
        """
        return self.nodes + (self.connection_name,)

    def __repr__(self) -> str:
        return f"{self.nodes[0]} -> {self.nodes[1]} ({self.connection_name})"

    @property
    def dataset_type_name(self) -> str:
        """Dataset type name seen by the task.

        This defaults to the parent dataset type name, which is appropriate
        for all writes and most reads.
        """
        return self.parent_dataset_type_name

    def diff(self: _S, other: _S, connection_type: str = "connection") -> list[str]:
        """Compare this edge to another one from a possibly-different
        configuration of the same task label.

        Parameters
        ----------
        other : `Edge`
            Another edge of the same type to compare to.
        connection_type : `str`
            Human-readable name of the connection type of this edge (e.g.
            "init input", "output") for use in returned messages.

        Returns
        -------
        differences : `list` [ `str` ]
            List of string messages describing differences between ``self``
            and ``other``. Will be empty if ``self == other`` or if the only
            difference is in the task label or connection name (which are not
            checked). Messages will use 'A' to refer to ``self`` and 'B' to
            refer to ``other``.
        """
        result = []
        if self.dataset_type_name != other.dataset_type_name:
            result.append(
                f"{connection_type.capitalize()} {self.task_label}.{self.connection_name} has dataset type "
                f"{self.dataset_type_name!r} in A, but {other.dataset_type_name!r} in B."
            )
        if self.storage_class_name != other.storage_class_name:
            result.append(
                f"{connection_type.capitalize()} {self.task_label}.{self.connection_name} has storage class "
                f"{self.storage_class_name!r} in A, but {other.storage_class_name!r} in B."
            )
        if self.raw_dimensions != other.raw_dimensions:
            result.append(
                f"{connection_type.capitalize()} {self.task_label}.{self.connection_name} has raw dimensions "
                f"{set(self.raw_dimensions)} in A, but {set(other.raw_dimensions)} in B "
                "(differences in raw dimensions may not lead to differences in resolved dimensions, "
                "but this cannot be checked without re-resolving the dataset type)."
            )
        if self.is_calibration != other.is_calibration:
            result.append(
                f"{connection_type.capitalize()} {self.task_label}.{self.connection_name} is marked as a "
                f"calibration {'in A but not in B' if self.is_calibration else 'in B but not in A'}."
            )
        return result
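
    # A minimal usage sketch of ``diff`` (``edge_a`` and ``edge_b`` are
    # hypothetical edges for the same connection, taken from two
    # configurations of the same task label):
    #
    #     for message in edge_a.diff(edge_b, "output"):
    #         print(message)  # 'A' refers to edge_a, 'B' to edge_b
    #
    # An empty result means the two configurations agree on this connection.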

    @abstractmethod
    def adapt_dataset_type(self, dataset_type: DatasetType) -> DatasetType:
        """Transform the graph's definition of a dataset type (parent, with
        the registry or producer's storage class) to the one seen by this
        task.

        Parameters
        ----------
        dataset_type : `~lsst.daf.butler.DatasetType`
            Graph's definition of dataset type.

        Returns
        -------
        out_dataset_type : `~lsst.daf.butler.DatasetType`
            Dataset type seen by this task.
        """
        raise NotImplementedError()

    @abstractmethod
    def adapt_dataset_ref(self, ref: DatasetRef) -> DatasetRef:
        """Transform the graph's definition of a dataset reference (parent
        dataset type, with the registry or producer's storage class) to the
        one seen by this task.

        Parameters
        ----------
        ref : `~lsst.daf.butler.DatasetRef`
            Graph's definition of the dataset reference.

        Returns
        -------
        out_dataset_ref : `~lsst.daf.butler.DatasetRef`
            Dataset reference seen by this task.
        """
        raise NotImplementedError()

    def _to_xgraph_state(self) -> dict[str, Any]:
        """Convert this edge's attributes into a dictionary suitable for use
        in exported networkx graphs.
        """
        return {
            "parent_dataset_type_name": self.parent_dataset_type_name,
            "storage_class_name": self.storage_class_name,
            "is_init": self.is_init,
        }
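
    # The mapping returned by ``_to_xgraph_state`` becomes this edge's
    # attribute dictionary in graphs exported via `PipelineGraph.make_xgraph`;
    # the `ReadEdge` and `WriteEdge` docstrings list the full attribute set
    # for each direction.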

    @classmethod
    def _unreduce(cls, kwargs: dict[str, Any]) -> Self:
        """Unpickle an `Edge` instance."""
        return cls(**kwargs)

    def __reduce__(self) -> tuple[Callable[[dict[str, Any]], Edge], tuple[dict[str, Any]]]:
        return (
            self._unreduce,
            (
                dict(
                    task_key=self.task_key,
                    dataset_type_key=self.dataset_type_key,
                    storage_class_name=self.storage_class_name,
                    connection_name=self.connection_name,
                    is_calibration=self.is_calibration,
                    raw_dimensions=self.raw_dimensions,
                ),
            ),
        )
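
    # Design note (inferred from the code here, not from upstream
    # documentation): pickling reconstructs the edge through the constructor
    # via `_unreduce` rather than restoring instance state directly, which
    # keeps unpickling compatible with the `@immutable` decorator's
    # restrictions on attribute assignment.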


class ReadEdge(Edge):
    """Representation of an input connection (including init-inputs and
    prerequisites) in a pipeline graph.

    Parameters
    ----------
    dataset_type_key : `NodeKey`
        Key for the dataset type node this edge is connected to. This should
        hold the parent dataset type name for component dataset types.
    task_key : `NodeKey`
        Key for the task node this edge is connected to.
    storage_class_name : `str`
        Name of the dataset type's storage class as seen by the task.
    connection_name : `str`
        Internal name for the connection as seen by the task.
    is_calibration : `bool`
        Whether this dataset type can be included in
        `~lsst.daf.butler.CollectionType.CALIBRATION` collections.
    raw_dimensions : `frozenset` [ `str` ]
        Raw dimensions from the connection definition.
    is_prerequisite : `bool`
        Whether this dataset must be present in the data repository prior to
        `QuantumGraph` generation.
    component : `str` or `None`
        Component of the dataset type requested by the task.
    defer_query_constraint : `bool`
        If `True`, by default do not include this dataset type's existence as
        a constraint on the initial data ID query in QuantumGraph generation.

    Notes
    -----
    When included in an exported `networkx` graph (e.g.
    `PipelineGraph.make_xgraph`), read edges set the following edge
    attributes:

    - ``parent_dataset_type_name``
    - ``storage_class_name``
    - ``is_init``
    - ``component``
    - ``is_prerequisite``

    As with `ReadEdge` instance attributes, these descriptions of dataset
    types are those specific to a task, and may differ from the graph's
    resolved dataset type; if `PipelineGraph.resolve` has not been called,
    there may not even be a consistent graph-wide definition of the dataset
    type.
    """

    def __init__(
        self,
        dataset_type_key: NodeKey,
        task_key: NodeKey,
        *,
        storage_class_name: str,
        connection_name: str,
        is_calibration: bool,
        raw_dimensions: frozenset[str],
        is_prerequisite: bool,
        component: str | None,
        defer_query_constraint: bool,
    ):
        super().__init__(
            task_key=task_key,
            dataset_type_key=dataset_type_key,
            storage_class_name=storage_class_name,
            connection_name=connection_name,
            raw_dimensions=raw_dimensions,
            is_calibration=is_calibration,
        )
        self.is_prerequisite = is_prerequisite
        self.component = component
        self.defer_query_constraint = defer_query_constraint

    component: str | None
    """Component to add to `parent_dataset_type_name` to form the dataset
    type name seen by this task.
    """

    is_prerequisite: bool
    """Whether this dataset must be present in the data repository prior to
    `QuantumGraph` generation.
    """

    defer_query_constraint: bool
    """If `True`, by default do not include this dataset type's existence as
    a constraint on the initial data ID query in QuantumGraph generation.
    """

    @property
    def nodes(self) -> tuple[NodeKey, NodeKey]:
        # Docstring inherited.
        return (self.dataset_type_key, self.task_key)

    @property
    def dataset_type_name(self) -> str:
        """Complete dataset type name, as seen by the task."""
        if self.component is not None:
            return f"{self.parent_dataset_type_name}.{self.component}"
        return self.parent_dataset_type_name
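
    # For example (an illustrative sketch, not tied to a concrete pipeline):
    # a connection declared with dataset type name "calexp.wcs" has
    # parent_dataset_type_name "calexp", component "wcs", and
    # dataset_type_name "calexp.wcs".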

    def diff(self: ReadEdge, other: ReadEdge, connection_type: str = "connection") -> list[str]:
        # Docstring inherited.
        result = super().diff(other, connection_type)
        if self.defer_query_constraint != other.defer_query_constraint:
            result.append(
                f"{connection_type.capitalize()} {self.connection_name!r} is marked as a deferred query "
                f"constraint {'in A but not in B' if self.defer_query_constraint else 'in B but not in A'}."
            )
        return result

    def adapt_dataset_type(self, dataset_type: DatasetType) -> DatasetType:
        # Docstring inherited.
        if self.component is not None:
            assert (
                self.storage_class_name == dataset_type.storageClass.allComponents()[self.component].name
            ), "components with storage class overrides are not supported"
            return dataset_type.makeComponentDatasetType(self.component)
        if self.storage_class_name != dataset_type.storageClass_name:
            return dataset_type.overrideStorageClass(self.storage_class_name)
        return dataset_type

    def adapt_dataset_ref(self, ref: DatasetRef) -> DatasetRef:
        # Docstring inherited.
        if self.component is not None:
            assert (
                self.storage_class_name == ref.datasetType.storageClass.allComponents()[self.component].name
            ), "components with storage class overrides are not supported"
            return ref.makeComponentRef(self.component)
        if self.storage_class_name != ref.datasetType.storageClass_name:
            return ref.overrideStorageClass(self.storage_class_name)
        return ref

    @classmethod
    def _from_connection_map(
        cls,
        task_key: NodeKey,
        connection_name: str,
        connection_map: Mapping[str, BaseConnection],
        is_prerequisite: bool = False,
    ) -> ReadEdge:
        """Construct a `ReadEdge` instance from a `.BaseConnection` object.

        Parameters
        ----------
        task_key : `NodeKey`
            Key for the associated task node or task init node.
        connection_name : `str`
            Internal name for the connection as seen by the task.
        connection_map : `Mapping` [ `str`, `.BaseConnection` ]
            Mapping of post-configuration connection objects to draw dataset
            type information from, keyed by connection name.
        is_prerequisite : `bool`, optional
            Whether this dataset must be present in the data repository prior
            to `QuantumGraph` generation.

        Returns
        -------
        edge : `ReadEdge`
            New edge instance.
        """
        connection = connection_map[connection_name]
        parent_dataset_type_name, component = DatasetType.splitDatasetTypeName(connection.name)
        return cls(
            dataset_type_key=NodeKey(NodeType.DATASET_TYPE, parent_dataset_type_name),
            task_key=task_key,
            component=component,
            storage_class_name=connection.storageClass,
            # InitInput connections don't have .isCalibration.
            is_calibration=getattr(connection, "isCalibration", False),
            is_prerequisite=is_prerequisite,
            connection_name=connection_name,
            # InitInput connections don't have a .dimensions because they
            # always have empty dimensions.
            raw_dimensions=frozenset(getattr(connection, "dimensions", frozenset())),
            # PrerequisiteInput and InitInput connections don't have a
            # .deferGraphConstraint, because they never constrain the initial
            # data ID query.
            defer_query_constraint=getattr(connection, "deferGraphConstraint", False),
        )

    def _resolve_dataset_type(
        self,
        *,
        current: DatasetType | None,
        is_initial_query_constraint: bool,
        is_prerequisite: bool | None,
        universe: DimensionUniverse,
        producer: str | None,
        consumers: Sequence[str],
        is_registered: bool,
    ) -> tuple[DatasetType, bool, bool]:
        """Participate in the construction of the `DatasetTypeNode` object
        associated with this edge.

        Parameters
        ----------
        current : `lsst.daf.butler.DatasetType` or `None`
            The current graph-wide `DatasetType`, or `None`. This will always
            be the registry's definition of the parent dataset type, if one
            exists. If not, it will be the dataset type definition from the
            task in the graph that writes it, if there is one. If there is no
            such task, this will be `None`.
        is_initial_query_constraint : `bool`
            Whether this dataset type is currently marked as a constraint on
            the initial data ID query in QuantumGraph generation.
        is_prerequisite : `bool` or `None`
            Whether this dataset type is marked as a prerequisite input in
            all edges processed so far. `None` if this is the first edge.
        universe : `lsst.daf.butler.DimensionUniverse`
            Object that holds all dimension definitions.
        producer : `str` or `None`
            The label of the task that produces this dataset type in the
            pipeline, or `None` if it is an overall input.
        consumers : `Sequence` [ `str` ]
            Labels for other consuming tasks that have already participated
            in this dataset type's resolution.
        is_registered : `bool`
            Whether a registration for this dataset type was found in the
            data repository.

        Returns
        -------
        dataset_type : `DatasetType`
            The updated graph-wide dataset type. If ``current`` was provided,
            this must be equal to it.
        is_initial_query_constraint : `bool`
            If `True`, this dataset type should be included as a constraint
            in the initial data ID query during QuantumGraph generation; this
            requires that ``is_initial_query_constraint`` also be `True` on
            input.
        is_prerequisite : `bool`
            Whether this dataset type is marked as a prerequisite input in
            this task and all other edges processed so far.

        Raises
        ------
        MissingDatasetTypeError
            Raised if ``current is None`` and this edge cannot define one on
            its own.
        IncompatibleDatasetTypeError
            Raised if ``current is not None`` and this edge's definition is
            not compatible with it.
        ConnectionTypeConsistencyError
            Raised if a prerequisite input for one task appears as a
            different kind of connection in any other task.
        """
        if "skypix" in self.raw_dimensions:
            if current is None:
                raise MissingDatasetTypeError(
                    f"DatasetType '{self.dataset_type_name}' referenced by "
                    f"{self.task_label!r} uses 'skypix' as a dimension "
                    f"placeholder, but has not been registered with the data repository. "
                    f"Note that reference catalog names are now used as the dataset "
                    f"type name instead of 'ref_cat'."
                )
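
            # "skypix" in the raw dimensions is a placeholder for a concrete
            # skypix dimension (e.g. "htm7") that only the registered dataset
            # type can supply, so compare just the non-skypix dimensions here
            # and take the full dimension set from the registry definition.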

            rest1 = set(universe.conform(self.raw_dimensions - {"skypix"}).names)
            rest2 = current.dimensions.names - current.dimensions.skypix.names
            if rest1 != rest2:
                raise IncompatibleDatasetTypeError(
                    f"Non-skypix dimensions for dataset type {self.dataset_type_name} declared in "
                    f"connections ({rest1}) are inconsistent with those in "
                    f"registry's version of this dataset ({rest2})."
                )
            dimensions = current.dimensions.as_group()
        else:
            dimensions = universe.conform(self.raw_dimensions)
        is_initial_query_constraint = is_initial_query_constraint and not self.defer_query_constraint
        if is_prerequisite is None:
            is_prerequisite = self.is_prerequisite
        elif is_prerequisite and not self.is_prerequisite:
            raise ConnectionTypeConsistencyError(
                f"Dataset type {self.parent_dataset_type_name!r} is a prerequisite input to {consumers}, "
                f"but it is not a prerequisite to {self.task_label!r}."
            )
        elif not is_prerequisite and self.is_prerequisite:
            if producer is not None:
                raise ConnectionTypeConsistencyError(
                    f"Dataset type {self.parent_dataset_type_name!r} is a prerequisite input to "
                    f"{self.task_label}, but it is produced by {producer!r}."
                )
            else:
                raise ConnectionTypeConsistencyError(
                    f"Dataset type {self.parent_dataset_type_name!r} is a prerequisite input to "
                    f"{self.task_label}, but it is a regular input to {consumers!r}."
                )

        def report_current_origin() -> str:
            if is_registered:
                return "data repository"
            elif producer is not None:
                return f"producing task {producer!r}"
            else:
                return f"consuming task(s) {consumers!r}"
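
        # Component inputs cannot define the parent dataset type on their
        # own: the parent definition must already exist (in the registry or
        # from a producing task), and component storage classes cannot be
        # overridden.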

        if self.component is not None:
            if current is None:
                raise MissingDatasetTypeError(
                    f"Dataset type {self.parent_dataset_type_name!r} is not registered and not produced by "
                    f"this pipeline, but it is used by task {self.task_label!r}, via component "
                    f"{self.component!r}. This pipeline cannot be resolved until the parent dataset type is "
                    "registered."
                )
            all_current_components = current.storageClass.allComponents()
            if self.component not in all_current_components:
                raise IncompatibleDatasetTypeError(
                    f"Dataset type {self.parent_dataset_type_name!r} has storage class "
                    f"{current.storageClass_name!r} (from {report_current_origin()}), "
                    f"which does not include component {self.component!r} "
                    f"as requested by task {self.task_label!r}."
                )
            if all_current_components[self.component].name != self.storage_class_name:
                raise IncompatibleDatasetTypeError(
                    f"Dataset type '{self.parent_dataset_type_name}.{self.component}' has storage class "
                    f"{all_current_components[self.component].name!r} "
                    f"(from {report_current_origin()}), which does not match "
                    f"{self.storage_class_name!r}, as requested by task {self.task_label!r}. "
                    "Note that storage class conversions of components are not supported."
                )
            return current, is_initial_query_constraint, is_prerequisite
        else:
            dataset_type = DatasetType(
                self.parent_dataset_type_name,
                dimensions,
                storageClass=self.storage_class_name,
                isCalibration=self.is_calibration,
            )
            if current is not None:
                if not is_registered and producer is None:
                    # Current definition comes from another consumer; we
                    # require the dataset types to be exactly equal (not just
                    # compatible), since neither connection should take
                    # precedence.
                    if dataset_type != current:
                        raise MissingDatasetTypeError(
                            f"Definitions differ for input dataset type {self.parent_dataset_type_name!r}; "
                            f"task {self.task_label!r} has {dataset_type}, but the definition "
                            f"from {report_current_origin()} is {current}. If the storage classes are "
                            "compatible but different, registering the dataset type in the data repository "
                            "in advance will avoid this error."
                        )
                elif not dataset_type.is_compatible_with(current):
                    raise IncompatibleDatasetTypeError(
                        f"Incompatible definition for input dataset type {self.parent_dataset_type_name!r}; "
                        f"task {self.task_label!r} has {dataset_type}, but the definition "
                        f"from {report_current_origin()} is {current}."
                    )
                return current, is_initial_query_constraint, is_prerequisite
            else:
                return dataset_type, is_initial_query_constraint, is_prerequisite

    def _to_xgraph_state(self) -> dict[str, Any]:
        # Docstring inherited.
        result = super()._to_xgraph_state()
        result["component"] = self.component
        result["is_prerequisite"] = self.is_prerequisite
        return result

    def __reduce__(self) -> tuple[Callable[[dict[str, Any]], Edge], tuple[dict[str, Any]]]:
        return (
            self._unreduce,
            (
                dict(
                    dataset_type_key=self.dataset_type_key,
                    task_key=self.task_key,
                    storage_class_name=self.storage_class_name,
                    connection_name=self.connection_name,
                    is_calibration=self.is_calibration,
                    raw_dimensions=self.raw_dimensions,
                    is_prerequisite=self.is_prerequisite,
                    component=self.component,
                    defer_query_constraint=self.defer_query_constraint,
                ),
            ),
        )


class WriteEdge(Edge):
    """Representation of an output connection (including init-outputs) in a
    pipeline graph.

    Notes
    -----
    When included in an exported `networkx` graph (e.g.
    `PipelineGraph.make_xgraph`), write edges set the following edge
    attributes:

    - ``parent_dataset_type_name``
    - ``storage_class_name``
    - ``is_init``

    As with `WriteEdge` instance attributes, these descriptions of dataset
    types are those specific to a task, and may differ from the graph's
    resolved dataset type; if `PipelineGraph.resolve` has not been called,
    there may not even be a consistent graph-wide definition of the dataset
    type.
    """

    @property
    def nodes(self) -> tuple[NodeKey, NodeKey]:
        # Docstring inherited.
        return (self.task_key, self.dataset_type_key)

    def adapt_dataset_type(self, dataset_type: DatasetType) -> DatasetType:
        # Docstring inherited.
        if self.storage_class_name != dataset_type.storageClass_name:
            return dataset_type.overrideStorageClass(self.storage_class_name)
        return dataset_type

    def adapt_dataset_ref(self, ref: DatasetRef) -> DatasetRef:
        # Docstring inherited.
        if self.storage_class_name != ref.datasetType.storageClass_name:
            return ref.overrideStorageClass(self.storage_class_name)
        return ref

    @classmethod
    def _from_connection_map(
        cls,
        task_key: NodeKey,
        connection_name: str,
        connection_map: Mapping[str, BaseConnection],
    ) -> WriteEdge:
        """Construct a `WriteEdge` instance from a `.BaseConnection` object.

        Parameters
        ----------
        task_key : `NodeKey`
            Key for the associated task node or task init node.
        connection_name : `str`
            Internal name for the connection as seen by the task.
        connection_map : `Mapping` [ `str`, `.BaseConnection` ]
            Mapping of post-configuration connection objects to draw dataset
            type information from, keyed by connection name.

        Returns
        -------
        edge : `WriteEdge`
            New edge instance.
        """
        connection = connection_map[connection_name]
        parent_dataset_type_name, component = DatasetType.splitDatasetTypeName(connection.name)
        if component is not None:
            raise ValueError(
                f"Illegal output component dataset {connection.name!r} in task {task_key.name!r}."
            )
        return cls(
            task_key=task_key,
            dataset_type_key=NodeKey(NodeType.DATASET_TYPE, parent_dataset_type_name),
            storage_class_name=connection.storageClass,
            connection_name=connection_name,
            # InitOutput connections don't have .isCalibration.
            is_calibration=getattr(connection, "isCalibration", False),
            # InitOutput connections don't have a .dimensions because they
            # always have empty dimensions.
            raw_dimensions=frozenset(getattr(connection, "dimensions", frozenset())),
        )

    def _resolve_dataset_type(self, current: DatasetType | None, universe: DimensionUniverse) -> DatasetType:
        """Participate in the construction of the `DatasetTypeNode` object
        associated with this edge.

        Parameters
        ----------
        current : `lsst.daf.butler.DatasetType` or `None`
            The current graph-wide `DatasetType`, or `None`. This will always
            be the registry's definition of the parent dataset type, if one
            exists.
        universe : `lsst.daf.butler.DimensionUniverse`
            Object that holds all dimension definitions.

        Returns
        -------
        dataset_type : `DatasetType`
            A dataset type compatible with this edge. If ``current`` was
            provided, this must be equal to it.

        Raises
        ------
        IncompatibleDatasetTypeError
            Raised if ``current is not None`` and this edge's definition is
            not compatible with it.
        """
        dimensions = universe.conform(self.raw_dimensions)
        dataset_type = DatasetType(
            self.parent_dataset_type_name,
            dimensions,
            storageClass=self.storage_class_name,
            isCalibration=self.is_calibration,
        )
        if current is not None:
            if not current.is_compatible_with(dataset_type):
                raise IncompatibleDatasetTypeError(
                    f"Incompatible definition for output dataset type {self.parent_dataset_type_name!r}: "
                    f"task {self.task_label!r} has {dataset_type}, but data repository has {current}."
                )
            return current
        else:
            return dataset_type
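
# A minimal end-to-end sketch of how these edges are typically reached.
# This is a hedged example: `Pipeline.to_graph`, `PipelineGraph.resolve`,
# `PipelineGraph.tasks`, and the `TaskNode.inputs`/`TaskNode.outputs`
# mappings are assumed from the rest of this package, and "isr" is a
# placeholder task label:
#
#     graph = pipeline.to_graph()
#     graph.resolve(butler.registry)
#     task_node = graph.tasks["isr"]
#     for read_edge in task_node.inputs.values():  # `ReadEdge` instances
#         print(read_edge, read_edge.dataset_type_name)
#     for write_edge in task_node.outputs.values():  # `WriteEdge` instances
#         print(write_edge, write_edge.parent_dataset_type_name)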