Coverage for python/lsst/pipe/base/pipeline.py: 21%

435 statements  

coverage.py v7.2.7, created at 2023-06-06 02:51 -0700

1# This file is part of pipe_base. 

2# 

3# Developed for the LSST Data Management System. 

4# This product includes software developed by the LSST Project 

5# (http://www.lsst.org). 

6# See the COPYRIGHT file at the top-level directory of this distribution 

7# for details of code ownership. 

8# 

9# This program is free software: you can redistribute it and/or modify 

10# it under the terms of the GNU General Public License as published by 

11# the Free Software Foundation, either version 3 of the License, or 

12# (at your option) any later version. 

13# 

14# This program is distributed in the hope that it will be useful, 

15# but WITHOUT ANY WARRANTY; without even the implied warranty of 

16# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the 

17# GNU General Public License for more details. 

18# 

19# You should have received a copy of the GNU General Public License 

20# along with this program. If not, see <http://www.gnu.org/licenses/>. 

21from __future__ import annotations 

22 

23"""Module defining Pipeline class and related methods. 

24""" 

25 

26__all__ = ["Pipeline", "TaskDef", "TaskDatasetTypes", "PipelineDatasetTypes", "LabelSpecifier"] 

27 

28import copy 

29import logging 

30import re 

31import urllib.parse 

32 

33# ------------------------------- 

34# Imports of standard modules -- 

35# ------------------------------- 

36from dataclasses import dataclass 

37from types import MappingProxyType 

38from typing import ( 

39 TYPE_CHECKING, 

40 AbstractSet, 

41 Callable, 

42 ClassVar, 

43 Dict, 

44 Generator, 

45 Iterable, 

46 Iterator, 

47 Mapping, 

48 Optional, 

49 Set, 

50 Tuple, 

51 Type, 

52 Union, 

53 cast, 

54) 

55 

56# ----------------------------- 

57# Imports for other modules -- 

58from lsst.daf.butler import DatasetType, NamedValueSet, Registry, SkyPixDimension 

59from lsst.resources import ResourcePath, ResourcePathExpression 

60from lsst.utils import doImportType 

61from lsst.utils.introspection import get_full_type_name 

62 

63from . import automatic_connection_constants as acc 

64from . import pipelineIR, pipeTools 

65from ._instrument import Instrument as PipeBaseInstrument 

66from ._task_metadata import TaskMetadata 

67from .config import PipelineTaskConfig 

68from .connections import iterConnections 

69from .connectionTypes import Input 

70from .pipelineTask import PipelineTask 

71from .task import _TASK_METADATA_TYPE 

72 

73if TYPE_CHECKING:  # Imports needed only for type annotations; may be circular. 73 ↛ 74 (line 73 didn't jump to line 74, because the condition on line 73 was never true)

74 from lsst.obs.base import Instrument 

75 from lsst.pex.config import Config 

76 

77# ---------------------------------- 

78# Local non-exported definitions -- 

79# ---------------------------------- 

80 

81_LOG = logging.getLogger(__name__) 

82 

83# ------------------------ 

84# Exported definitions -- 

85# ------------------------ 

86 

87 

88@dataclass 

89class LabelSpecifier: 

90 """A structure to specify a subset of labels to load 

91 

92 This structure may contain a set of labels to be used in subsetting a 

93 pipeline, or a beginning and end point. Beginning or end may be empty, 

94 in which case the range will be a half open interval. Unlike python 

95 iteration bounds, end bounds are *INCLUDED*. Note that range based 

96 selection is not well defined for pipelines that are not linear in nature, 

97 and correct behavior is not guaranteed, or may vary from run to run. 

98 """ 

99 

100 labels: Optional[Set[str]] = None 

101 begin: Optional[str] = None 

102 end: Optional[str] = None 

103 

104 def __post_init__(self) -> None: 

105 if self.labels is not None and (self.begin or self.end): 

106 raise ValueError( 

107 "This struct can only be initialized with a labels set or a begin (and/or) end specifier" 

108 ) 

109 
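A minimal usage sketch of `LabelSpecifier` follows; the task labels here ("isr", "calibrate") are placeholders for labels in a real pipeline.

    from lsst.pipe.base import LabelSpecifier

    explicit = LabelSpecifier(labels={"isr", "calibrate"})  # explicit set of labels
    ranged = LabelSpecifier(begin="isr", end="calibrate")   # inclusive range of labels
    open_ended = LabelSpecifier(begin="calibrate")          # half-open interval

    try:
        LabelSpecifier(labels={"isr"}, begin="isr")
    except ValueError:
        pass  # a labels set and begin/end bounds are mutually exclusive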

110 

111class TaskDef: 

112 """TaskDef is a collection of information about a task needed by Pipeline. 

113 

114 The information includes task name, configuration object and optional 

115 task class. This class is just a collection of attributes and it exposes 

116 all of them so that attributes could potentially be modified in place 

117 (e.g. if configuration needs extra overrides). 

118 

119 Attributes 

120 ---------- 

121 taskName : `str`, optional 

122 The fully-qualified `PipelineTask` class name. If not provided, 

123 ``taskClass`` must be. 

124 config : `lsst.pipe.base.config.PipelineTaskConfig`, optional 

125 Instance of the configuration class corresponding to this task class, 

126 usually with all overrides applied. This config will be frozen. If 

127 not provided, ``taskClass`` must be provided and 

128 ``taskClass.ConfigClass()`` will be used. 

129 taskClass : `type`, optional 

130 `PipelineTask` class object; if provided and ``taskName`` is as well, 

131 the caller guarantees that they are consistent. If not provided, 

132 ``taskName`` is used to import the type. 

133 label : `str`, optional 

134 Task label, usually a short string unique in a pipeline. If not 

135 provided, ``taskClass`` must be, and ``taskClass._DefaultName`` will 

136 be used. 

137 """ 

138 

139 def __init__( 

140 self, 

141 taskName: Optional[str] = None, 

142 config: Optional[PipelineTaskConfig] = None, 

143 taskClass: Optional[Type[PipelineTask]] = None, 

144 label: Optional[str] = None, 

145 ): 

146 if taskName is None: 

147 if taskClass is None: 

148 raise ValueError("At least one of `taskName` and `taskClass` must be provided.") 

149 taskName = get_full_type_name(taskClass) 

150 elif taskClass is None: 

151 taskClass = doImportType(taskName) 

152 if config is None: 

153 if taskClass is None: 

154 raise ValueError("`taskClass` must be provided if `config` is not.") 

155 config = taskClass.ConfigClass() 

156 if label is None: 

157 if taskClass is None: 

158 raise ValueError("`taskClass` must be provided if `label` is not.") 

159 label = taskClass._DefaultName 

160 self.taskName = taskName 

161 try: 

162 config.validate() 

163 except Exception: 

164 _LOG.error("Configuration validation failed for task %s (%s)", label, taskName) 

165 raise 

166 config.freeze() 

167 self.config = config 

168 self.taskClass = taskClass 

169 self.label = label 

170 self.connections = config.connections.ConnectionsClass(config=config) 

171 

172 @property 

173 def configDatasetName(self) -> str: 

174 """Name of a dataset type for configuration of this task (`str`)""" 

175 return acc.CONFIG_INIT_OUTPUT_TEMPLATE.format(label=self.label) 

176 

177 @property 

178 def metadataDatasetName(self) -> str: 

179 """Name of a dataset type for metadata of this task (`str`)""" 

180 return self.makeMetadataDatasetName(self.label) 

181 

182 @classmethod 

183 def makeMetadataDatasetName(cls, label: str) -> str: 

184 """Construct the name of the dataset type for metadata for a task. 

185 

186 Parameters 

187 ---------- 

188 label : `str` 

189 Label for the task within its pipeline. 

190 

191 Returns 

192 ------- 

193 name : `str` 

194 Name of the task's metadata dataset type. 

195 """ 

196 return acc.METADATA_OUTPUT_TEMPLATE.format(label=label) 

197 

198 @property 

199 def logOutputDatasetName(self) -> Optional[str]: 

200 """Name of a dataset type for log output from this task, `None` if 

201 logs are not to be saved (`str`) 

202 """ 

203 if cast(PipelineTaskConfig, self.config).saveLogOutput: 

204 return acc.LOG_OUTPUT_TEMPLATE.format(label=self.label) 

205 else: 

206 return None 

207 
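A hedged sketch of constructing a `TaskDef` and reading the dataset type names derived from its label. The class path ``mypkg.MyTask`` and label ``myTask`` are hypothetical placeholders; substitute a real, importable `PipelineTask`.

    from lsst.pipe.base import TaskDef

    # "mypkg.MyTask" is a hypothetical fully-qualified PipelineTask class name.
    task_def = TaskDef(taskName="mypkg.MyTask", label="myTask")
    print(task_def.configDatasetName)     # config init-output dataset type name
    print(task_def.metadataDatasetName)   # metadata output dataset type name
    print(task_def.logOutputDatasetName)  # None when config.saveLogOutput is False
    print(TaskDef.makeMetadataDatasetName("myTask"))  # same template, no TaskDef needed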

208 def __str__(self) -> str: 

209 rep = "TaskDef(" + self.taskName 

210 if self.label: 

211 rep += ", label=" + self.label 

212 rep += ")" 

213 return rep 

214 

215 def __eq__(self, other: object) -> bool: 

216 if not isinstance(other, TaskDef): 

217 return False 

218 # This does not consider equality of configs when determining equality 

219 # as config equality is a difficult thing to define. Should be updated 

220 # after DM-27847 

221 return self.taskClass == other.taskClass and self.label == other.label 

222 

223 def __hash__(self) -> int: 

224 return hash((self.taskClass, self.label)) 

225 

226 @classmethod 

227 def _unreduce(cls, taskName: str, config: PipelineTaskConfig, label: str) -> TaskDef: 

228 """Custom callable for unpickling. 

229 

230 All arguments are forwarded directly to the constructor; this 

231 trampoline is only needed because ``__reduce__`` callables can't be 

232 called with keyword arguments. 

233 """ 

234 return cls(taskName=taskName, config=config, label=label) 

235 

236 def __reduce__(self) -> Tuple[Callable[[str, PipelineTaskConfig, str], TaskDef], Tuple[str, Config, str]]: 

237 return (self._unreduce, (self.taskName, self.config, self.label)) 

238 

239 

240class Pipeline: 

241 """A `Pipeline` is a representation of a series of tasks to run, and the 

242 configuration for those tasks. 

243 

244 Parameters 

245 ---------- 

246 description : `str` 

247 A description of what this pipeline does. 

248 """ 

249 

250 def __init__(self, description: str): 

251 pipeline_dict = {"description": description, "tasks": {}} 

252 self._pipelineIR = pipelineIR.PipelineIR(pipeline_dict) 

253 

254 @classmethod 

255 def fromFile(cls, filename: str) -> Pipeline: 

256 """Load a pipeline defined in a pipeline yaml file. 

257 

258 Parameters 

259 ---------- 

260 filename: `str` 

261 A path that points to a pipeline defined in yaml format. This 

262 filename may also supply additional labels to be used in 

263 subsetting the loaded Pipeline. These labels are separated from 

264 the path by a \\#, and may be specified as a comma separated 

265 list, or a range denoted as beginning..end. Beginning or end may 

266 be empty, in which case the range will be a half open interval. 

267 Unlike python iteration bounds, end bounds are *INCLUDED*. Note 

268 that range based selection is not well defined for pipelines that 

269 are not linear in nature, and correct behavior is not guaranteed, 

270 or may vary from run to run. 

271 

272 Returns 

273 ------- 

274 pipeline: `Pipeline` 

275 The pipeline loaded from specified location with appropriate (if 

276 any) subsetting 

277 

278 Notes 

279 ----- 

280 This method attempts to prune any contracts that contain labels which 

281 are not in the declared subset of labels. This pruning is done using a 

282 string based matching due to the nature of contracts and may prune more 

283 than it should. 

284 """ 

285 return cls.from_uri(filename) 

286 

287 @classmethod 

288 def from_uri(cls, uri: ResourcePathExpression) -> Pipeline: 

289 """Load a pipeline defined in a pipeline yaml file at a location 

290 specified by a URI. 

291 

292 Parameters 

293 ---------- 

294 uri : convertible to `ResourcePath` 

295 If a string is supplied this should be a URI path that points to a 

296 pipeline defined in yaml format, either as a direct path to the 

297 yaml file, or as a directory containing a "pipeline.yaml" file (the 

298 form used by `write_to_uri` with ``expand=True``). This uri may 

299 also supply additional labels to be used in subsetting the loaded 

300 Pipeline. These labels are separated from the path by a \\#, and 

301 may be specified as a comma separated list, or a range denoted as 

302 beginning..end. Beginning or end may be empty, in which case the 

303 range will be a half open interval. Unlike python iteration bounds, 

304 end bounds are *INCLUDED*. Note that range based selection is not 

305 well defined for pipelines that are not linear in nature, and 

306 correct behavior is not guaranteed, or may vary from run to run. 

307 The same specifiers can be used with a `ResourcePath` object, by 

308 being the sole contents in the fragments attribute. 

309 

310 Returns 

311 ------- 

312 pipeline : `Pipeline` 

313 The pipeline loaded from specified location with appropriate (if 

314 any) subsetting 

315 

316 Notes 

317 ----- 

318 This method attempts to prune any contracts that contain labels which 

319 are not in the declared subset of labels. This pruning is done using a 

320 string based matching due to the nature of contracts and may prune more 

321 than it should. 

322 """ 

323 # Split up the uri and any labels that were supplied 

324 uri, label_specifier = cls._parse_file_specifier(uri) 

325 pipeline: Pipeline = cls.fromIR(pipelineIR.PipelineIR.from_uri(uri)) 

326 

327 # If there are labels supplied, only keep those 

328 if label_specifier is not None: 

329 pipeline = pipeline.subsetFromLabels(label_specifier) 

330 return pipeline 

331 
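A sketch of the label-subsetting syntax accepted by `from_uri`; the file path and labels are placeholders.

    from lsst.pipe.base import Pipeline

    whole = Pipeline.from_uri("/path/to/pipeline.yaml")                  # entire pipeline
    listed = Pipeline.from_uri("/path/to/pipeline.yaml#isr,calibrate")   # explicit labels
    ranged = Pipeline.from_uri("/path/to/pipeline.yaml#isr..calibrate")  # inclusive range
    tail = Pipeline.from_uri("/path/to/pipeline.yaml#calibrate..")       # half-open range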

332 def subsetFromLabels(self, labelSpecifier: LabelSpecifier) -> Pipeline: 

333 """Subset a pipeline to contain only labels specified in labelSpecifier 

334 

335 Parameters 

336 ---------- 

337 labelSpecifier : `labelSpecifier` 

338 Object containing labels that describes how to subset a pipeline. 

339 

340 Returns 

341 ------- 

342 pipeline : `Pipeline` 

343 A new pipeline object that is a subset of the old pipeline 

344 

345 Raises 

346 ------ 

347 ValueError 

348 Raised if there is an issue with specified labels 

349 

350 Notes 

351 ----- 

352 This method attempts to prune any contracts that contain labels which 

353 are not in the declared subset of labels. This pruning is done using a 

354 string based matching due to the nature of contracts and may prune more 

355 than it should. 

356 """ 

357 # Labels supplied as a set 

358 if labelSpecifier.labels: 

359 labelSet = labelSpecifier.labels 

360 # Labels supplied as a range, first create a list of all the labels 

361 # in the pipeline sorted according to task dependency. Then only 

362 # keep labels that lie between the supplied bounds 

363 else: 

364 # Create a copy of the pipeline to use when assessing the label 

365 # ordering. Use a dict for fast searching while preserving order. 

366 # Remove contracts so they do not fail in the expansion step. This 

367 # is needed because a user may only configure the tasks they intend 

368 # to run, which may cause some contracts to fail if they will later 

369 # be dropped 

370 pipeline = copy.deepcopy(self) 

371 pipeline._pipelineIR.contracts = [] 

372 labels = {taskdef.label: True for taskdef in pipeline.toExpandedPipeline()} 

373 

374 # Verify the bounds are in the labels 

375 if labelSpecifier.begin is not None: 

376 if labelSpecifier.begin not in labels: 

377 raise ValueError( 

378 f"Beginning of range subset, {labelSpecifier.begin}, not found in pipeline definition" 

379 ) 

380 if labelSpecifier.end is not None: 

381 if labelSpecifier.end not in labels: 

382 raise ValueError( 

383 f"End of range subset, {labelSpecifier.end}, not found in pipeline definition" 

384 ) 

385 

386 labelSet = set() 

387 for label in labels: 

388 if labelSpecifier.begin is not None: 

389 if label != labelSpecifier.begin: 

390 continue 

391 else: 

392 labelSpecifier.begin = None 

393 labelSet.add(label) 

394 if labelSpecifier.end is not None and label == labelSpecifier.end: 

395 break 

396 return Pipeline.fromIR(self._pipelineIR.subset_from_labels(labelSet)) 

397 

398 @staticmethod 

399 def _parse_file_specifier(uri: ResourcePathExpression) -> Tuple[ResourcePath, Optional[LabelSpecifier]]: 

400 """Split apart a URI and any possible label subsets.""" 

401 if isinstance(uri, str): 

402 # This is to support legacy pipelines during transition 

403 uri, num_replace = re.subn("[:](?!\\/\\/)", "#", uri) 

404 if num_replace: 

405 raise ValueError( 

406 f"The pipeline file {uri} seems to use the legacy :" 

407 " to separate labels, please use # instead." 

408 ) 

409 if uri.count("#") > 1: 

410 raise ValueError("Only one set of labels is allowed when specifying a pipeline to load") 

411 # Everything else can be converted directly to ResourcePath. 

412 uri = ResourcePath(uri) 

413 label_subset = uri.fragment or None 

414 

415 specifier: Optional[LabelSpecifier] 

416 if label_subset is not None: 

417 label_subset = urllib.parse.unquote(label_subset) 

418 args: Dict[str, Union[Set[str], str, None]] 

419 # labels supplied as a list 

420 if "," in label_subset: 

421 if ".." in label_subset: 

422 raise ValueError( 

423 "Can only specify a list of labels or a range when loading a Pipeline, not both" 

424 ) 

425 args = {"labels": set(label_subset.split(","))} 

426 # labels supplied as a range 

427 elif ".." in label_subset: 

428 # Try to de-structure the labelSubset, this will fail if more 

429 # than one range is specified 

430 begin, end, *rest = label_subset.split("..") 

431 if rest: 

432 raise ValueError("Only one range can be specified when loading a pipeline") 

433 args = {"begin": begin if begin else None, "end": end if end else None} 

434 # Assume anything else is a single label 

435 else: 

436 args = {"labels": {label_subset}} 

437 

438 # MyPy doesn't like how cavalier kwarg construction is with types. 

439 specifier = LabelSpecifier(**args) # type: ignore 

440 else: 

441 specifier = None 

442 

443 return uri, specifier 

444 

445 @classmethod 

446 def fromString(cls, pipeline_string: str) -> Pipeline: 

447 """Create a pipeline from string formatted as a pipeline document. 

448 

449 Parameters 

450 ---------- 

451 pipeline_string : `str` 

452 A string that is formatted like a pipeline document 

453 

454 Returns 

455 ------- 

456 pipeline: `Pipeline` 

457 """ 

458 pipeline = cls.fromIR(pipelineIR.PipelineIR.from_string(pipeline_string)) 

459 return pipeline 

460 
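A minimal sketch of building a `Pipeline` from an in-memory YAML document; the task class is a placeholder for any importable `PipelineTask`.

    from lsst.pipe.base import Pipeline

    pipeline_yaml = """
    description: A one-task example pipeline
    tasks:
      myTask:
        class: mypkg.MyTask
    """
    pipeline = Pipeline.fromString(pipeline_yaml)
    print(len(pipeline))  # number of task labels declared in the document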

461 @classmethod 

462 def fromIR(cls, deserialized_pipeline: pipelineIR.PipelineIR) -> Pipeline: 

463 """Create a pipeline from an already created `PipelineIR` object. 

464 

465 Parameters 

466 ---------- 

467 deserialized_pipeline: `PipelineIR` 

468 An already created pipeline intermediate representation object 

469 

470 Returns 

471 ------- 

472 pipeline: `Pipeline` 

473 """ 

474 pipeline = cls.__new__(cls) 

475 pipeline._pipelineIR = deserialized_pipeline 

476 return pipeline 

477 

478 @classmethod 

479 def fromPipeline(cls, pipeline: Pipeline) -> Pipeline: 

480 """Create a new pipeline by copying an already existing `Pipeline`. 

481 

482 Parameters 

483 ---------- 

484 pipeline: `Pipeline` 

485 An already created `Pipeline` object that should be copied. 

486 

487 Returns 

488 ------- 

489 pipeline: `Pipeline` 

490 """ 

491 return cls.fromIR(copy.deepcopy(pipeline._pipelineIR)) 

492 

493 def __str__(self) -> str: 

494 return str(self._pipelineIR) 

495 

496 def mergePipeline(self, pipeline: Pipeline) -> None: 

497 """Merge another in-memory `Pipeline` object into this one. 

498 

499 This merges another pipeline into this object, as if it were declared 

500 in the import block of the yaml definition of this pipeline. This 

501 modifies this pipeline in place. 

502 

503 Parameters 

504 ---------- 

505 pipeline : `Pipeline` 

506 The `Pipeline` object that is to be merged into this object. 

507 """ 

508 self._pipelineIR.merge_pipelines((pipeline._pipelineIR,)) 

509 
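A sketch of merging one in-memory pipeline into another; the task class names and labels are placeholders.

    from lsst.pipe.base import Pipeline

    base = Pipeline("base pipeline")
    base.addTask("mypkg.TaskA", "a")   # hypothetical task class and label
    extra = Pipeline("extra pipeline")
    extra.addTask("mypkg.TaskB", "b")
    base.mergePipeline(extra)          # modifies ``base`` in place
    print(len(base))                   # both task labels are now present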

510 def addLabelToSubset(self, subset: str, label: str) -> None: 

511 """Add a task label to the specified subset. 

512 

513 Parameters 

514 ---------- 

515 subset : `str` 

516 The labeled subset to modify 

517 label : `str` 

518 The task label to add to the specified subset. 

519 

520 Raises 

521 ------ 

522 ValueError 

523 Raised if the specified subset does not exist within the pipeline. 

524 Raised if the specified label does not exist within the pipeline. 

525 """ 

526 if label not in self._pipelineIR.tasks: 

527 raise ValueError(f"Label {label} does not appear within the pipeline") 

528 if subset not in self._pipelineIR.labeled_subsets: 

529 raise ValueError(f"Subset {subset} does not appear within the pipeline") 

530 self._pipelineIR.labeled_subsets[subset].subset.add(label) 

531 

532 def removeLabelFromSubset(self, subset: str, label: str) -> None: 

533 """Remove a task label from the specified subset. 

534 

535 Parameters 

536 ---------- 

537 subset : `str` 

538 The labeled subset to modify 

539 label : `str` 

540 The task label to remove from the specified subset. 

541 

542 Raises 

543 ------ 

544 ValueError 

545 Raised if the specified subset does not exist in the pipeline. 

546 Raised if the specified label does not exist within the specified 

547 subset. 

548 """ 

549 if subset not in self._pipelineIR.labeled_subsets: 

550 raise ValueError(f"Subset {subset} does not appear within the pipeline") 

551 if label not in self._pipelineIR.labeled_subsets[subset].subset: 

552 raise ValueError(f"Label {label} does not appear within the pipeline") 

553 self._pipelineIR.labeled_subsets[subset].subset.remove(label) 

554 

555 def findSubsetsWithLabel(self, label: str) -> set[str]: 

556 """Find any subsets which may contain the specified label. 

557 

558 This function returns the names of subsets which contain the specified 

559 label. May return an empty set if there are no subsets, or no subsets 

560 containing the specified label. 

561 

562 Parameters 

563 ---------- 

564 label : `str` 

565 The task label to use in membership check 

566 

567 Returns 

568 ------- 

569 subsets : `set` of `str` 

570 Returns a set (possibly empty) of subset names which contain the 

571 specified label. 

572 

573 Raises 

574 ------ 

575 ValueError 

576 Raised if the specified label does not exist within this pipeline. 

577 """ 

578 results = set() 

579 if label not in self._pipelineIR.tasks: 

580 raise ValueError(f"Label {label} does not appear within the pipeline") 

581 for subset in self._pipelineIR.labeled_subsets.values(): 

582 if label in subset.subset: 

583 results.add(subset.label) 

584 return results 

585 
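A sketch of inspecting and editing labeled subsets. It assumes the loaded pipeline already defines a subset named ``processCcd`` and tasks labeled ``isr`` and ``calibrate``; the path and all names are placeholders.

    from lsst.pipe.base import Pipeline

    pipeline = Pipeline.from_uri("/path/to/pipeline.yaml")
    pipeline.addLabelToSubset("processCcd", "calibrate")
    print(pipeline.findSubsetsWithLabel("calibrate"))  # e.g. {"processCcd"}
    pipeline.removeLabelFromSubset("processCcd", "isr")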

586 def addInstrument(self, instrument: Union[Instrument, str]) -> None: 

587 """Add an instrument to the pipeline, or replace an instrument that is 

588 already defined. 

589 

590 Parameters 

591 ---------- 

592 instrument : `~lsst.obs.base.Instrument` or `str` 

593 Either an instance of an `lsst.obs.base.Instrument` subclass or 

594 a string corresponding to the fully qualified name of such a 

595 subclass. 

596 """ 

597 if isinstance(instrument, str): 

598 pass 

599 else: 

600 # TODO: assume that this is a subclass of Instrument, no type 

601 # checking 

602 instrument = get_full_type_name(instrument) 

603 self._pipelineIR.instrument = instrument 

604 

605 def getInstrument(self) -> Optional[str]: 

606 """Get the instrument from the pipeline. 

607 

608 Returns 

609 ------- 

610 instrument : `str`, or None 

611 The fully qualified name of a `lsst.obs.base.Instrument` subclass, 

612 or None if the pipeline does not have an instrument. 

613 """ 

614 return self._pipelineIR.instrument 

615 
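A sketch of attaching an instrument to a pipeline; the class path is a hypothetical placeholder for a real `lsst.obs.base.Instrument` subclass.

    from lsst.pipe.base import Pipeline

    pipeline = Pipeline("instrument example")
    pipeline.addInstrument("lsst.obs.example.ExampleInstrument")  # hypothetical class path
    print(pipeline.getInstrument())  # the fully qualified name that was stored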

616 def addTask(self, task: Union[Type[PipelineTask], str], label: str) -> None: 

617 """Add a new task to the pipeline, or replace a task that is already 

618 associated with the supplied label. 

619 

620 Parameters 

621 ---------- 

622 task: `PipelineTask` or `str` 

623 Either a derived class object of a `PipelineTask` or a string 

624 corresponding to a fully qualified `PipelineTask` name. 

625 label: `str` 

626 A label that is used to identify the `PipelineTask` being added 

627 """ 

628 if isinstance(task, str): 

629 taskName = task 

630 elif issubclass(task, PipelineTask): 

631 taskName = get_full_type_name(task) 

632 else: 

633 raise ValueError( 

634 "task must be either a child class of PipelineTask or a string containing" 

635 " a fully qualified name to one" 

636 ) 

637 if not label: 

638 # in some cases (with command line-generated pipeline) tasks can 

639 # be defined without label which is not acceptable, use task 

640 # _DefaultName in that case 

641 if isinstance(task, str): 

642 task_class = doImportType(task) 

643 label = task_class._DefaultName 

644 self._pipelineIR.tasks[label] = pipelineIR.TaskIR(label, taskName) 

645 

646 def removeTask(self, label: str) -> None: 

647 """Remove a task from the pipeline. 

648 

649 Parameters 

650 ---------- 

651 label : `str` 

652 The label used to identify the task that is to be removed 

653 

654 Raises 

655 ------ 

656 KeyError 

657 If no task with that label exists in the pipeline 

658 

659 """ 

660 self._pipelineIR.tasks.pop(label) 

661 

662 def addConfigOverride(self, label: str, key: str, value: object) -> None: 

663 """Apply single config override. 

664 

665 Parameters 

666 ---------- 

667 label : `str` 

668 Label of the task. 

669 key: `str` 

670 Fully-qualified field name. 

671 value : object 

672 Value to be given to a field. 

673 """ 

674 self._addConfigImpl(label, pipelineIR.ConfigIR(rest={key: value})) 

675 

676 def addConfigFile(self, label: str, filename: str) -> None: 

677 """Add overrides from a specified file. 

678 

679 Parameters 

680 ---------- 

681 label : `str` 

682 The label used to identify the task associated with config to 

683 modify 

684 filename : `str` 

685 Path to the override file. 

686 """ 

687 self._addConfigImpl(label, pipelineIR.ConfigIR(file=[filename])) 

688 

689 def addConfigPython(self, label: str, pythonString: str) -> None: 

690 """Add Overrides by running a snippet of python code against a config. 

691 

692 Parameters 

693 ---------- 

694 label : `str` 

695 The label used to identify the task associated with config to 

696 modify. 

697 pythonString: `str` 

698 A string which is valid python code to be executed. This is done 

699 with config as the only local accessible value. 

700 """ 

701 self._addConfigImpl(label, pipelineIR.ConfigIR(python=pythonString)) 

702 

703 def _addConfigImpl(self, label: str, newConfig: pipelineIR.ConfigIR) -> None: 

704 if label == "parameters": 

705 self._pipelineIR.parameters.mapping.update(newConfig.rest) 

706 if newConfig.file: 

707 raise ValueError("Setting parameters section with config file is not supported") 

708 if newConfig.python: 

709 raise ValueError("Setting parameters section using python block is unsupported") 

710 return 

711 if label not in self._pipelineIR.tasks: 

712 raise LookupError(f"There are no tasks labeled '{label}' in the pipeline") 

713 self._pipelineIR.tasks[label].add_or_update_config(newConfig) 

714 
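A sketch of programmatic pipeline construction with the config-override helpers; the task class, field names, and paths are placeholders.

    from lsst.pipe.base import Pipeline

    pipeline = Pipeline("programmatic example")
    pipeline.addTask("mypkg.MyTask", "myTask")                    # hypothetical task
    pipeline.addConfigOverride("myTask", "someField", 42)         # single field override
    pipeline.addConfigFile("myTask", "/path/to/overrides.py")     # overrides from a file
    pipeline.addConfigPython("myTask", "config.someField = 43")   # python snippet override
    pipeline.addConfigOverride("parameters", "someParameter", 1)  # pipeline parameters section
    pipeline.write_to_uri("/tmp/example_pipeline.yaml")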

715 def write_to_uri(self, uri: ResourcePathExpression) -> None: 

716 """Write the pipeline to a file or directory. 

717 

718 Parameters 

719 ---------- 

720 uri : convertible to `ResourcePath` 

721 URI to write to; may have any scheme with `ResourcePath` write 

722 support or no scheme for a local file/directory. Should have a 

723 ``.yaml`` extension. 

724 """ 

725 self._pipelineIR.write_to_uri(uri) 

726 

727 def toExpandedPipeline(self) -> Generator[TaskDef, None, None]: 

728 """Returns a generator of TaskDefs which can be used to create quantum 

729 graphs. 

730 

731 Returns 

732 ------- 

733 generator : generator of `TaskDef` 

734 The generator returned will be the sorted iterator of tasks which 

735 are to be used in constructing a quantum graph. 

736 

737 Raises 

738 ------ 

739 NotImplementedError 

740 If a dataId is supplied in a config block. This is in place for 

741 future use 

742 """ 

743 taskDefs = [] 

744 for label in self._pipelineIR.tasks: 

745 taskDefs.append(self._buildTaskDef(label)) 

746 

747 # let's evaluate the contracts 

748 if self._pipelineIR.contracts is not None: 

749 label_to_config = {x.label: x.config for x in taskDefs} 

750 for contract in self._pipelineIR.contracts: 

751 # execute this on its own line so it can raise a good error 

752 # message if there were problems with the eval 

753 success = eval(contract.contract, None, label_to_config) 

754 if not success: 

755 extra_info = f": {contract.msg}" if contract.msg is not None else "" 

756 raise pipelineIR.ContractError( 

757 f"Contract(s) '{contract.contract}' were not satisfied{extra_info}" 

758 ) 

759 

760 taskDefs = sorted(taskDefs, key=lambda x: x.label) 

761 yield from pipeTools.orderPipeline(taskDefs) 

762 
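A sketch of expanding a pipeline into ordered `TaskDef` objects. Expansion imports each task class and applies config overrides, so the referenced tasks must be importable; the path is a placeholder.

    from lsst.pipe.base import Pipeline

    pipeline = Pipeline.from_uri("/path/to/pipeline.yaml")
    for task_def in pipeline.toExpandedPipeline():  # equivalently: for task_def in pipeline
        print(task_def.label, task_def.taskName)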

763 def _buildTaskDef(self, label: str) -> TaskDef: 

764 if (taskIR := self._pipelineIR.tasks.get(label)) is None: 

765 raise NameError(f"Label {label} does not appear in this pipeline") 

766 taskClass: Type[PipelineTask] = doImportType(taskIR.klass) 

767 taskName = get_full_type_name(taskClass) 

768 config = taskClass.ConfigClass() 

769 instrument: PipeBaseInstrument | None = None 

770 if (instrumentName := self._pipelineIR.instrument) is not None: 

771 instrument_cls: type = doImportType(instrumentName) 

772 instrument = instrument_cls() 

773 config.applyConfigOverrides( 

774 instrument, 

775 getattr(taskClass, "_DefaultName", ""), 

776 taskIR.config, 

777 self._pipelineIR.parameters, 

778 label, 

779 ) 

780 return TaskDef(taskName=taskName, config=config, taskClass=taskClass, label=label) 

781 

782 def __iter__(self) -> Generator[TaskDef, None, None]: 

783 return self.toExpandedPipeline() 

784 

785 def __getitem__(self, item: str) -> TaskDef: 

786 return self._buildTaskDef(item) 

787 

788 def __len__(self) -> int: 

789 return len(self._pipelineIR.tasks) 

790 

791 def __eq__(self, other: object) -> bool: 

792 if not isinstance(other, Pipeline): 

793 return False 

794 elif self._pipelineIR == other._pipelineIR: 

795 # Shortcut: if the IR is the same, the expanded pipeline must be 

796 # the same as well. But the converse is not true. 

797 return True 

798 else: 

799 self_expanded = {td.label: (td.taskClass,) for td in self} 

800 other_expanded = {td.label: (td.taskClass,) for td in other} 

801 if self_expanded != other_expanded: 

802 return False 

803 # After DM-27847, we should compare configuration here, or better, 

804 # delegated to TaskDef.__eq__ after making that compare configurations. 

805 raise NotImplementedError( 

806 "Pipelines cannot be compared because config instances cannot be compared; see DM-27847." 

807 ) 

808 

809 

810@dataclass(frozen=True) 

811class TaskDatasetTypes: 

812 """An immutable struct that extracts and classifies the dataset types used 

813 by a `PipelineTask` 

814 """ 

815 

816 initInputs: NamedValueSet[DatasetType] 

817 """Dataset types that are needed as inputs in order to construct this Task. 

818 

819 Task-level `initInputs` may be classified as either 

820 `~PipelineDatasetTypes.initInputs` or 

821 `~PipelineDatasetTypes.initIntermediates` at the Pipeline level. 

822 """ 

823 

824 initOutputs: NamedValueSet[DatasetType] 

825 """Dataset types that may be written after constructing this Task. 

826 

827 Task-level `initOutputs` may be classified as either 

828 `~PipelineDatasetTypes.initOutputs` or 

829 `~PipelineDatasetTypes.initIntermediates` at the Pipeline level. 

830 """ 

831 

832 inputs: NamedValueSet[DatasetType] 

833 """Dataset types that are regular inputs to this Task. 

834 

835 If an input dataset needed for a Quantum cannot be found in the input 

836 collection(s) or produced by another Task in the Pipeline, that Quantum 

837 (and all dependent Quanta) will not be produced. 

838 

839 Task-level `inputs` may be classified as either 

840 `~PipelineDatasetTypes.inputs` or `~PipelineDatasetTypes.intermediates` 

841 at the Pipeline level. 

842 """ 

843 

844 queryConstraints: NamedValueSet[DatasetType] 

845 """Regular inputs that should be used as constraints on the initial 

846 QuantumGraph generation data ID query, according to their tasks 

847 (`NamedValueSet`). 

848 """ 

849 

850 prerequisites: NamedValueSet[DatasetType] 

851 """Dataset types that are prerequisite inputs to this Task. 

852 

853 Prerequisite inputs must exist in the input collection(s) before the 

854 pipeline is run, but do not constrain the graph - if a prerequisite is 

855 missing for a Quantum, `PrerequisiteMissingError` is raised. 

856 

857 Prerequisite inputs are not resolved until the second stage of 

858 QuantumGraph generation. 

859 """ 

860 

861 outputs: NamedValueSet[DatasetType] 

862 """Dataset types that are produced by this Task. 

863 

864 Task-level `outputs` may be classified as either 

865 `~PipelineDatasetTypes.outputs` or `~PipelineDatasetTypes.intermediates` 

866 at the Pipeline level. 

867 """ 

868 

869 @classmethod 

870 def fromTaskDef( 

871 cls, 

872 taskDef: TaskDef, 

873 *, 

874 registry: Registry, 

875 include_configs: bool = True, 

876 storage_class_mapping: Optional[Mapping[str, str]] = None, 

877 ) -> TaskDatasetTypes: 

878 """Extract and classify the dataset types from a single `PipelineTask`. 

879 

880 Parameters 

881 ---------- 

882 taskDef: `TaskDef` 

883 An instance of a `TaskDef` class for a particular `PipelineTask`. 

884 registry: `Registry` 

885 Registry used to construct normalized `DatasetType` objects and 

886 retrieve those that are incomplete. 

887 include_configs : `bool`, optional 

888 If `True` (default) include config dataset types as 

889 ``initOutputs``. 

890 storage_class_mapping : `Mapping` of `str` to `StorageClass`, optional 

891 If a taskdef contains a component dataset type that is unknown 

892 to the registry, its parent StorageClass will be looked up in this 

893 mapping if it is supplied. If the mapping does not contain the 

894 composite dataset type, or the mapping is not supplied, an exception 

895 will be raised. 

896 

897 Returns 

898 ------- 

899 types: `TaskDatasetTypes` 

900 The dataset types used by this task. 

901 

902 Raises 

903 ------ 

904 ValueError 

905 Raised if dataset type connection definition differs from 

906 registry definition. 

907 LookupError 

908 Raised if component parent StorageClass could not be determined 

909 and storage_class_mapping does not contain the composite type, or 

910 is set to None. 

911 """ 

912 

913 def makeDatasetTypesSet( 

914 connectionType: str, 

915 is_input: bool, 

916 freeze: bool = True, 

917 ) -> NamedValueSet[DatasetType]: 

918 """Constructs a set of true `DatasetType` objects 

919 

920 Parameters 

921 ---------- 

922 connectionType : `str` 

923 Name of the connection type to produce a set for, corresponds 

924 to an attribute of type `list` on the connection class instance 

925 is_input : `bool` 

926 If `True`, these are input dataset types, else they are output dataset 

927 types. 

928 freeze : `bool`, optional 

929 If `True`, call `NamedValueSet.freeze` on the object returned. 

930 

931 Returns 

932 ------- 

933 datasetTypes : `NamedValueSet` 

934 A set of all datasetTypes which correspond to the input 

935 connection type specified in the connection class of this 

936 `PipelineTask` 

937 

938 Raises 

939 ------ 

940 ValueError 

941 Raised if dataset type connection definition differs from 

942 registry definition. 

943 LookupError 

944 Raised if component parent StorageClass could not be determined 

945 and storage_class_mapping does not contain the composite type, 

946 or is set to None. 

947 

948 Notes 

949 ----- 

950 This function is a closure over the variables ``registry``, 

951 ``taskDef``, and ``storage_class_mapping``. 

952 """ 

953 datasetTypes = NamedValueSet[DatasetType]() 

954 for c in iterConnections(taskDef.connections, connectionType): 

955 dimensions = set(getattr(c, "dimensions", set())) 

956 if "skypix" in dimensions: 

957 try: 

958 datasetType = registry.getDatasetType(c.name) 

959 except LookupError as err: 

960 raise LookupError( 

961 f"DatasetType '{c.name}' referenced by " 

962 f"{type(taskDef.connections).__name__} uses 'skypix' as a dimension " 

963 "placeholder, but does not already exist in the registry. " 

964 "Note that reference catalog names are now used as the dataset " 

965 "type name instead of 'ref_cat'." 

966 ) from err 

967 rest1 = set(registry.dimensions.extract(dimensions - set(["skypix"])).names) 

968 rest2 = set( 

969 dim.name for dim in datasetType.dimensions if not isinstance(dim, SkyPixDimension) 

970 ) 

971 if rest1 != rest2: 

972 raise ValueError( 

973 f"Non-skypix dimensions for dataset type {c.name} declared in " 

974 f"connections ({rest1}) are inconsistent with those in " 

975 f"registry's version of this dataset ({rest2})." 

976 ) 

977 else: 

978 # Component dataset types are not explicitly in the 

979 # registry. This complicates consistency checks with 

980 # registry and requires we work out the composite storage 

981 # class. 

982 registryDatasetType = None 

983 try: 

984 registryDatasetType = registry.getDatasetType(c.name) 

985 except KeyError: 

986 compositeName, componentName = DatasetType.splitDatasetTypeName(c.name) 

987 if componentName: 

988 if storage_class_mapping is None or compositeName not in storage_class_mapping: 

989 raise LookupError( 

990 "Component parent class cannot be determined, and " 

991 "composite name was not in storage class mapping, or no " 

992 "storage_class_mapping was supplied" 

993 ) 

994 else: 

995 parentStorageClass = storage_class_mapping[compositeName] 

996 else: 

997 parentStorageClass = None 

998 datasetType = c.makeDatasetType( 

999 registry.dimensions, parentStorageClass=parentStorageClass 

1000 ) 

1001 registryDatasetType = datasetType 

1002 else: 

1003 datasetType = c.makeDatasetType( 

1004 registry.dimensions, parentStorageClass=registryDatasetType.parentStorageClass 

1005 ) 

1006 

1007 if registryDatasetType and datasetType != registryDatasetType: 

1008 # The dataset types differ but first check to see if 

1009 # they are compatible before raising. 

1010 if is_input: 

1011 # This DatasetType must be compatible on get. 

1012 is_compatible = datasetType.is_compatible_with(registryDatasetType) 

1013 else: 

1014 # Has to be able to be converted to the expected type 

1015 # on put. 

1016 is_compatible = registryDatasetType.is_compatible_with(datasetType) 

1017 if is_compatible: 

1018 # For inputs we want the pipeline to use the 

1019 # pipeline definition, for outputs it should use 

1020 # the registry definition. 

1021 if not is_input: 

1022 datasetType = registryDatasetType 

1023 _LOG.debug( 

1024 "Dataset types differ (task %s != registry %s) but are compatible" 

1025 " for %s in %s.", 

1026 datasetType, 

1027 registryDatasetType, 

1028 "input" if is_input else "output", 

1029 taskDef.label, 

1030 ) 

1031 else: 

1032 try: 

1033 # Explicitly check for storage class just to 

1034 # make more specific message. 

1035 _ = datasetType.storageClass 

1036 except KeyError: 

1037 raise ValueError( 

1038 "Storage class does not exist for supplied dataset type " 

1039 f"{datasetType} for {taskDef.label}." 

1040 ) from None 

1041 raise ValueError( 

1042 f"Supplied dataset type ({datasetType}) inconsistent with " 

1043 f"registry definition ({registryDatasetType}) " 

1044 f"for {taskDef.label}." 

1045 ) 

1046 datasetTypes.add(datasetType) 

1047 if freeze: 

1048 datasetTypes.freeze() 

1049 return datasetTypes 

1050 

1051 # optionally add initOutput dataset for config 

1052 initOutputs = makeDatasetTypesSet("initOutputs", is_input=False, freeze=False) 

1053 if include_configs: 

1054 initOutputs.add( 

1055 DatasetType( 

1056 taskDef.configDatasetName, 

1057 registry.dimensions.empty, 

1058 storageClass="Config", 

1059 ) 

1060 ) 

1061 initOutputs.freeze() 

1062 

1063 # optionally add output dataset for metadata 

1064 outputs = makeDatasetTypesSet("outputs", is_input=False, freeze=False) 

1065 

1066 # Metadata is supposed to be of the TaskMetadata type, its dimensions 

1067 # correspond to a task quantum. 

1068 dimensions = registry.dimensions.extract(taskDef.connections.dimensions) 

1069 

1070 # Allow the storage class definition to be read from the existing 

1071 # dataset type definition if present. 

1072 try: 

1073 current = registry.getDatasetType(taskDef.metadataDatasetName) 

1074 except KeyError: 

1075 # No previous definition so use the default. 

1076 storageClass = "TaskMetadata" if _TASK_METADATA_TYPE is TaskMetadata else "PropertySet" 

1077 else: 

1078 storageClass = current.storageClass.name 

1079 outputs.update({DatasetType(taskDef.metadataDatasetName, dimensions, storageClass)}) 

1080 

1081 if taskDef.logOutputDatasetName is not None: 

1082 # Log output dimensions correspond to a task quantum. 

1083 dimensions = registry.dimensions.extract(taskDef.connections.dimensions) 

1084 outputs.update({DatasetType(taskDef.logOutputDatasetName, dimensions, "ButlerLogRecords")}) 

1085 

1086 outputs.freeze() 

1087 

1088 inputs = makeDatasetTypesSet("inputs", is_input=True) 

1089 queryConstraints = NamedValueSet( 

1090 inputs[c.name] 

1091 for c in cast(Iterable[Input], iterConnections(taskDef.connections, "inputs")) 

1092 if not c.deferGraphConstraint 

1093 ) 

1094 

1095 return cls( 

1096 initInputs=makeDatasetTypesSet("initInputs", is_input=True), 

1097 initOutputs=initOutputs, 

1098 inputs=inputs, 

1099 queryConstraints=queryConstraints, 

1100 prerequisites=makeDatasetTypesSet("prerequisiteInputs", is_input=True), 

1101 outputs=outputs, 

1102 ) 

1103 
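A sketch of classifying a single task's dataset types against a butler registry; the repository path, pipeline path, and task label are placeholders.

    from lsst.daf.butler import Butler
    from lsst.pipe.base import Pipeline, TaskDatasetTypes

    butler = Butler("/path/to/repo")
    pipeline = Pipeline.from_uri("/path/to/pipeline.yaml")
    task_def = pipeline["myTask"]  # build a single TaskDef by label
    task_types = TaskDatasetTypes.fromTaskDef(task_def, registry=butler.registry)
    print(task_types.inputs.names)
    print(task_types.outputs.names)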

1104 

1105@dataclass(frozen=True) 

1106class PipelineDatasetTypes: 

1107 """An immutable struct that classifies the dataset types used in a 

1108 `Pipeline`. 

1109 """ 

1110 

1111 packagesDatasetName: ClassVar[str] = "packages" 

1112 """Name of a dataset type used to save package versions. 

1113 """ 

1114 

1115 initInputs: NamedValueSet[DatasetType] 

1116 """Dataset types that are needed as inputs in order to construct the Tasks 

1117 in this Pipeline. 

1118 

1119 This does not include dataset types that are produced when constructing 

1120 other Tasks in the Pipeline (these are classified as `initIntermediates`). 

1121 """ 

1122 

1123 initOutputs: NamedValueSet[DatasetType] 

1124 """Dataset types that may be written after constructing the Tasks in this 

1125 Pipeline. 

1126 

1127 This does not include dataset types that are also used as inputs when 

1128 constructing other Tasks in the Pipeline (these are classified as 

1129 `initIntermediates`). 

1130 """ 

1131 

1132 initIntermediates: NamedValueSet[DatasetType] 

1133 """Dataset types that are both used when constructing one or more Tasks 

1134 in the Pipeline and produced as a side-effect of constructing another 

1135 Task in the Pipeline. 

1136 """ 

1137 

1138 inputs: NamedValueSet[DatasetType] 

1139 """Dataset types that are regular inputs for the full pipeline. 

1140 

1141 If an input dataset needed for a Quantum cannot be found in the input 

1142 collection(s), that Quantum (and all dependent Quanta) will not be 

1143 produced. 

1144 """ 

1145 

1146 queryConstraints: NamedValueSet[DatasetType] 

1147 """Regular inputs that should be used as constraints on the initial 

1148 QuantumGraph generation data ID query, according to their tasks 

1149 (`NamedValueSet`). 

1150 """ 

1151 

1152 prerequisites: NamedValueSet[DatasetType] 

1153 """Dataset types that are prerequisite inputs for the full Pipeline. 

1154 

1155 Prerequisite inputs must exist in the input collection(s) before the 

1156 pipeline is run, but do not constrain the graph - if a prerequisite is 

1157 missing for a Quantum, `PrerequisiteMissingError` is raised. 

1158 

1159 Prerequisite inputs are not resolved until the second stage of 

1160 QuantumGraph generation. 

1161 """ 

1162 

1163 intermediates: NamedValueSet[DatasetType] 

1164 """Dataset types that are output by one Task in the Pipeline and consumed 

1165 as inputs by one or more other Tasks in the Pipeline. 

1166 """ 

1167 

1168 outputs: NamedValueSet[DatasetType] 

1169 """Dataset types that are output by a Task in the Pipeline and not consumed 

1170 by any other Task in the Pipeline. 

1171 """ 

1172 

1173 byTask: Mapping[str, TaskDatasetTypes] 

1174 """Per-Task dataset types, keyed by label in the `Pipeline`. 

1175 

1176 This is guaranteed to be zip-iterable with the `Pipeline` itself (assuming 

1177 neither has been modified since the dataset types were extracted, of 

1178 course). 

1179 """ 

1180 

1181 @classmethod 

1182 def fromPipeline( 

1183 cls, 

1184 pipeline: Union[Pipeline, Iterable[TaskDef]], 

1185 *, 

1186 registry: Registry, 

1187 include_configs: bool = True, 

1188 include_packages: bool = True, 

1189 ) -> PipelineDatasetTypes: 

1190 """Extract and classify the dataset types from all tasks in a 

1191 `Pipeline`. 

1192 

1193 Parameters 

1194 ---------- 

1195 pipeline: `Pipeline` or `Iterable` [ `TaskDef` ] 

1196 A collection of tasks that can be run together. 

1197 registry: `Registry` 

1198 Registry used to construct normalized `DatasetType` objects and 

1199 retrieve those that are incomplete. 

1200 include_configs : `bool`, optional 

1201 If `True` (default) include config dataset types as 

1202 ``initOutputs``. 

1203 include_packages : `bool`, optional 

1204 If `True` (default) include the dataset type for software package 

1205 versions in ``initOutputs``. 

1206 

1207 Returns 

1208 ------- 

1209 types: `PipelineDatasetTypes` 

1210 The dataset types used by this `Pipeline`. 

1211 

1212 Raises 

1213 ------ 

1214 ValueError 

1215 Raised if Tasks are inconsistent about which datasets are marked 

1216 prerequisite. This indicates that the Tasks cannot be run as part 

1217 of the same `Pipeline`. 

1218 """ 

1219 allInputs = NamedValueSet[DatasetType]() 

1220 allOutputs = NamedValueSet[DatasetType]() 

1221 allInitInputs = NamedValueSet[DatasetType]() 

1222 allInitOutputs = NamedValueSet[DatasetType]() 

1223 prerequisites = NamedValueSet[DatasetType]() 

1224 queryConstraints = NamedValueSet[DatasetType]() 

1225 byTask = dict() 

1226 if include_packages: 

1227 allInitOutputs.add( 

1228 DatasetType( 

1229 cls.packagesDatasetName, 

1230 registry.dimensions.empty, 

1231 storageClass="Packages", 

1232 ) 

1233 ) 

1234 # create a list of TaskDefs in case the input is a generator 

1235 pipeline = list(pipeline) 

1236 

1237 # collect all the output dataset types 

1238 typeStorageclassMap: Dict[str, str] = {} 

1239 for taskDef in pipeline: 

1240 for outConnection in iterConnections(taskDef.connections, "outputs"): 

1241 typeStorageclassMap[outConnection.name] = outConnection.storageClass 

1242 

1243 for taskDef in pipeline: 

1244 thisTask = TaskDatasetTypes.fromTaskDef( 

1245 taskDef, 

1246 registry=registry, 

1247 include_configs=include_configs, 

1248 storage_class_mapping=typeStorageclassMap, 

1249 ) 

1250 allInitInputs.update(thisTask.initInputs) 

1251 allInitOutputs.update(thisTask.initOutputs) 

1252 allInputs.update(thisTask.inputs) 

1253 # Inputs are query constraints if any task considers them a query 

1254 # constraint. 

1255 queryConstraints.update(thisTask.queryConstraints) 

1256 prerequisites.update(thisTask.prerequisites) 

1257 allOutputs.update(thisTask.outputs) 

1258 byTask[taskDef.label] = thisTask 

1259 if not prerequisites.isdisjoint(allInputs): 

1260 raise ValueError( 

1261 "{} marked as both prerequisites and regular inputs".format( 

1262 {dt.name for dt in allInputs & prerequisites} 

1263 ) 

1264 ) 

1265 if not prerequisites.isdisjoint(allOutputs): 

1266 raise ValueError( 

1267 "{} marked as both prerequisites and outputs".format( 

1268 {dt.name for dt in allOutputs & prerequisites} 

1269 ) 

1270 ) 

1271 # Make sure that components which are marked as inputs get treated as 

1272 # intermediates if there is an output which produces the composite 

1273 # containing the component 

1274 intermediateComponents = NamedValueSet[DatasetType]() 

1275 intermediateComposites = NamedValueSet[DatasetType]() 

1276 outputNameMapping = {dsType.name: dsType for dsType in allOutputs} 

1277 for dsType in allInputs: 

1278 # get the name of a possible component 

1279 name, component = dsType.nameAndComponent() 

1280 # if there is a component name, that means this is a component 

1281 # DatasetType, if there is an output which produces the parent of 

1282 # this component, treat this input as an intermediate 

1283 if component is not None: 

1284 # This needs to be in this if block, because someone might have 

1285 # a composite that is a pure input from existing data 

1286 if name in outputNameMapping: 

1287 intermediateComponents.add(dsType) 

1288 intermediateComposites.add(outputNameMapping[name]) 

1289 

1290 def checkConsistency(a: NamedValueSet, b: NamedValueSet) -> None: 

1291 common = a.names & b.names 

1292 for name in common: 

1293 # Any compatibility is allowed. This function does not know 

1294 # if a dataset type is to be used for input or output. 

1295 if not (a[name].is_compatible_with(b[name]) or b[name].is_compatible_with(a[name])): 

1296 raise ValueError(f"Conflicting definitions for dataset type: {a[name]} != {b[name]}.") 

1297 

1298 checkConsistency(allInitInputs, allInitOutputs) 

1299 checkConsistency(allInputs, allOutputs) 

1300 checkConsistency(allInputs, intermediateComposites) 

1301 checkConsistency(allOutputs, intermediateComposites) 

1302 

1303 def frozen(s: AbstractSet[DatasetType]) -> NamedValueSet[DatasetType]: 

1304 assert isinstance(s, NamedValueSet) 

1305 s.freeze() 

1306 return s 

1307 

1308 inputs = frozen(allInputs - allOutputs - intermediateComponents) 

1309 

1310 return cls( 

1311 initInputs=frozen(allInitInputs - allInitOutputs), 

1312 initIntermediates=frozen(allInitInputs & allInitOutputs), 

1313 initOutputs=frozen(allInitOutputs - allInitInputs), 

1314 inputs=inputs, 

1315 queryConstraints=frozen(queryConstraints & inputs), 

1316 # If there are storage class differences in inputs and outputs 

1317 # the intermediates have to choose priority. Here choose that 

1319 # inputs to tasks must match the requested storage class by 

1319 # applying the inputs over the top of the outputs. 

1320 intermediates=frozen(allOutputs & allInputs | intermediateComponents), 

1321 outputs=frozen(allOutputs - allInputs - intermediateComposites), 

1322 prerequisites=frozen(prerequisites), 

1323 byTask=MappingProxyType(byTask), # MappingProxyType -> frozen view of dict for immutability 

1324 ) 

1325 
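A sketch of classifying the dataset types for an entire pipeline; the paths and the task label are placeholders.

    from lsst.daf.butler import Butler
    from lsst.pipe.base import Pipeline, PipelineDatasetTypes

    butler = Butler("/path/to/repo")
    pipeline = Pipeline.from_uri("/path/to/pipeline.yaml")
    pipeline_types = PipelineDatasetTypes.fromPipeline(pipeline, registry=butler.registry)
    print(pipeline_types.inputs.names)         # overall inputs to the pipeline
    print(pipeline_types.intermediates.names)  # produced and consumed internally
    print(pipeline_types.byTask["myTask"].outputs.names)  # per-task breakdown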

1326 @classmethod 

1327 def initOutputNames( 

1328 cls, 

1329 pipeline: Union[Pipeline, Iterable[TaskDef]], 

1330 *, 

1331 include_configs: bool = True, 

1332 include_packages: bool = True, 

1333 ) -> Iterator[str]: 

1334 """Return the names of dataset types of task initOutputs, Configs, 

1335 and package versions for a pipeline. 

1336 

1337 Parameters 

1338 ---------- 

1339 pipeline: `Pipeline` or `Iterable` [ `TaskDef` ] 

1340 A `Pipeline` instance or collection of `TaskDef` instances. 

1341 include_configs : `bool`, optional 

1342 If `True` (default) include config dataset types. 

1343 include_packages : `bool`, optional 

1344 If `True` (default) include the dataset type for package versions. 

1345 

1346 Yields 

1347 ------ 

1348 datasetTypeName : `str` 

1349 Name of the dataset type. 

1350 """ 

1351 if include_packages: 

1352 # Package versions dataset type 

1353 yield cls.packagesDatasetName 

1354 

1355 if isinstance(pipeline, Pipeline): 

1356 pipeline = pipeline.toExpandedPipeline() 

1357 

1358 for taskDef in pipeline: 

1359 # all task InitOutputs 

1360 for name in taskDef.connections.initOutputs: 

1361 attribute = getattr(taskDef.connections, name) 

1362 yield attribute.name 

1363 

1364 # config dataset name 

1365 if include_configs: 

1366 yield taskDef.configDatasetName
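A sketch of listing init-output dataset type names (per-task init-outputs, config datasets, and the package-versions dataset) without needing a registry; the pipeline path is a placeholder.

    from lsst.pipe.base import Pipeline, PipelineDatasetTypes

    pipeline = Pipeline.from_uri("/path/to/pipeline.yaml")
    for name in PipelineDatasetTypes.initOutputNames(pipeline):
        print(name)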