Coverage for python/lsst/pipe/base/pipeline.py: 20%

451 statements  

coverage.py v7.2.5, created at 2023-05-06 02:42 -0700

1# This file is part of pipe_base. 

2# 

3# Developed for the LSST Data Management System. 

4# This product includes software developed by the LSST Project 

5# (http://www.lsst.org). 

6# See the COPYRIGHT file at the top-level directory of this distribution 

7# for details of code ownership. 

8# 

9# This program is free software: you can redistribute it and/or modify 

10# it under the terms of the GNU General Public License as published by 

11# the Free Software Foundation, either version 3 of the License, or 

12# (at your option) any later version. 

13# 

14# This program is distributed in the hope that it will be useful, 

15# but WITHOUT ANY WARRANTY; without even the implied warranty of 

16# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the 

17# GNU General Public License for more details. 

18# 

19# You should have received a copy of the GNU General Public License 

20# along with this program. If not, see <http://www.gnu.org/licenses/>. 

21from __future__ import annotations 

22 

23"""Module defining Pipeline class and related methods. 

24""" 

25 

26__all__ = ["Pipeline", "TaskDef", "TaskDatasetTypes", "PipelineDatasetTypes", "LabelSpecifier"] 

27 

28import copy 

29import logging 

30import os 

31import re 

32import urllib.parse 

33 

34# ------------------------------- 

35# Imports of standard modules -- 

36# ------------------------------- 

37from dataclasses import dataclass 

38from types import MappingProxyType 

39from typing import ( 

40 TYPE_CHECKING, 

41 AbstractSet, 

42 Callable, 

43 ClassVar, 

44 Dict, 

45 Generator, 

46 Iterable, 

47 Iterator, 

48 Mapping, 

49 Optional, 

50 Set, 

51 Tuple, 

52 Type, 

53 Union, 

54 cast, 

55) 

56 

57# ----------------------------- 

58# Imports for other modules -- 

59from lsst.daf.butler import DatasetType, NamedValueSet, Registry, SkyPixDimension 

60from lsst.resources import ResourcePath, ResourcePathExpression 

61from lsst.utils import doImportType 

62from lsst.utils.introspection import get_full_type_name 

63 

64from . import pipelineIR, pipeTools 

65from ._task_metadata import TaskMetadata 

66from .config import PipelineTaskConfig 

67from .configOverrides import ConfigOverrides 

68from .connections import iterConnections 

69from .connectionTypes import Input 

70from .pipelineTask import PipelineTask 

71from .task import _TASK_METADATA_TYPE 

72 

73if TYPE_CHECKING:  # Imports needed only for type annotations; may be circular. 

74 from lsst.obs.base import Instrument 

75 from lsst.pex.config import Config 

76 

77# ---------------------------------- 

78# Local non-exported definitions -- 

79# ---------------------------------- 

80 

81_LOG = logging.getLogger(__name__) 

82 

83# ------------------------ 

84# Exported definitions -- 

85# ------------------------ 

86 

87 

88@dataclass 

89class LabelSpecifier: 

90 """A structure to specify a subset of labels to load 

91 

92 This structure may contain a set of labels to be used in subsetting a 

93 pipeline, or a beginning and end point. Beginning or end may be empty, 

94 in which case the range will be a half open interval. Unlike python 

95 iteration bounds, end bounds are *INCLUDED*. Note that range based 

96 selection is not well defined for pipelines that are not linear in nature, 

97 and correct behavior is not guaranteed, or may vary from run to run. 

98 """ 

99 

100 labels: Optional[Set[str]] = None 

101 begin: Optional[str] = None 

102 end: Optional[str] = None 

103 

104 def __post_init__(self) -> None: 

105 if self.labels is not None and (self.begin or self.end): 

106 raise ValueError( 

107 "This struct can only be initialized with a labels set or a begin (and/or) end specifier" 

108 ) 

109 
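# A minimal usage sketch for LabelSpecifier (illustrative only; the task
# labels "isr" and "calibrate" below are hypothetical):
#
#     spec_by_set = LabelSpecifier(labels={"isr", "calibrate"})
#     spec_by_range = LabelSpecifier(begin="isr", end="calibrate")  # bounds inclusive
#     # Supplying both a label set and a begin/end bound raises ValueError
#     # (see __post_init__ above).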

110 

111class TaskDef: 

112 """TaskDef is a collection of information about task needed by Pipeline. 

113 

114 The information includes task name, configuration object and optional 

115 task class. This class is just a collection of attributes and it exposes 

116 all of them so that they can be modified in place 

117 (e.g. if configuration needs extra overrides). 

118 

119 Attributes 

120 ---------- 

121 taskName : `str`, optional 

122 The fully-qualified `PipelineTask` class name. If not provided, 

123 ``taskClass`` must be. 

124 config : `lsst.pipe.base.config.PipelineTaskConfig`, optional 

125 Instance of the configuration class corresponding to this task class, 

126 usually with all overrides applied. This config will be frozen. If 

127 not provided, ``taskClass`` must be provided and 

128 ``taskClass.ConfigClass()`` will be used. 

129 taskClass : `type`, optional 

130 `PipelineTask` class object; if provided and ``taskName`` is as well, 

131 the caller guarantees that they are consistent. If not provided, 

132 ``taskName`` is used to import the type. 

133 label : `str`, optional 

134 Task label, usually a short string unique in a pipeline. If not 

135 provided, ``taskClass`` must be, and ``taskClass._DefaultName`` will 

136 be used. 

137 """ 

138 

139 def __init__( 

140 self, 

141 taskName: Optional[str] = None, 

142 config: Optional[PipelineTaskConfig] = None, 

143 taskClass: Optional[Type[PipelineTask]] = None, 

144 label: Optional[str] = None, 

145 ): 

146 if taskName is None: 

147 if taskClass is None: 

148 raise ValueError("At least one of `taskName` and `taskClass` must be provided.") 

149 taskName = get_full_type_name(taskClass) 

150 elif taskClass is None: 

151 taskClass = doImportType(taskName) 

152 if config is None: 

153 if taskClass is None: 

154 raise ValueError("`taskClass` must be provided if `config` is not.") 

155 config = taskClass.ConfigClass() 

156 if label is None: 

157 if taskClass is None: 

158 raise ValueError("`taskClass` must be provided if `label` is not.") 

159 label = taskClass._DefaultName 

160 self.taskName = taskName 

161 try: 

162 config.validate() 

163 except Exception: 

164 _LOG.error("Configuration validation failed for task %s (%s)", label, taskName) 

165 raise 

166 config.freeze() 

167 self.config = config 

168 self.taskClass = taskClass 

169 self.label = label 

170 self.connections = config.connections.ConnectionsClass(config=config) 

171 

172 @property 

173 def configDatasetName(self) -> str: 

174 """Name of a dataset type for configuration of this task (`str`)""" 

175 return self.label + "_config" 

176 

177 @property 

178 def metadataDatasetName(self) -> Optional[str]: 

179 """Name of a dataset type for metadata of this task, `None` if 

180 metadata is not to be saved (`str`) 

181 """ 

182 if self.config.saveMetadata: 

183 return self.makeMetadataDatasetName(self.label) 

184 else: 

185 return None 

186 

187 @classmethod 

188 def makeMetadataDatasetName(cls, label: str) -> str: 

189 """Construct the name of the dataset type for metadata for a task. 

190 

191 Parameters 

192 ---------- 

193 label : `str` 

194 Label for the task within its pipeline. 

195 

196 Returns 

197 ------- 

198 name : `str` 

199 Name of the task's metadata dataset type. 

200 """ 

201 return f"{label}_metadata" 

202 

203 @property 

204 def logOutputDatasetName(self) -> Optional[str]: 

205 """Name of a dataset type for log output from this task, `None` if 

206 logs are not to be saved (`str`) 

207 """ 

208 if cast(PipelineTaskConfig, self.config).saveLogOutput: 

209 return self.label + "_log" 

210 else: 

211 return None 

212 

213 def __str__(self) -> str: 

214 rep = "TaskDef(" + self.taskName 

215 if self.label: 

216 rep += ", label=" + self.label 

217 rep += ")" 

218 return rep 

219 

220 def __eq__(self, other: object) -> bool: 

221 if not isinstance(other, TaskDef): 

222 return False 

223 # This does not consider equality of configs when determining equality 

224 # as config equality is a difficult thing to define. Should be updated 

225 # after DM-27847 

226 return self.taskClass == other.taskClass and self.label == other.label 

227 

228 def __hash__(self) -> int: 

229 return hash((self.taskClass, self.label)) 

230 

231 @classmethod 

232 def _unreduce(cls, taskName: str, config: PipelineTaskConfig, label: str) -> TaskDef: 

233 """Custom callable for unpickling. 

234 

235 All arguments are forwarded directly to the constructor; this 

236 trampoline is only needed because ``__reduce__`` callables can't be 

237 called with keyword arguments. 

238 """ 

239 return cls(taskName=taskName, config=config, label=label) 

240 

241 def __reduce__(self) -> Tuple[Callable[[str, PipelineTaskConfig, str], TaskDef], Tuple[str, Config, str]]: 

242 return (self._unreduce, (self.taskName, self.config, self.label)) 

243 
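# A minimal usage sketch for TaskDef (illustrative only; ``MyTask`` stands in
# for any PipelineTask subclass defined elsewhere):
#
#     task_def = TaskDef(taskClass=MyTask, label="myTask")
#     # taskName defaults to the fully-qualified class name and config to
#     # MyTask.ConfigClass(); the config is validated and frozen on init.
#     print(task_def.configDatasetName)    # "myTask_config"
#     print(task_def.metadataDatasetName)  # "myTask_metadata", or None if
#                                          # config.saveMetadata is False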

244 

245class Pipeline: 

246 """A `Pipeline` is a representation of a series of tasks to run, and the 

247 configuration for those tasks. 

248 

249 Parameters 

250 ---------- 

251 description : `str` 

252 A description of what this pipeline does. 

253 """ 

254 

255 def __init__(self, description: str): 

256 pipeline_dict = {"description": description, "tasks": {}} 

257 self._pipelineIR = pipelineIR.PipelineIR(pipeline_dict) 

258 

259 @classmethod 

260 def fromFile(cls, filename: str) -> Pipeline: 

261 """Load a pipeline defined in a pipeline yaml file. 

262 

263 Parameters 

264 ---------- 

265 filename: `str` 

266 A path that points to a pipeline defined in yaml format. This 

267 filename may also supply additional labels to be used in 

268 subsetting the loaded Pipeline. These labels are separated from 

269 the path by a \\#, and may be specified as a comma separated 

270 list, or a range denoted as beginning..end. Beginning or end may 

271 be empty, in which case the range will be a half open interval. 

272 Unlike python iteration bounds, end bounds are *INCLUDED*. Note 

273 that range based selection is not well defined for pipelines that 

274 are not linear in nature, and correct behavior is not guaranteed, 

275 or may vary from run to run. 

276 

277 Returns 

278 ------- 

279 pipeline: `Pipeline` 

280 The pipeline loaded from specified location with appropriate (if 

281 any) subsetting 

282 

283 Notes 

284 ----- 

285 This method attempts to prune any contracts that contain labels which 

286 are not in the declared subset of labels. This pruning is done using 

287 string-based matching due to the nature of contracts and may prune more 

288 than it should. 

289 """ 

290 return cls.from_uri(filename) 

291 

292 @classmethod 

293 def from_uri(cls, uri: ResourcePathExpression) -> Pipeline: 

294 """Load a pipeline defined in a pipeline yaml file at a location 

295 specified by a URI. 

296 

297 Parameters 

298 ---------- 

299 uri: convertible to `ResourcePath` 

300 If a string is supplied this should be a URI path that points to a 

301 pipeline defined in yaml format, either as a direct path to the 

302 yaml file, or as a directory containing a "pipeline.yaml" file (the 

303 form used by `write_to_uri` with ``expand=True``). This uri may 

304 also supply additional labels to be used in subsetting the loaded 

305 Pipeline. These labels are separated from the path by a \\#, and 

306 may be specified as a comma separated list, or a range denoted as 

307 beginning..end. Beginning or end may be empty, in which case the 

308 range will be a half open interval. Unlike python iteration bounds, 

309 end bounds are *INCLUDED*. Note that range based selection is not 

310 well defined for pipelines that are not linear in nature, and 

311 correct behavior is not guaranteed, or may vary from run to run. 

312 The same specifiers can be used with a `ResourcePath` object, by 

313 being the sole contents in the fragments attribute. 

314 

315 Returns 

316 ------- 

317 pipeline: `Pipeline` 

318 The pipeline loaded from specified location with appropriate (if 

319 any) subsetting 

320 

321 Notes 

322 ----- 

323 This method attempts to prune any contracts that contain labels which 

324 are not in the declared subset of labels. This pruning is done using 

325 string-based matching due to the nature of contracts and may prune more 

326 than it should. 

327 """ 

328 # Split up the uri and any labels that were supplied 

329 uri, label_specifier = cls._parse_file_specifier(uri) 

330 pipeline: Pipeline = cls.fromIR(pipelineIR.PipelineIR.from_uri(uri)) 

331 

332 # If there are labels supplied, only keep those 

333 if label_specifier is not None: 

334 pipeline = pipeline.subsetFromLabels(label_specifier) 

335 return pipeline 

336 
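# A usage sketch for from_uri (illustrative; the file path and labels are
# hypothetical). Labels supplied after "#" subset the loaded pipeline:
#
#     p1 = Pipeline.from_uri("my_pipeline.yaml")                 # whole pipeline
#     p2 = Pipeline.from_uri("my_pipeline.yaml#isr,calibrate")   # explicit labels
#     p3 = Pipeline.from_uri("my_pipeline.yaml#isr..calibrate")  # inclusive range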

337 def subsetFromLabels(self, labelSpecifier: LabelSpecifier) -> Pipeline: 

338 """Subset a pipeline to contain only labels specified in labelSpecifier 

339 

340 Parameters 

341 ---------- 

342 labelSpecifier : `labelSpecifier` 

343 Object containing labels that describes how to subset a pipeline. 

344 

345 Returns 

346 ------- 

347 pipeline : `Pipeline` 

348 A new pipeline object that is a subset of the old pipeline 

349 

350 Raises 

351 ------ 

352 ValueError 

353 Raised if there is an issue with specified labels 

354 

355 Notes 

356 ----- 

357 This method attempts to prune any contracts that contain labels which 

358 are not in the declared subset of labels. This pruning is done using 

359 string-based matching due to the nature of contracts and may prune more 

360 than it should. 

361 """ 

362 # Labels supplied as a set 

363 if labelSpecifier.labels: 

364 labelSet = labelSpecifier.labels 

365 # Labels supplied as a range, first create a list of all the labels 

366 # in the pipeline sorted according to task dependency. Then only 

367 # keep labels that lie between the supplied bounds 

368 else: 

369 # Create a copy of the pipeline to use when assessing the label 

370 # ordering. Use a dict for fast searching while preserving order. 

371 # Remove contracts so they do not fail in the expansion step. This 

372 # is needed because a user may only configure the tasks they intend 

373 # to run, which may cause some contracts to fail if they will later 

374 # be dropped 

375 pipeline = copy.deepcopy(self) 

376 pipeline._pipelineIR.contracts = [] 

377 labels = {taskdef.label: True for taskdef in pipeline.toExpandedPipeline()} 

378 

379 # Verify the bounds are in the labels 

380 if labelSpecifier.begin is not None: 

381 if labelSpecifier.begin not in labels: 

382 raise ValueError( 

383 f"Beginning of range subset, {labelSpecifier.begin}, not found in pipeline definition" 

384 ) 

385 if labelSpecifier.end is not None: 

386 if labelSpecifier.end not in labels: 

387 raise ValueError( 

388 f"End of range subset, {labelSpecifier.end}, not found in pipeline definition" 

389 ) 

390 

391 labelSet = set() 

392 for label in labels: 

393 if labelSpecifier.begin is not None: 

394 if label != labelSpecifier.begin: 

395 continue 

396 else: 

397 labelSpecifier.begin = None 

398 labelSet.add(label) 

399 if labelSpecifier.end is not None and label == labelSpecifier.end: 

400 break 

401 return Pipeline.fromIR(self._pipelineIR.subset_from_labels(labelSet)) 

402 
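# A usage sketch for subsetFromLabels (illustrative; assumes ``pipeline`` was
# loaded as in the earlier sketch and contains the hypothetical labels used here):
#
#     subset = pipeline.subsetFromLabels(LabelSpecifier(labels={"calibrate"}))
#     # or keep everything between two labels (bounds inclusive):
#     subset = pipeline.subsetFromLabels(LabelSpecifier(begin="isr", end="calibrate"))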

403 @staticmethod 

404 def _parse_file_specifier(uri: ResourcePathExpression) -> Tuple[ResourcePath, Optional[LabelSpecifier]]: 

405 """Split appart a uri and any possible label subsets""" 

406 if isinstance(uri, str): 

407 # This is to support legacy pipelines during transition 

408 uri, num_replace = re.subn("[:](?!\\/\\/)", "#", uri) 

409 if num_replace: 

410 raise ValueError( 

411 f"The pipeline file {uri} seems to use the legacy :" 

412 " to separate labels, please use # instead." 

413 ) 

414 if uri.count("#") > 1: 

415 raise ValueError("Only one set of labels is allowed when specifying a pipeline to load") 

416 # Everything else can be converted directly to ResourcePath. 

417 uri = ResourcePath(uri) 

418 label_subset = uri.fragment or None 

419 

420 specifier: Optional[LabelSpecifier] 

421 if label_subset is not None: 

422 label_subset = urllib.parse.unquote(label_subset) 

423 args: Dict[str, Union[Set[str], str, None]] 

424 # labels supplied as a list 

425 if "," in label_subset: 

426 if ".." in label_subset: 

427 raise ValueError( 

428 "Can only specify a list of labels or a rangewhen loading a Pipline not both" 

429 ) 

430 args = {"labels": set(label_subset.split(","))} 

431 # labels supplied as a range 

432 elif ".." in label_subset: 

433 # Try to de-structure the labelSubset, this will fail if more 

434 # than one range is specified 

435 begin, end, *rest = label_subset.split("..") 

436 if rest: 

437 raise ValueError("Only one range can be specified when loading a pipeline") 

438 args = {"begin": begin if begin else None, "end": end if end else None} 

439 # Assume anything else is a single label 

440 else: 

441 args = {"labels": {label_subset}} 

442 

443 # MyPy doesn't like how cavalier kwarg construction is with types. 

444 specifier = LabelSpecifier(**args) # type: ignore 

445 else: 

446 specifier = None 

447 

448 return uri, specifier 

449 

450 @classmethod 

451 def fromString(cls, pipeline_string: str) -> Pipeline: 

452 """Create a pipeline from string formatted as a pipeline document. 

453 

454 Parameters 

455 ---------- 

456 pipeline_string : `str` 

457 A string that is formatted like a pipeline document 

458 

459 Returns 

460 ------- 

461 pipeline: `Pipeline` 

462 """ 

463 pipeline = cls.fromIR(pipelineIR.PipelineIR.from_string(pipeline_string)) 

464 return pipeline 

465 
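# A usage sketch for fromString (illustrative; the task class is hypothetical
# and the YAML uses the short label-to-class task form handled by pipelineIR):
#
#     yaml_doc = """
#     description: A tiny example pipeline
#     tasks:
#       exampleTask: lsst.example.ExampleTask
#     """
#     pipeline = Pipeline.fromString(yaml_doc)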

466 @classmethod 

467 def fromIR(cls, deserialized_pipeline: pipelineIR.PipelineIR) -> Pipeline: 

468 """Create a pipeline from an already created `PipelineIR` object. 

469 

470 Parameters 

471 ---------- 

472 deserialized_pipeline: `PipelineIR` 

473 An already created pipeline intermediate representation object 

474 

475 Returns 

476 ------- 

477 pipeline: `Pipeline` 

478 """ 

479 pipeline = cls.__new__(cls) 

480 pipeline._pipelineIR = deserialized_pipeline 

481 return pipeline 

482 

483 @classmethod 

484 def fromPipeline(cls, pipeline: Pipeline) -> Pipeline: 

485 """Create a new pipeline by copying an already existing `Pipeline`. 

486 

487 Parameters 

488 ---------- 

489 pipeline: `Pipeline` 

490 An already created pipeline intermediate representation object 

491 

492 Returns 

493 ------- 

494 pipeline: `Pipeline` 

495 """ 

496 return cls.fromIR(copy.deepcopy(pipeline._pipelineIR)) 

497 

498 def __str__(self) -> str: 

499 return str(self._pipelineIR) 

500 

501 def mergePipeline(self, pipeline: Pipeline) -> None: 

502 """Merge another in-memory `Pipeline` object into this one. 

503 

504 This merges another pipeline into this object, as if it were declared 

505 in the import block of the yaml definition of this pipeline. This 

506 modifies this pipeline in place. 

507 

508 Parameters 

509 ---------- 

510 pipeline : `Pipeline` 

511 The `Pipeline` object that is to be merged into this object. 

512 """ 

513 self._pipelineIR.merge_pipelines((pipeline._pipelineIR,)) 

514 

515 def addLabelToSubset(self, subset: str, label: str) -> None: 

516 """Add a task label from the specified subset. 

517 

518 Parameters 

519 ---------- 

520 subset : `str` 

521 The labeled subset to modify 

522 label : `str` 

523 The task label to add to the specified subset. 

524 

525 Raises 

526 ------ 

527 ValueError 

528 Raised if the specified subset does not exist within the pipeline. 

529 Raised if the specified label does not exist within the pipeline. 

530 """ 

531 if label not in self._pipelineIR.tasks: 

532 raise ValueError(f"Label {label} does not appear within the pipeline") 

533 if subset not in self._pipelineIR.labeled_subsets: 

534 raise ValueError(f"Subset {subset} does not appear within the pipeline") 

535 self._pipelineIR.labeled_subsets[subset].subset.add(label) 

536 

537 def removeLabelFromSubset(self, subset: str, label: str) -> None: 

538 """Remove a task label from the specified subset. 

539 

540 Parameters 

541 ---------- 

542 subset : `str` 

543 The labeled subset to modify 

544 label : `str` 

545 The task label to remove from the specified subset. 

546 

547 Raises 

548 ------ 

549 ValueError 

550 Raised if the specified subset does not exist in the pipeline. 

551 Raised if the specified label does not exist within the specified 

552 subset. 

553 """ 

554 if subset not in self._pipelineIR.labeled_subsets: 

555 raise ValueError(f"Subset {subset} does not appear within the pipeline") 

556 if label not in self._pipelineIR.labeled_subsets[subset].subset: 

557 raise ValueError(f"Label {label} does not appear within the pipeline") 

558 self._pipelineIR.labeled_subsets[subset].subset.remove(label) 

559 

560 def findSubsetsWithLabel(self, label: str) -> set[str]: 

561 """Find any subsets which may contain the specified label. 

562 

563 This function returns the names of subsets which contain the specified 

564 label. May return an empty set if there are no subsets, or no subsets 

565 containing the specified label. 

566 

567 Parameters 

568 ---------- 

569 label : `str` 

570 The task label to use in membership check 

571 

572 Returns 

573 ------- 

574 subsets : `set` of `str` 

575 Returns a set (possibly empty) of subset names which contain the 

576 specified label. 

577 

578 Raises 

579 ------ 

580 ValueError 

581 Raised if the specified label does not exist within this pipeline. 

582 """ 

583 results = set() 

584 if label not in self._pipelineIR.tasks: 

585 raise ValueError(f"Label {label} does not appear within the pipeline") 

586 for subset in self._pipelineIR.labeled_subsets.values(): 

587 if label in subset.subset: 

588 results.add(subset.label) 

589 return results 

590 
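# A usage sketch for the labeled-subset helpers (illustrative; assumes the
# pipeline defines a subset named "nightly" and a task labeled "calibrate"):
#
#     pipeline.addLabelToSubset("nightly", "calibrate")
#     pipeline.findSubsetsWithLabel("calibrate")   # -> {"nightly", ...}
#     pipeline.removeLabelFromSubset("nightly", "calibrate")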

591 def addInstrument(self, instrument: Union[Instrument, str]) -> None: 

592 """Add an instrument to the pipeline, or replace an instrument that is 

593 already defined. 

594 

595 Parameters 

596 ---------- 

597 instrument : `~lsst.obs.base.Instrument` or `str` 

598 Either a derived class object of `lsst.obs.base.Instrument` or 

599 a string corresponding to a fully qualified 

600 `lsst.obs.base.Instrument` name. 

601 """ 

602 if isinstance(instrument, str): 

603 pass 

604 else: 

605 # TODO: assume that this is a subclass of Instrument, no type 

606 # checking 

607 instrument = get_full_type_name(instrument) 

608 self._pipelineIR.instrument = instrument 

609 

610 def getInstrument(self) -> Optional[str]: 

611 """Get the instrument from the pipeline. 

612 

613 Returns 

614 ------- 

615 instrument : `str`, or None 

616 The fully qualified name of a `lsst.obs.base.Instrument` subclass, 

617 or None if the pipeline does not have an instrument. 

618 """ 

619 return self._pipelineIR.instrument 

620 

621 def addTask(self, task: Union[Type[PipelineTask], str], label: str) -> None: 

622 """Add a new task to the pipeline, or replace a task that is already 

623 associated with the supplied label. 

624 

625 Parameters 

626 ---------- 

627 task: `PipelineTask` or `str` 

628 Either a derived class object of a `PipelineTask` or a string 

629 corresponding to a fully qualified `PipelineTask` name. 

630 label: `str` 

631 A label that is used to identify the `PipelineTask` being added 

632 """ 

633 if isinstance(task, str): 

634 taskName = task 

635 elif issubclass(task, PipelineTask): 

636 taskName = get_full_type_name(task) 

637 else: 

638 raise ValueError( 

639 "task must be either a child class of PipelineTask or a string containing" 

640 " a fully qualified name to one" 

641 ) 

642 if not label: 

643 # in some cases (with command line-generated pipeline) tasks can 

644 # be defined without label which is not acceptable, use task 

645 # _DefaultName in that case 

646 if isinstance(task, str): 

647 task_class = doImportType(task) 

648 label = task_class._DefaultName 

649 self._pipelineIR.tasks[label] = pipelineIR.TaskIR(label, taskName) 

650 

651 def removeTask(self, label: str) -> None: 

652 """Remove a task from the pipeline. 

653 

654 Parameters 

655 ---------- 

656 label : `str` 

657 The label used to identify the task that is to be removed 

658 

659 Raises 

660 ------ 

661 KeyError 

662 If no task with that label exists in the pipeline 

663 

664 """ 

665 self._pipelineIR.tasks.pop(label) 

666 

667 def addConfigOverride(self, label: str, key: str, value: object) -> None: 

668 """Apply single config override. 

669 

670 Parameters 

671 ---------- 

672 label : `str` 

673 Label of the task. 

674 key: `str` 

675 Fully-qualified field name. 

676 value : object 

677 Value to be given to a field. 

678 """ 

679 self._addConfigImpl(label, pipelineIR.ConfigIR(rest={key: value})) 

680 

681 def addConfigFile(self, label: str, filename: str) -> None: 

682 """Add overrides from a specified file. 

683 

684 Parameters 

685 ---------- 

686 label : `str` 

687 The label used to identify the task associated with config to 

688 modify 

689 filename : `str` 

690 Path to the override file. 

691 """ 

692 self._addConfigImpl(label, pipelineIR.ConfigIR(file=[filename])) 

693 

694 def addConfigPython(self, label: str, pythonString: str) -> None: 

695 """Add Overrides by running a snippet of python code against a config. 

696 

697 Parameters 

698 ---------- 

699 label : `str` 

700 The label used to identify the task associated with config to 

701 modify. 

702 pythonString: `str` 

703 A string which is valid python code to be executed. This is done 

704 with config as the only local accessible value. 

705 """ 

706 self._addConfigImpl(label, pipelineIR.ConfigIR(python=pythonString)) 

707 
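# A usage sketch for building and configuring a pipeline in memory
# (illustrative; the task class and config field names are hypothetical):
#
#     pipeline = Pipeline("An example pipeline")
#     pipeline.addTask("lsst.example.ExampleTask", label="exampleTask")
#     pipeline.addConfigOverride("exampleTask", "someField", 42)
#     pipeline.addConfigFile("exampleTask", "overrides.py")
#     pipeline.addConfigPython("exampleTask", "config.someField = 42")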

708 def _addConfigImpl(self, label: str, newConfig: pipelineIR.ConfigIR) -> None: 

709 if label == "parameters": 

710 if newConfig.rest.keys() - self._pipelineIR.parameters.mapping.keys(): 

711 raise ValueError("Cannot override parameters that are not defined in pipeline") 

712 self._pipelineIR.parameters.mapping.update(newConfig.rest) 

713 if newConfig.file: 

714 raise ValueError("Setting parameters section with config file is not supported") 

715 if newConfig.python: 

716 raise ValueError("Setting parameters section using python block in unsupported") 

717 return 

718 if label not in self._pipelineIR.tasks: 

719 raise LookupError(f"There are no tasks labeled '{label}' in the pipeline") 

720 self._pipelineIR.tasks[label].add_or_update_config(newConfig) 

721 

722 def write_to_uri(self, uri: ResourcePathExpression) -> None: 

723 """Write the pipeline to a file or directory. 

724 

725 Parameters 

726 ---------- 

727 uri : convertible to `ResourcePath` 

728 URI to write to; may have any scheme with `ResourcePath` write 

729 support or no scheme for a local file/directory. Should have a 

730 ``.yaml`` extension. 

731 """ 

732 self._pipelineIR.write_to_uri(uri) 

733 

734 def toExpandedPipeline(self) -> Generator[TaskDef, None, None]: 

735 """Returns a generator of TaskDefs which can be used to create quantum 

736 graphs. 

737 

738 Returns 

739 ------- 

740 generator : generator of `TaskDef` 

741 The generator returned will be the sorted iterator of tasks which 

742 are to be used in constructing a quantum graph. 

743 

744 Raises 

745 ------ 

746 NotImplementedError 

747 If a dataId is supplied in a config block. This is in place for 

748 future use 

749 """ 

750 taskDefs = [] 

751 for label in self._pipelineIR.tasks: 

752 taskDefs.append(self._buildTaskDef(label)) 

753 

754 # lets evaluate the contracts 

755 if self._pipelineIR.contracts is not None: 

756 label_to_config = {x.label: x.config for x in taskDefs} 

757 for contract in self._pipelineIR.contracts: 

758 # execute this in its own line so it can raise a good error 

759 # message if there were problems with the eval 

760 success = eval(contract.contract, None, label_to_config) 

761 if not success: 

762 extra_info = f": {contract.msg}" if contract.msg is not None else "" 

763 raise pipelineIR.ContractError( 

764 f"Contract(s) '{contract.contract}' were not satisfied{extra_info}" 

765 ) 

766 

767 taskDefs = sorted(taskDefs, key=lambda x: x.label) 

768 yield from pipeTools.orderPipeline(taskDefs) 

769 
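# A usage sketch for expanding a pipeline into ordered TaskDefs (illustrative;
# expansion applies instrument and config overrides and checks contracts):
#
#     for task_def in pipeline.toExpandedPipeline():  # or simply: for task_def in pipeline
#         print(task_def.label, task_def.taskName)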

770 def _buildTaskDef(self, label: str) -> TaskDef: 

771 if (taskIR := self._pipelineIR.tasks.get(label)) is None: 

772 raise NameError(f"Label {label} does not appear in this pipeline") 

773 taskClass: Type[PipelineTask] = doImportType(taskIR.klass) 

774 taskName = get_full_type_name(taskClass) 

775 config = taskClass.ConfigClass() 

776 overrides = ConfigOverrides() 

777 if self._pipelineIR.instrument is not None: 

778 overrides.addInstrumentOverride(self._pipelineIR.instrument, taskClass._DefaultName) 

779 if taskIR.config is not None: 

780 for configIR in (configIr.formatted(self._pipelineIR.parameters) for configIr in taskIR.config): 

781 if configIR.dataId is not None: 

782 raise NotImplementedError( 

783 "Specializing a config on a partial data id is not yet " 

784 "supported in Pipeline definition" 

785 ) 

786 # only apply override if it applies to everything 

787 if configIR.dataId is None: 

788 if configIR.file: 

789 for configFile in configIR.file: 

790 overrides.addFileOverride(os.path.expandvars(configFile)) 

791 if configIR.python is not None: 

792 overrides.addPythonOverride(configIR.python) 

793 for key, value in configIR.rest.items(): 

794 overrides.addValueOverride(key, value) 

795 overrides.applyTo(config) 

796 return TaskDef(taskName=taskName, config=config, taskClass=taskClass, label=label) 

797 

798 def __iter__(self) -> Generator[TaskDef, None, None]: 

799 return self.toExpandedPipeline() 

800 

801 def __getitem__(self, item: str) -> TaskDef: 

802 return self._buildTaskDef(item) 

803 

804 def __len__(self) -> int: 

805 return len(self._pipelineIR.tasks) 

806 

807 def __eq__(self, other: object) -> bool: 

808 if not isinstance(other, Pipeline): 

809 return False 

810 elif self._pipelineIR == other._pipelineIR: 

811 # Shortcut: if the IR is the same, the expanded pipeline must be 

812 # the same as well. But the converse is not true. 

813 return True 

814 else: 

815 self_expanded = {td.label: (td.taskClass,) for td in self} 

816 other_expanded = {td.label: (td.taskClass,) for td in other} 

817 if self_expanded != other_expanded: 

818 return False 

819 # After DM-27847, we should compare configuration here, or better, 

820 # delegate to TaskDef.__eq__ after making that compare configurations. 

821 raise NotImplementedError( 

822 "Pipelines cannot be compared because config instances cannot be compared; see DM-27847." 

823 ) 

824 

825 

826@dataclass(frozen=True) 

827class TaskDatasetTypes: 

828 """An immutable struct that extracts and classifies the dataset types used 

829 by a `PipelineTask` 

830 """ 

831 

832 initInputs: NamedValueSet[DatasetType] 

833 """Dataset types that are needed as inputs in order to construct this Task. 

834 

835 Task-level `initInputs` may be classified as either 

836 `~PipelineDatasetTypes.initInputs` or 

837 `~PipelineDatasetTypes.initIntermediates` at the Pipeline level. 

838 """ 

839 

840 initOutputs: NamedValueSet[DatasetType] 

841 """Dataset types that may be written after constructing this Task. 

842 

843 Task-level `initOutputs` may be classified as either 

844 `~PipelineDatasetTypes.initOutputs` or 

845 `~PipelineDatasetTypes.initIntermediates` at the Pipeline level. 

846 """ 

847 

848 inputs: NamedValueSet[DatasetType] 

849 """Dataset types that are regular inputs to this Task. 

850 

851 If an input dataset needed for a Quantum cannot be found in the input 

852 collection(s) or produced by another Task in the Pipeline, that Quantum 

853 (and all dependent Quanta) will not be produced. 

854 

855 Task-level `inputs` may be classified as either 

856 `~PipelineDatasetTypes.inputs` or `~PipelineDatasetTypes.intermediates` 

857 at the Pipeline level. 

858 """ 

859 

860 queryConstraints: NamedValueSet[DatasetType] 

861 """Regular inputs that should not be used as constraints on the initial 

862 QuantumGraph generation data ID query, according to their tasks 

863 (`NamedValueSet`). 

864 """ 

865 

866 prerequisites: NamedValueSet[DatasetType] 

867 """Dataset types that are prerequisite inputs to this Task. 

868 

869 Prerequisite inputs must exist in the input collection(s) before the 

870 pipeline is run, but do not constrain the graph - if a prerequisite is 

871 missing for a Quantum, `PrerequisiteMissingError` is raised. 

872 

873 Prerequisite inputs are not resolved until the second stage of 

874 QuantumGraph generation. 

875 """ 

876 

877 outputs: NamedValueSet[DatasetType] 

878 """Dataset types that are produced by this Task. 

879 

880 Task-level `outputs` may be classified as either 

881 `~PipelineDatasetTypes.outputs` or `~PipelineDatasetTypes.intermediates` 

882 at the Pipeline level. 

883 """ 

884 

885 @classmethod 

886 def fromTaskDef( 

887 cls, 

888 taskDef: TaskDef, 

889 *, 

890 registry: Registry, 

891 include_configs: bool = True, 

892 storage_class_mapping: Optional[Mapping[str, str]] = None, 

893 ) -> TaskDatasetTypes: 

894 """Extract and classify the dataset types from a single `PipelineTask`. 

895 

896 Parameters 

897 ---------- 

898 taskDef: `TaskDef` 

899 An instance of a `TaskDef` class for a particular `PipelineTask`. 

900 registry: `Registry` 

901 Registry used to construct normalized `DatasetType` objects and 

902 retrieve those that are incomplete. 

903 include_configs : `bool`, optional 

904 If `True` (default) include config dataset types as 

905 ``initOutputs``. 

906 storage_class_mapping : `Mapping` of `str` to `StorageClass`, optional 

907 If a taskdef contains a component dataset type that is unknown 

908 to the registry, its parent StorageClass will be looked up in this 

909 mapping if it is supplied. If the mapping does not contain the 

910 composite dataset type, or the mapping is not supplied, an exception 

911 will be raised. 

912 

913 Returns 

914 ------- 

915 types: `TaskDatasetTypes` 

916 The dataset types used by this task. 

917 

918 Raises 

919 ------ 

920 ValueError 

921 Raised if dataset type connection definition differs from 

922 registry definition. 

923 LookupError 

924 Raised if component parent StorageClass could not be determined 

925 and storage_class_mapping does not contain the composite type, or 

926 is set to None. 

927 """ 

928 

929 def makeDatasetTypesSet( 

930 connectionType: str, 

931 is_input: bool, 

932 freeze: bool = True, 

933 ) -> NamedValueSet[DatasetType]: 

934 """Constructs a set of true `DatasetType` objects 

935 

936 Parameters 

937 ---------- 

938 connectionType : `str` 

939 Name of the connection type to produce a set for, corresponds 

940 to an attribute of type `list` on the connection class instance 

941 is_input : `bool` 

942 If `True`, these are input dataset types; otherwise they are 

943 output dataset types. 

944 freeze : `bool`, optional 

945 If `True`, call `NamedValueSet.freeze` on the object returned. 

946 

947 Returns 

948 ------- 

949 datasetTypes : `NamedValueSet` 

950 A set of all datasetTypes which correspond to the input 

951 connection type specified in the connection class of this 

952 `PipelineTask` 

953 

954 Raises 

955 ------ 

956 ValueError 

957 Raised if dataset type connection definition differs from 

958 registry definition. 

959 LookupError 

960 Raised if component parent StorageClass could not be determined 

961 and storage_class_mapping does not contain the composite type, 

962 or is set to None. 

963 

964 Notes 

965 ----- 

966 This function is a closure over the variables ``registry``, 

967 ``taskDef``, and ``storage_class_mapping``. 

968 """ 

969 datasetTypes = NamedValueSet[DatasetType]() 

970 for c in iterConnections(taskDef.connections, connectionType): 

971 dimensions = set(getattr(c, "dimensions", set())) 

972 if "skypix" in dimensions: 

973 try: 

974 datasetType = registry.getDatasetType(c.name) 

975 except LookupError as err: 

976 raise LookupError( 

977 f"DatasetType '{c.name}' referenced by " 

978 f"{type(taskDef.connections).__name__} uses 'skypix' as a dimension " 

979 "placeholder, but does not already exist in the registry. " 

980 "Note that reference catalog names are now used as the dataset " 

981 "type name instead of 'ref_cat'." 

982 ) from err 

983 rest1 = set(registry.dimensions.extract(dimensions - set(["skypix"])).names) 

984 rest2 = set( 

985 dim.name for dim in datasetType.dimensions if not isinstance(dim, SkyPixDimension) 

986 ) 

987 if rest1 != rest2: 

988 raise ValueError( 

989 f"Non-skypix dimensions for dataset type {c.name} declared in " 

990 f"connections ({rest1}) are inconsistent with those in " 

991 f"registry's version of this dataset ({rest2})." 

992 ) 

993 else: 

994 # Component dataset types are not explicitly in the 

995 # registry. This complicates consistency checks with 

996 # registry and requires we work out the composite storage 

997 # class. 

998 registryDatasetType = None 

999 try: 

1000 registryDatasetType = registry.getDatasetType(c.name) 

1001 except KeyError: 

1002 compositeName, componentName = DatasetType.splitDatasetTypeName(c.name) 

1003 if componentName: 

1004 if storage_class_mapping is None or compositeName not in storage_class_mapping: 

1005 raise LookupError( 

1006 "Component parent class cannot be determined, and " 

1007 "composite name was not in storage class mapping, or no " 

1008 "storage_class_mapping was supplied" 

1009 ) 

1010 else: 

1011 parentStorageClass = storage_class_mapping[compositeName] 

1012 else: 

1013 parentStorageClass = None 

1014 datasetType = c.makeDatasetType( 

1015 registry.dimensions, parentStorageClass=parentStorageClass 

1016 ) 

1017 registryDatasetType = datasetType 

1018 else: 

1019 datasetType = c.makeDatasetType( 

1020 registry.dimensions, parentStorageClass=registryDatasetType.parentStorageClass 

1021 ) 

1022 

1023 if registryDatasetType and datasetType != registryDatasetType: 

1024 # The dataset types differ but first check to see if 

1025 # they are compatible before raising. 

1026 if is_input: 

1027 # This DatasetType must be compatible on get. 

1028 is_compatible = datasetType.is_compatible_with(registryDatasetType) 

1029 else: 

1030 # Has to be able to be converted to the expected type 

1031 # on put. 

1032 is_compatible = registryDatasetType.is_compatible_with(datasetType) 

1033 if is_compatible: 

1034 # For inputs we want the pipeline to use the 

1035 # pipeline definition, for outputs it should use 

1036 # the registry definition. 

1037 if not is_input: 

1038 datasetType = registryDatasetType 

1039 _LOG.debug( 

1040 "Dataset types differ (task %s != registry %s) but are compatible" 

1041 " for %s in %s.", 

1042 datasetType, 

1043 registryDatasetType, 

1044 "input" if is_input else "output", 

1045 taskDef.label, 

1046 ) 

1047 else: 

1048 try: 

1049 # Explicitly check for storage class just to 

1050 # make more specific message. 

1051 _ = datasetType.storageClass 

1052 except KeyError: 

1053 raise ValueError( 

1054 "Storage class does not exist for supplied dataset type " 

1055 f"{datasetType} for {taskDef.label}." 

1056 ) from None 

1057 raise ValueError( 

1058 f"Supplied dataset type ({datasetType}) inconsistent with " 

1059 f"registry definition ({registryDatasetType}) " 

1060 f"for {taskDef.label}." 

1061 ) 

1062 datasetTypes.add(datasetType) 

1063 if freeze: 

1064 datasetTypes.freeze() 

1065 return datasetTypes 

1066 

1067 # optionally add initOutput dataset for config 

1068 initOutputs = makeDatasetTypesSet("initOutputs", is_input=False, freeze=False) 

1069 if include_configs: 

1070 initOutputs.add( 

1071 DatasetType( 

1072 taskDef.configDatasetName, 

1073 registry.dimensions.empty, 

1074 storageClass="Config", 

1075 ) 

1076 ) 

1077 initOutputs.freeze() 

1078 

1079 # optionally add output dataset for metadata 

1080 outputs = makeDatasetTypesSet("outputs", is_input=False, freeze=False) 

1081 if taskDef.metadataDatasetName is not None: 

1082 # Metadata is supposed to be of the TaskMetadata type, its 

1083 # dimensions correspond to a task quantum. 

1084 dimensions = registry.dimensions.extract(taskDef.connections.dimensions) 

1085 

1086 # Allow the storage class definition to be read from the existing 

1087 # dataset type definition if present. 

1088 try: 

1089 current = registry.getDatasetType(taskDef.metadataDatasetName) 

1090 except KeyError: 

1091 # No previous definition so use the default. 

1092 storageClass = "TaskMetadata" if _TASK_METADATA_TYPE is TaskMetadata else "PropertySet" 

1093 else: 

1094 storageClass = current.storageClass.name 

1095 

1096 outputs.update({DatasetType(taskDef.metadataDatasetName, dimensions, storageClass)}) 

1097 if taskDef.logOutputDatasetName is not None: 

1098 # Log output dimensions correspond to a task quantum. 

1099 dimensions = registry.dimensions.extract(taskDef.connections.dimensions) 

1100 outputs.update({DatasetType(taskDef.logOutputDatasetName, dimensions, "ButlerLogRecords")}) 

1101 

1102 outputs.freeze() 

1103 

1104 inputs = makeDatasetTypesSet("inputs", is_input=True) 

1105 queryConstraints = NamedValueSet( 

1106 inputs[c.name] 

1107 for c in cast(Iterable[Input], iterConnections(taskDef.connections, "inputs")) 

1108 if not c.deferGraphConstraint 

1109 ) 

1110 

1111 return cls( 

1112 initInputs=makeDatasetTypesSet("initInputs", is_input=True), 

1113 initOutputs=initOutputs, 

1114 inputs=inputs, 

1115 queryConstraints=queryConstraints, 

1116 prerequisites=makeDatasetTypesSet("prerequisiteInputs", is_input=True), 

1117 outputs=outputs, 

1118 ) 

1119 
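# A usage sketch for TaskDatasetTypes.fromTaskDef (illustrative; the butler
# repository path is hypothetical and a TaskDef such as the ``task_def``
# sketched earlier is assumed):
#
#     from lsst.daf.butler import Butler
#
#     butler = Butler("/path/to/repo")
#     types = TaskDatasetTypes.fromTaskDef(task_def, registry=butler.registry)
#     print(sorted(types.inputs.names), sorted(types.outputs.names))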

1120 

1121@dataclass(frozen=True) 

1122class PipelineDatasetTypes: 

1123 """An immutable struct that classifies the dataset types used in a 

1124 `Pipeline`. 

1125 """ 

1126 

1127 packagesDatasetName: ClassVar[str] = "packages" 

1128 """Name of a dataset type used to save package versions. 

1129 """ 

1130 

1131 initInputs: NamedValueSet[DatasetType] 

1132 """Dataset types that are needed as inputs in order to construct the Tasks 

1133 in this Pipeline. 

1134 

1135 This does not include dataset types that are produced when constructing 

1136 other Tasks in the Pipeline (these are classified as `initIntermediates`). 

1137 """ 

1138 

1139 initOutputs: NamedValueSet[DatasetType] 

1140 """Dataset types that may be written after constructing the Tasks in this 

1141 Pipeline. 

1142 

1143 This does not include dataset types that are also used as inputs when 

1144 constructing other Tasks in the Pipeline (these are classified as 

1145 `initIntermediates`). 

1146 """ 

1147 

1148 initIntermediates: NamedValueSet[DatasetType] 

1149 """Dataset types that are both used when constructing one or more Tasks 

1150 in the Pipeline and produced as a side-effect of constructing another 

1151 Task in the Pipeline. 

1152 """ 

1153 

1154 inputs: NamedValueSet[DatasetType] 

1155 """Dataset types that are regular inputs for the full pipeline. 

1156 

1157 If an input dataset needed for a Quantum cannot be found in the input 

1158 collection(s), that Quantum (and all dependent Quanta) will not be 

1159 produced. 

1160 """ 

1161 

1162 queryConstraints: NamedValueSet[DatasetType] 

1163 """Regular inputs that should be used as constraints on the initial 

1164 QuantumGraph generation data ID query, according to their tasks 

1165 (`NamedValueSet`). 

1166 """ 

1167 

1168 prerequisites: NamedValueSet[DatasetType] 

1169 """Dataset types that are prerequisite inputs for the full Pipeline. 

1170 

1171 Prerequisite inputs must exist in the input collection(s) before the 

1172 pipeline is run, but do not constrain the graph - if a prerequisite is 

1173 missing for a Quantum, `PrerequisiteMissingError` is raised. 

1174 

1175 Prerequisite inputs are not resolved until the second stage of 

1176 QuantumGraph generation. 

1177 """ 

1178 

1179 intermediates: NamedValueSet[DatasetType] 

1180 """Dataset types that are output by one Task in the Pipeline and consumed 

1181 as inputs by one or more other Tasks in the Pipeline. 

1182 """ 

1183 

1184 outputs: NamedValueSet[DatasetType] 

1185 """Dataset types that are output by a Task in the Pipeline and not consumed 

1186 by any other Task in the Pipeline. 

1187 """ 

1188 

1189 byTask: Mapping[str, TaskDatasetTypes] 

1190 """Per-Task dataset types, keyed by label in the `Pipeline`. 

1191 

1192 This is guaranteed to be zip-iterable with the `Pipeline` itself (assuming 

1193 neither has been modified since the dataset types were extracted, of 

1194 course). 

1195 """ 

1196 

1197 @classmethod 

1198 def fromPipeline( 

1199 cls, 

1200 pipeline: Union[Pipeline, Iterable[TaskDef]], 

1201 *, 

1202 registry: Registry, 

1203 include_configs: bool = True, 

1204 include_packages: bool = True, 

1205 ) -> PipelineDatasetTypes: 

1206 """Extract and classify the dataset types from all tasks in a 

1207 `Pipeline`. 

1208 

1209 Parameters 

1210 ---------- 

1211 pipeline: `Pipeline` or `Iterable` [ `TaskDef` ] 

1212 A collection of tasks that can be run together. 

1213 registry: `Registry` 

1214 Registry used to construct normalized `DatasetType` objects and 

1215 retrieve those that are incomplete. 

1216 include_configs : `bool`, optional 

1217 If `True` (default) include config dataset types as 

1218 ``initOutputs``. 

1219 include_packages : `bool`, optional 

1220 If `True` (default) include the dataset type for software package 

1221 versions in ``initOutputs``. 

1222 

1223 Returns 

1224 ------- 

1225 types: `PipelineDatasetTypes` 

1226 The dataset types used by this `Pipeline`. 

1227 

1228 Raises 

1229 ------ 

1230 ValueError 

1231 Raised if Tasks are inconsistent about which datasets are marked 

1232 prerequisite. This indicates that the Tasks cannot be run as part 

1233 of the same `Pipeline`. 

1234 """ 

1235 allInputs = NamedValueSet[DatasetType]() 

1236 allOutputs = NamedValueSet[DatasetType]() 

1237 allInitInputs = NamedValueSet[DatasetType]() 

1238 allInitOutputs = NamedValueSet[DatasetType]() 

1239 prerequisites = NamedValueSet[DatasetType]() 

1240 queryConstraints = NamedValueSet[DatasetType]() 

1241 byTask = dict() 

1242 if include_packages: 

1243 allInitOutputs.add( 

1244 DatasetType( 

1245 cls.packagesDatasetName, 

1246 registry.dimensions.empty, 

1247 storageClass="Packages", 

1248 ) 

1249 ) 

1250 # create a list of TaskDefs in case the input is a generator 

1251 pipeline = list(pipeline) 

1252 

1253 # collect all the output dataset types 

1254 typeStorageclassMap: Dict[str, str] = {} 

1255 for taskDef in pipeline: 

1256 for outConnection in iterConnections(taskDef.connections, "outputs"): 

1257 typeStorageclassMap[outConnection.name] = outConnection.storageClass 

1258 

1259 for taskDef in pipeline: 

1260 thisTask = TaskDatasetTypes.fromTaskDef( 

1261 taskDef, 

1262 registry=registry, 

1263 include_configs=include_configs, 

1264 storage_class_mapping=typeStorageclassMap, 

1265 ) 

1266 allInitInputs.update(thisTask.initInputs) 

1267 allInitOutputs.update(thisTask.initOutputs) 

1268 allInputs.update(thisTask.inputs) 

1269 # Inputs are query constraints if any task considers them a query 

1270 # constraint. 

1271 queryConstraints.update(thisTask.queryConstraints) 

1272 prerequisites.update(thisTask.prerequisites) 

1273 allOutputs.update(thisTask.outputs) 

1274 byTask[taskDef.label] = thisTask 

1275 if not prerequisites.isdisjoint(allInputs): 

1276 raise ValueError( 

1277 "{} marked as both prerequisites and regular inputs".format( 

1278 {dt.name for dt in allInputs & prerequisites} 

1279 ) 

1280 ) 

1281 if not prerequisites.isdisjoint(allOutputs): 

1282 raise ValueError( 

1283 "{} marked as both prerequisites and outputs".format( 

1284 {dt.name for dt in allOutputs & prerequisites} 

1285 ) 

1286 ) 

1287 # Make sure that components which are marked as inputs get treated as 

1288 # intermediates if there is an output which produces the composite 

1289 # containing the component 

1290 intermediateComponents = NamedValueSet[DatasetType]() 

1291 intermediateComposites = NamedValueSet[DatasetType]() 

1292 outputNameMapping = {dsType.name: dsType for dsType in allOutputs} 

1293 for dsType in allInputs: 

1294 # get the name of a possible component 

1295 name, component = dsType.nameAndComponent() 

1296 # if there is a component name, that means this is a component 

1297 # DatasetType, if there is an output which produces the parent of 

1298 # this component, treat this input as an intermediate 

1299 if component is not None: 

1300 # This needs to be in this if block, because someone might have 

1301 # a composite that is a pure input from existing data 

1302 if name in outputNameMapping: 

1303 intermediateComponents.add(dsType) 

1304 intermediateComposites.add(outputNameMapping[name]) 

1305 

1306 def checkConsistency(a: NamedValueSet, b: NamedValueSet) -> None: 

1307 common = a.names & b.names 

1308 for name in common: 

1309 # Any compatibility is allowed. This function does not know 

1310 # if a dataset type is to be used for input or output. 

1311 if not (a[name].is_compatible_with(b[name]) or b[name].is_compatible_with(a[name])): 

1312 raise ValueError(f"Conflicting definitions for dataset type: {a[name]} != {b[name]}.") 

1313 

1314 checkConsistency(allInitInputs, allInitOutputs) 

1315 checkConsistency(allInputs, allOutputs) 

1316 checkConsistency(allInputs, intermediateComposites) 

1317 checkConsistency(allOutputs, intermediateComposites) 

1318 

1319 def frozen(s: AbstractSet[DatasetType]) -> NamedValueSet[DatasetType]: 

1320 assert isinstance(s, NamedValueSet) 

1321 s.freeze() 

1322 return s 

1323 

1324 inputs = frozen(allInputs - allOutputs - intermediateComponents) 

1325 

1326 return cls( 

1327 initInputs=frozen(allInitInputs - allInitOutputs), 

1328 initIntermediates=frozen(allInitInputs & allInitOutputs), 

1329 initOutputs=frozen(allInitOutputs - allInitInputs), 

1330 inputs=inputs, 

1331 queryConstraints=frozen(queryConstraints & inputs), 

1332 # If there are storage class differences in inputs and outputs 

1333 # the intermediates have to choose priority. Here choose that 

1334 # inputs to tasks must match the requested storage class by 

1335 # applying the inputs over the top of the outputs. 

1336 intermediates=frozen(allOutputs & allInputs | intermediateComponents), 

1337 outputs=frozen(allOutputs - allInputs - intermediateComposites), 

1338 prerequisites=frozen(prerequisites), 

1339 byTask=MappingProxyType(byTask), # MappingProxyType -> frozen view of dict for immutability 

1340 ) 

1341 
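# A usage sketch for classifying dataset types across a whole pipeline
# (illustrative; assumes ``pipeline`` and ``butler`` from the earlier sketches):
#
#     dataset_types = PipelineDatasetTypes.fromPipeline(pipeline, registry=butler.registry)
#     print(dataset_types.inputs.names)         # overall inputs
#     print(dataset_types.intermediates.names)  # produced and consumed internally
#     print(dataset_types.byTask["exampleTask"].outputs.names)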

1342 @classmethod 

1343 def initOutputNames( 

1344 cls, 

1345 pipeline: Union[Pipeline, Iterable[TaskDef]], 

1346 *, 

1347 include_configs: bool = True, 

1348 include_packages: bool = True, 

1349 ) -> Iterator[str]: 

1350 """Return the names of dataset types ot task initOutputs, Configs, 

1351 and package versions for a pipeline. 

1352 

1353 Parameters 

1354 ---------- 

1355 pipeline: `Pipeline` or `Iterable` [ `TaskDef` ] 

1356 A `Pipeline` instance or collection of `TaskDef` instances. 

1357 include_configs : `bool`, optional 

1358 If `True` (default) include config dataset types. 

1359 include_packages : `bool`, optional 

1360 If `True` (default) include the dataset type for package versions. 

1361 

1362 Yields 

1363 ------ 

1364 datasetTypeName : `str` 

1365 Name of the dataset type. 

1366 """ 

1367 if include_packages: 

1368 # Package versions dataset type 

1369 yield cls.packagesDatasetName 

1370 

1371 if isinstance(pipeline, Pipeline): 

1372 pipeline = pipeline.toExpandedPipeline() 

1373 

1374 for taskDef in pipeline: 

1375 # all task InitOutputs 

1376 for name in taskDef.connections.initOutputs: 

1377 attribute = getattr(taskDef.connections, name) 

1378 yield attribute.name 

1379 

1380 # config dataset name 

1381 if include_configs: 

1382 yield taskDef.configDatasetName
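# A usage sketch for initOutputNames, which needs no registry (illustrative;
# assumes ``pipeline`` from the earlier sketches):
#
#     for name in PipelineDatasetTypes.initOutputNames(pipeline):
#         print(name)  # e.g. "packages" and each task's "<label>_config"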