Coverage for python/lsst/pipe/base/pipeline.py: 19%

442 statements  

coverage.py v6.5.0, created at 2023-02-01 02:07 -0800

1# This file is part of pipe_base. 

2# 

3# Developed for the LSST Data Management System. 

4# This product includes software developed by the LSST Project 

5# (http://www.lsst.org). 

6# See the COPYRIGHT file at the top-level directory of this distribution 

7# for details of code ownership. 

8# 

9# This program is free software: you can redistribute it and/or modify 

10# it under the terms of the GNU General Public License as published by 

11# the Free Software Foundation, either version 3 of the License, or 

12# (at your option) any later version. 

13# 

14# This program is distributed in the hope that it will be useful, 

15# but WITHOUT ANY WARRANTY; without even the implied warranty of 

16# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the 

17# GNU General Public License for more details. 

18# 

19# You should have received a copy of the GNU General Public License 

20# along with this program. If not, see <http://www.gnu.org/licenses/>. 

21from __future__ import annotations 

22 

23"""Module defining Pipeline class and related methods. 

24""" 

25 

26__all__ = ["Pipeline", "TaskDef", "TaskDatasetTypes", "PipelineDatasetTypes", "LabelSpecifier"] 

27 

28import copy 

29import logging 

30import os 

31import re 

32import urllib.parse 

33import warnings 

34 

35# ------------------------------- 

36# Imports of standard modules -- 

37# ------------------------------- 

38from dataclasses import dataclass 

39from types import MappingProxyType 

40from typing import ( 

41 TYPE_CHECKING, 

42 AbstractSet, 

43 Callable, 

44 ClassVar, 

45 Dict, 

46 Generator, 

47 Iterable, 

48 Iterator, 

49 Mapping, 

50 Optional, 

51 Set, 

52 Tuple, 

53 Type, 

54 Union, 

55 cast, 

56) 

57 

58# ----------------------------- 

59# Imports for other modules -- 

60from lsst.daf.butler import DatasetType, NamedValueSet, Registry, SkyPixDimension 

61from lsst.resources import ResourcePath, ResourcePathExpression 

62from lsst.utils import doImportType 

63from lsst.utils.introspection import get_full_type_name 

64 

65from . import pipelineIR, pipeTools 

66from ._task_metadata import TaskMetadata 

67from .config import PipelineTaskConfig 

68from .configOverrides import ConfigOverrides 

69from .connections import iterConnections 

70from .pipelineTask import PipelineTask 

71from .task import _TASK_METADATA_TYPE 

72 

73if TYPE_CHECKING: # Imports needed only for type annotations; may be circular.

74 from lsst.obs.base import Instrument 

75 from lsst.pex.config import Config 

76 

77# ---------------------------------- 

78# Local non-exported definitions -- 

79# ---------------------------------- 

80 

81_LOG = logging.getLogger(__name__) 

82 

83# ------------------------ 

84# Exported definitions -- 

85# ------------------------ 

86 

87 

88@dataclass 

89class LabelSpecifier: 

90 """A structure to specify a subset of labels to load 

91 

92 This structure may contain a set of labels to be used in subsetting a 

93 pipeline, or a beginning and end point. Beginning or end may be empty, 

94 in which case the range will be a half open interval. Unlike python 

95 iteration bounds, end bounds are *INCLUDED*. Note that range based 

96 selection is not well defined for pipelines that are not linear in nature, 

97 and correct behavior is not guaranteed, or may vary from run to run. 
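
Examples
--------
A minimal sketch of the two supported forms; the task labels used here
are purely illustrative and not part of this module:

>>> spec = LabelSpecifier(labels={"isr", "characterizeImage"})
>>> spec = LabelSpecifier(begin="isr", end="calibrate")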

98 """ 

99 

100 labels: Optional[Set[str]] = None 

101 begin: Optional[str] = None 

102 end: Optional[str] = None 

103 

104 def __post_init__(self) -> None: 

105 if self.labels is not None and (self.begin or self.end): 

106 raise ValueError( 

107 "This struct can only be initialized with a labels set or a begin (and/or) end specifier" 

108 ) 

109 

110 

111class TaskDef: 

112 """TaskDef is a collection of information about a task needed by a Pipeline.

113 

114 The information includes task name, configuration object and optional 

115 task class. This class is just a collection of attributes and it exposes 

116 all of them so that attributes could potentially be modified in place 

117 (e.g. if configuration needs extra overrides). 

118 

119 Attributes 

120 ---------- 

121 taskName : `str`, optional 

122 The fully-qualified `PipelineTask` class name. If not provided, 

123 ``taskClass`` must be. 

124 config : `lsst.pipe.base.config.PipelineTaskConfig`, optional 

125 Instance of the configuration class corresponding to this task class, 

126 usually with all overrides applied. This config will be frozen. If 

127 not provided, ``taskClass`` must be provided and 

128 ``taskClass.ConfigClass()`` will be used. 

129 taskClass : `type`, optional 

130 `PipelineTask` class object; if provided and ``taskName`` is as well, 

131 the caller guarantees that they are consistent. If not provided, 

132 ``taskName`` is used to import the type. 

133 label : `str`, optional 

134 Task label, usually a short string unique in a pipeline. If not 

135 provided, ``taskClass`` must be, and ``taskClass._DefaultName`` will 

136 be used. 
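
Examples
--------
A hedged sketch; ``MyTask`` stands in for any concrete `PipelineTask`
subclass and is not defined in this module:

>>> taskDef = TaskDef(taskClass=MyTask, label="myTask")
>>> taskDef.configDatasetName
'myTask_config'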

137 """ 

138 

139 def __init__( 

140 self, 

141 taskName: Optional[str] = None, 

142 config: Optional[PipelineTaskConfig] = None, 

143 taskClass: Optional[Type[PipelineTask]] = None, 

144 label: Optional[str] = None, 

145 ): 

146 if taskName is None: 

147 if taskClass is None: 

148 raise ValueError("At least one of `taskName` and `taskClass` must be provided.") 

149 taskName = get_full_type_name(taskClass) 

150 elif taskClass is None: 

151 taskClass = doImportType(taskName) 

152 if config is None: 

153 if taskClass is None: 

154 raise ValueError("`taskClass` must be provided if `config` is not.") 

155 config = taskClass.ConfigClass() 

156 if label is None: 

157 if taskClass is None: 

158 raise ValueError("`taskClass` must be provided if `label` is not.") 

159 label = taskClass._DefaultName 

160 self.taskName = taskName 

161 try: 

162 config.validate() 

163 except Exception: 

164 _LOG.error("Configuration validation failed for task %s (%s)", label, taskName) 

165 raise 

166 config.freeze() 

167 self.config = config 

168 self.taskClass = taskClass 

169 self.label = label 

170 self.connections = config.connections.ConnectionsClass(config=config) 

171 

172 @property 

173 def configDatasetName(self) -> str: 

174 """Name of a dataset type for configuration of this task (`str`)""" 

175 return self.label + "_config" 

176 

177 @property 

178 def metadataDatasetName(self) -> Optional[str]: 

179 """Name of a dataset type for metadata of this task, `None` if 

180 metadata is not to be saved (`str`) 

181 """ 

182 if self.config.saveMetadata: 

183 return self.makeMetadataDatasetName(self.label) 

184 else: 

185 return None 

186 

187 @classmethod 

188 def makeMetadataDatasetName(cls, label: str) -> str: 

189 """Construct the name of the dataset type for metadata for a task. 

190 

191 Parameters 

192 ---------- 

193 label : `str` 

194 Label for the task within its pipeline. 

195 

196 Returns 

197 ------- 

198 name : `str` 

199 Name of the task's metadata dataset type. 
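
Examples
--------
With an illustrative label:

>>> TaskDef.makeMetadataDatasetName("isr")
'isr_metadata'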

200 """ 

201 return f"{label}_metadata" 

202 

203 @property 

204 def logOutputDatasetName(self) -> Optional[str]: 

205 """Name of a dataset type for log output from this task, `None` if 

206 logs are not to be saved (`str`) 

207 """ 

208 if cast(PipelineTaskConfig, self.config).saveLogOutput: 

209 return self.label + "_log" 

210 else: 

211 return None 

212 

213 def __str__(self) -> str: 

214 rep = "TaskDef(" + self.taskName 

215 if self.label: 

216 rep += ", label=" + self.label 

217 rep += ")" 

218 return rep 

219 

220 def __eq__(self, other: object) -> bool: 

221 if not isinstance(other, TaskDef): 

222 return False 

223 # This does not consider equality of configs when determining equality 

224 # as config equality is a difficult thing to define. Should be updated 

225 # after DM-27847 

226 return self.taskClass == other.taskClass and self.label == other.label 

227 

228 def __hash__(self) -> int: 

229 return hash((self.taskClass, self.label)) 

230 

231 @classmethod 

232 def _unreduce(cls, taskName: str, config: PipelineTaskConfig, label: str) -> TaskDef: 

233 """Custom callable for unpickling. 

234 

235 All arguments are forwarded directly to the constructor; this 

236 trampoline is only needed because ``__reduce__`` callables can't be 

237 called with keyword arguments. 

238 """ 

239 return cls(taskName=taskName, config=config, label=label) 

240 

241 def __reduce__(self) -> Tuple[Callable[[str, PipelineTaskConfig, str], TaskDef], Tuple[str, Config, str]]: 

242 return (self._unreduce, (self.taskName, self.config, self.label)) 

243 

244 

245class Pipeline: 

246 """A `Pipeline` is a representation of a series of tasks to run, and the 

247 configuration for those tasks. 

248 

249 Parameters 

250 ---------- 

251 description : `str` 

252 A description of what this pipeline does.

253 """ 

254 

255 def __init__(self, description: str): 

256 pipeline_dict = {"description": description, "tasks": {}} 

257 self._pipelineIR = pipelineIR.PipelineIR(pipeline_dict) 

258 

259 @classmethod 

260 def fromFile(cls, filename: str) -> Pipeline: 

261 """Load a pipeline defined in a pipeline yaml file. 

262 

263 Parameters 

264 ---------- 

265 filename: `str` 

266 A path that points to a pipeline defined in yaml format. This 

267 filename may also supply additional labels to be used in 

268 subsetting the loaded Pipeline. These labels are separated from 

269 the path by a \\#, and may be specified as a comma separated 

270 list, or a range denoted as beginning..end. Beginning or end may 

271 be empty, in which case the range will be a half open interval. 

272 Unlike python iteration bounds, end bounds are *INCLUDED*. Note 

273 that range based selection is not well defined for pipelines that 

274 are not linear in nature, and correct behavior is not guaranteed, 

275 or may vary from run to run. 

276 

277 Returns 

278 ------- 

279 pipeline: `Pipeline` 

280 The pipeline loaded from the specified location with appropriate (if

281 any) subsetting.

282 

283 Notes 

284 ----- 

285 This method attempts to prune any contracts that contain labels which 

286 are not in the declared subset of labels. This pruning is done using

287 string-based matching due to the nature of contracts and may prune more

288 than it should. 
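
Examples
--------
A sketch; the file name and task labels are hypothetical:

>>> pipeline = Pipeline.fromFile("pipeline.yaml")
>>> subset = Pipeline.fromFile("pipeline.yaml#isr,calibrate")
>>> ranged = Pipeline.fromFile("pipeline.yaml#isr..calibrate")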

289 """ 

290 return cls.from_uri(filename) 

291 

292 @classmethod 

293 def from_uri(cls, uri: ResourcePathExpression) -> Pipeline: 

294 """Load a pipeline defined in a pipeline yaml file at a location 

295 specified by a URI. 

296 

297 Parameters 

298 ---------- 

299 uri: convertible to `ResourcePath` 

300 If a string is supplied this should be a URI path that points to a 

301 pipeline defined in yaml format, either as a direct path to the 

302 yaml file, or as a directory containing a "pipeline.yaml" file (the 

303 form used by `write_to_uri` with ``expand=True``). This uri may 

304 also supply additional labels to be used in subsetting the loaded 

305 Pipeline. These labels are separated from the path by a \\#, and 

306 may be specified as a comma separated list, or a range denoted as 

307 beginning..end. Beginning or end may be empty, in which case the 

308 range will be a half open interval. Unlike python iteration bounds, 

309 end bounds are *INCLUDED*. Note that range based selection is not 

310 well defined for pipelines that are not linear in nature, and 

311 correct behavior is not guaranteed, or may vary from run to run. 

312 The same specifiers can be used with a `ResourcePath` object, by 

313 being the sole contents of the fragment attribute.

314 

315 Returns 

316 ------- 

317 pipeline: `Pipeline` 

318 The pipeline loaded from the specified location with appropriate (if

319 any) subsetting.

320 

321 Notes 

322 ----- 

323 This method attempts to prune any contracts that contain labels which 

324 are not in the declared subset of labels. This pruning is done using

325 string-based matching due to the nature of contracts and may prune more

326 than it should. 

327 """ 

328 # Split up the uri and any labels that were supplied 

329 uri, label_specifier = cls._parse_file_specifier(uri) 

330 pipeline: Pipeline = cls.fromIR(pipelineIR.PipelineIR.from_uri(uri)) 

331 

332 # If there are labels supplied, only keep those 

333 if label_specifier is not None: 

334 pipeline = pipeline.subsetFromLabels(label_specifier) 

335 return pipeline 

336 

337 def subsetFromLabels(self, labelSpecifier: LabelSpecifier) -> Pipeline: 

338 """Subset a pipeline to contain only labels specified in labelSpecifier 

339 

340 Parameters 

341 ---------- 

342 labelSpecifier : `LabelSpecifier`

343 Object containing labels that describes how to subset a pipeline. 

344 

345 Returns 

346 ------- 

347 pipeline : `Pipeline` 

348 A new pipeline object that is a subset of the old pipeline 

349 

350 Raises 

351 ------ 

352 ValueError 

353 Raised if there is an issue with specified labels 

354 

355 Notes 

356 ----- 

357 This method attempts to prune any contracts that contain labels which 

358 are not in the declared subset of labels. This pruning is done using

359 string-based matching due to the nature of contracts and may prune more

360 than it should. 
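
Examples
--------
A sketch assuming ``pipeline`` is an existing `Pipeline`; the labels
are hypothetical:

>>> spec = LabelSpecifier(labels={"isr", "calibrate"})
>>> subset = pipeline.subsetFromLabels(spec)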

361 """ 

362 # Labels supplied as a set 

363 if labelSpecifier.labels: 

364 labelSet = labelSpecifier.labels 

365 # Labels supplied as a range, first create a list of all the labels 

366 # in the pipeline sorted according to task dependency. Then only 

367 # keep labels that lie between the supplied bounds 

368 else: 

369 # Create a copy of the pipeline to use when assessing the label 

370 # ordering. Use a dict for fast searching while preserving order. 

371 # Remove contracts so they do not fail in the expansion step. This 

372 # is needed because a user may only configure the tasks they intend 

373 # to run, which may cause some contracts to fail if they will later 

374 # be dropped 

375 pipeline = copy.deepcopy(self) 

376 pipeline._pipelineIR.contracts = [] 

377 labels = {taskdef.label: True for taskdef in pipeline.toExpandedPipeline()} 

378 

379 # Verify the bounds are in the labels 

380 if labelSpecifier.begin is not None: 

381 if labelSpecifier.begin not in labels: 

382 raise ValueError( 

383 f"Beginning of range subset, {labelSpecifier.begin}, not found in " 

384 "pipeline definition" 

385 ) 

386 if labelSpecifier.end is not None: 

387 if labelSpecifier.end not in labels: 

388 raise ValueError( 

389 f"End of range subset, {labelSpecifier.end}, not found in pipeline definition" 

390 ) 

391 

392 labelSet = set() 

393 for label in labels: 

394 if labelSpecifier.begin is not None: 

395 if label != labelSpecifier.begin: 

396 continue 

397 else: 

398 labelSpecifier.begin = None 

399 labelSet.add(label) 

400 if labelSpecifier.end is not None and label == labelSpecifier.end: 

401 break 

402 return Pipeline.fromIR(self._pipelineIR.subset_from_labels(labelSet)) 

403 

404 @staticmethod 

405 def _parse_file_specifier(uri: ResourcePathExpression) -> Tuple[ResourcePath, Optional[LabelSpecifier]]: 

406 """Split apart a URI and any possible label subsets."""

407 if isinstance(uri, str): 

408 # This is to support legacy pipelines during transition 

409 uri, num_replace = re.subn("[:](?!\\/\\/)", "#", uri) 

410 if num_replace: 

411 warnings.warn( 

412 f"The pipeline file {uri} seems to use the legacy : to separate " 

413 "labels, this is deprecated and will be removed after June 2021, please use " 

414 "# instead.", 

415 category=FutureWarning, 

416 ) 

417 if uri.count("#") > 1: 

418 raise ValueError("Only one set of labels is allowed when specifying a pipeline to load") 

419 # Everything else can be converted directly to ResourcePath. 

420 uri = ResourcePath(uri) 

421 label_subset = uri.fragment or None 

422 

423 specifier: Optional[LabelSpecifier] 

424 if label_subset is not None: 

425 label_subset = urllib.parse.unquote(label_subset) 

426 args: Dict[str, Union[Set[str], str, None]] 

427 # labels supplied as a list 

428 if "," in label_subset: 

429 if ".." in label_subset: 

430 raise ValueError( 

431 "Can only specify a list of labels or a range when loading a Pipeline, not both"

432 ) 

433 args = {"labels": set(label_subset.split(","))} 

434 # labels supplied as a range 

435 elif ".." in label_subset: 

436 # Try to de-structure the labelSubset, this will fail if more 

437 # than one range is specified 

438 begin, end, *rest = label_subset.split("..") 

439 if rest: 

440 raise ValueError("Only one range can be specified when loading a pipeline") 

441 args = {"begin": begin if begin else None, "end": end if end else None} 

442 # Assume anything else is a single label 

443 else: 

444 args = {"labels": {label_subset}} 

445 

446 # MyPy doesn't like how cavalier kwarg construction is with types. 

447 specifier = LabelSpecifier(**args) # type: ignore 

448 else: 

449 specifier = None 

450 

451 return uri, specifier 

452 

453 @classmethod 

454 def fromString(cls, pipeline_string: str) -> Pipeline: 

455 """Create a pipeline from string formatted as a pipeline document. 

456 

457 Parameters 

458 ---------- 

459 pipeline_string : `str` 

460 A string that is formatted like a pipeline document.

461 

462 Returns 

463 ------- 

464 pipeline: `Pipeline` 
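
Examples
--------
A round-trip sketch, assuming ``pipeline`` is an existing `Pipeline`
(its string form is the pipeline document):

>>> copy_of_pipeline = Pipeline.fromString(str(pipeline))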

465 """ 

466 pipeline = cls.fromIR(pipelineIR.PipelineIR.from_string(pipeline_string)) 

467 return pipeline 

468 

469 @classmethod 

470 def fromIR(cls, deserialized_pipeline: pipelineIR.PipelineIR) -> Pipeline: 

471 """Create a pipeline from an already created `PipelineIR` object. 

472 

473 Parameters 

474 ---------- 

475 deserialized_pipeline: `PipelineIR` 

476 An already created pipeline intermediate representation object 

477 

478 Returns 

479 ------- 

480 pipeline: `Pipeline` 

481 """ 

482 pipeline = cls.__new__(cls) 

483 pipeline._pipelineIR = deserialized_pipeline 

484 return pipeline 

485 

486 @classmethod 

487 def fromPipeline(cls, pipeline: Pipeline) -> Pipeline: 

488 """Create a new pipeline by copying an already existing `Pipeline`. 

489 

490 Parameters 

491 ---------- 

492 pipeline: `Pipeline` 

493 An already created `Pipeline` object to be copied.

494 

495 Returns 

496 ------- 

497 pipeline: `Pipeline` 

498 """ 

499 return cls.fromIR(copy.deepcopy(pipeline._pipelineIR)) 

500 

501 def __str__(self) -> str: 

502 return str(self._pipelineIR) 

503 

504 def mergePipeline(self, pipeline: Pipeline) -> None: 

505 """Merge another in-memory `Pipeline` object into this one. 

506 

507 This merges another pipeline into this object, as if it were declared 

508 in the import block of the yaml definition of this pipeline. This 

509 modifies this pipeline in place. 

510 

511 Parameters 

512 ---------- 

513 pipeline : `Pipeline` 

514 The `Pipeline` object that is to be merged into this object. 
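
Examples
--------
A sketch assuming ``pipeline`` and ``other`` are existing `Pipeline`
objects:

>>> pipeline.mergePipeline(other)  # modifies ``pipeline`` in place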

515 """ 

516 self._pipelineIR.merge_pipelines((pipeline._pipelineIR,)) 

517 

518 def addLabelToSubset(self, subset: str, label: str) -> None: 

519 """Add a task label to the specified subset.

520 

521 Parameters 

522 ---------- 

523 subset : `str` 

524 The labeled subset to modify 

525 label : `str` 

526 The task label to add to the specified subset. 

527 

528 Raises 

529 ------ 

530 ValueError 

531 Raised if the specified subset does not exist within the pipeline. 

532 Raised if the specified label does not exist within the pipeline. 

533 """ 

534 if label not in self._pipelineIR.tasks: 

535 raise ValueError(f"Label {label} does not appear within the pipeline") 

536 if subset not in self._pipelineIR.labeled_subsets: 

537 raise ValueError(f"Subset {subset} does not appear within the pipeline") 

538 self._pipelineIR.labeled_subsets[subset].subset.add(label) 

539 

540 def removeLabelFromSubset(self, subset: str, label: str) -> None: 

541 """Remove a task label from the specified subset. 

542 

543 Parameters 

544 ---------- 

545 subset : `str` 

546 The labeled subset to modify 

547 label : `str` 

548 The task label to remove from the specified subset. 

549 

550 Raises 

551 ------ 

552 ValueError 

553 Raised if the specified subset does not exist in the pipeline. 

554 Raised if the specified label does not exist within the specified 

555 subset. 

556 """ 

557 if subset not in self._pipelineIR.labeled_subsets: 

558 raise ValueError(f"Subset {subset} does not appear within the pipeline") 

559 if label not in self._pipelineIR.labeled_subsets[subset].subset: 

560 raise ValueError(f"Label {label} does not appear within the pipeline") 

561 self._pipelineIR.labeled_subsets[subset].subset.remove(label) 

562 

563 def findSubsetsWithLabel(self, label: str) -> set[str]: 

564 """Find any subsets which may contain the specified label. 

565 

566 This function returns the names of subsets which contain the specified

567 label. May return an empty set if there are no subsets, or no subsets

568 contain the specified label.

569 

570 Parameters 

571 ---------- 

572 label : `str` 

573 The task label to use in membership check 

574 

575 Returns 

576 ------- 

577 subsets : `set` of `str` 

578 Returns a set (possibly empty) of subset names which contain the

579 specified label. 

580 

581 Raises 

582 ------ 

583 ValueError 

584 Raised if the specified label does not exist within this pipeline. 

585 """ 

586 results = set() 

587 if label not in self._pipelineIR.tasks: 

588 raise ValueError(f"Label {label} does not appear within the pipeline") 

589 for subset in self._pipelineIR.labeled_subsets.values(): 

590 if label in subset.subset: 

591 results.add(subset.label) 

592 return results 

593 

594 def addInstrument(self, instrument: Union[Instrument, str]) -> None: 

595 """Add an instrument to the pipeline, or replace an instrument that is 

596 already defined. 

597 

598 Parameters 

599 ---------- 

600 instrument : `~lsst.obs.base.Instrument` or `str`

601 Either a derived class object of a `lsst.obs.base.Instrument` or

602 a string corresponding to a fully qualified

603 `lsst.obs.base.Instrument` name.
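
Examples
--------
A sketch assuming ``pipeline`` is an existing `Pipeline`; the
instrument class and its module are hypothetical:

>>> pipeline.addInstrument("lsst.obs.example.ExampleInstrument")
>>> pipeline.addInstrument(ExampleInstrument())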

604 """ 

605 if isinstance(instrument, str): 

606 pass 

607 else: 

608 # TODO: assume that this is a subclass of Instrument, no type 

609 # checking 

610 instrument = get_full_type_name(instrument) 

611 self._pipelineIR.instrument = instrument 

612 

613 def getInstrument(self) -> Optional[str]: 

614 """Get the instrument from the pipeline. 

615 

616 Returns 

617 ------- 

618 instrument : `str`, or None 

619 The fully qualified name of a `lsst.obs.base.Instrument` subclass,

620 or `None` if the pipeline does not have an instrument.

621 """ 

622 return self._pipelineIR.instrument 

623 

624 def addTask(self, task: Union[Type[PipelineTask], str], label: str) -> None: 

625 """Add a new task to the pipeline, or replace a task that is already 

626 associated with the supplied label. 

627 

628 Parameters 

629 ---------- 

630 task: `PipelineTask` or `str` 

631 Either a derived class object of a `PipelineTask` or a string 

632 corresponding to a fully qualified `PipelineTask` name. 

633 label: `str` 

634 A label that is used to identify the `PipelineTask` being added 
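
Examples
--------
A sketch; the module path, label, and ``MyTask`` class are
illustrative and not part of this package:

>>> pipeline = Pipeline("demo")
>>> pipeline.addTask("mypackage.tasks.MyTask", "myTask")
>>> pipeline.addTask(MyTask, "anotherLabel")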

635 """ 

636 if isinstance(task, str): 

637 taskName = task 

638 elif issubclass(task, PipelineTask): 

639 taskName = get_full_type_name(task) 

640 else: 

641 raise ValueError( 

642 "task must be either a child class of PipelineTask or a string containing" 

643 " a fully qualified name to one" 

644 ) 

645 if not label: 

646 # in some cases (with command line-generated pipeline) tasks can 

647 # be defined without a label, which is not acceptable; use the task's

648 # _DefaultName in that case

649 if isinstance(task, str): 

650 task_class = doImportType(task) 

651 label = task_class._DefaultName 

652 self._pipelineIR.tasks[label] = pipelineIR.TaskIR(label, taskName) 

653 

654 def removeTask(self, label: str) -> None: 

655 """Remove a task from the pipeline. 

656 

657 Parameters 

658 ---------- 

659 label : `str` 

660 The label used to identify the task that is to be removed 

661 

662 Raises 

663 ------ 

664 KeyError 

665 Raised if no task with that label exists in the pipeline.

666 

667 """ 

668 self._pipelineIR.tasks.pop(label) 

669 

670 def addConfigOverride(self, label: str, key: str, value: object) -> None: 

671 """Apply single config override. 

672 

673 Parameters 

674 ---------- 

675 label : `str` 

676 Label of the task. 

677 key: `str` 

678 Fully-qualified field name. 

679 value : object 

680 Value to be given to a field. 
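
Examples
--------
A sketch assuming ``pipeline`` is an existing `Pipeline`; the label
and config field are hypothetical:

>>> pipeline.addConfigOverride("isr", "doFlat", False)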

681 """ 

682 self._addConfigImpl(label, pipelineIR.ConfigIR(rest={key: value})) 

683 

684 def addConfigFile(self, label: str, filename: str) -> None: 

685 """Add overrides from a specified file. 

686 

687 Parameters 

688 ---------- 

689 label : `str` 

690 The label used to identify the task associated with the config to

691 modify.

692 filename : `str` 

693 Path to the override file. 

694 """ 

695 self._addConfigImpl(label, pipelineIR.ConfigIR(file=[filename])) 

696 

697 def addConfigPython(self, label: str, pythonString: str) -> None: 

698 """Add Overrides by running a snippet of python code against a config. 

699 

700 Parameters 

701 ---------- 

702 label : `str` 

703 The label used to identify the task associated with the config to

704 modify. 

705 pythonString: `str` 

706 A string which is valid python code to be executed. This is done 

707 with config as the only local accessible value. 

708 """ 

709 self._addConfigImpl(label, pipelineIR.ConfigIR(python=pythonString)) 

710 

711 def _addConfigImpl(self, label: str, newConfig: pipelineIR.ConfigIR) -> None: 

712 if label == "parameters": 

713 if newConfig.rest.keys() - self._pipelineIR.parameters.mapping.keys(): 

714 raise ValueError("Cannot override parameters that are not defined in pipeline") 

715 self._pipelineIR.parameters.mapping.update(newConfig.rest) 

716 if newConfig.file: 

717 raise ValueError("Setting parameters section with config file is not supported") 

718 if newConfig.python: 

719 raise ValueError("Setting parameters section using python block is unsupported")

720 return 

721 if label not in self._pipelineIR.tasks: 

722 raise LookupError(f"There are no tasks labeled '{label}' in the pipeline") 

723 self._pipelineIR.tasks[label].add_or_update_config(newConfig) 

724 

725 def write_to_uri(self, uri: ResourcePathExpression) -> None: 

726 """Write the pipeline to a file or directory. 

727 

728 Parameters 

729 ---------- 

730 uri : convertible to `ResourcePath` 

731 URI to write to; may have any scheme with `ResourcePath` write 

732 support or no scheme for a local file/directory. Should have a 

733 ``.yaml`` extension.
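
Examples
--------
A sketch with an illustrative local path, assuming ``pipeline`` is an
existing `Pipeline`:

>>> pipeline.write_to_uri("my_pipeline.yaml")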

734 """ 

735 self._pipelineIR.write_to_uri(uri) 

736 

737 def toExpandedPipeline(self) -> Generator[TaskDef, None, None]: 

738 """Returns a generator of TaskDefs which can be used to create quantum 

739 graphs. 

740 

741 Returns 

742 ------- 

743 generator : generator of `TaskDef` 

744 The generator returned will be the sorted iterator of tasks which 

745 are to be used in constructing a quantum graph. 

746 

747 Raises 

748 ------ 

749 NotImplementedError 

750 Raised if a dataId is supplied in a config block. This is in place

751 for future use.
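
Examples
--------
A sketch assuming ``pipeline`` is an existing `Pipeline`:

>>> for taskDef in pipeline.toExpandedPipeline():
...     print(taskDef.label, taskDef.taskName)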

752 """ 

753 taskDefs = [] 

754 for label in self._pipelineIR.tasks: 

755 taskDefs.append(self._buildTaskDef(label)) 

756 

757 # let's evaluate the contracts

758 if self._pipelineIR.contracts is not None: 

759 label_to_config = {x.label: x.config for x in taskDefs} 

760 for contract in self._pipelineIR.contracts: 

761 # execute this on its own line so it can raise a good error

762 # message if there were problems with the eval

763 success = eval(contract.contract, None, label_to_config) 

764 if not success: 

765 extra_info = f": {contract.msg}" if contract.msg is not None else "" 

766 raise pipelineIR.ContractError( 

767 f"Contract(s) '{contract.contract}' were not satisfied{extra_info}" 

768 ) 

769 

770 taskDefs = sorted(taskDefs, key=lambda x: x.label) 

771 yield from pipeTools.orderPipeline(taskDefs) 

772 

773 def _buildTaskDef(self, label: str) -> TaskDef: 

774 if (taskIR := self._pipelineIR.tasks.get(label)) is None: 

775 raise NameError(f"Label {label} does not appear in this pipeline") 

776 taskClass: Type[PipelineTask] = doImportType(taskIR.klass) 

777 taskName = get_full_type_name(taskClass) 

778 config = taskClass.ConfigClass() 

779 overrides = ConfigOverrides() 

780 if self._pipelineIR.instrument is not None: 

781 overrides.addInstrumentOverride(self._pipelineIR.instrument, taskClass._DefaultName) 

782 if taskIR.config is not None: 

783 for configIR in (configIr.formatted(self._pipelineIR.parameters) for configIr in taskIR.config): 

784 if configIR.dataId is not None: 

785 raise NotImplementedError( 

786 "Specializing a config on a partial data id is not yet " 

787 "supported in Pipeline definition" 

788 ) 

789 # only apply override if it applies to everything 

790 if configIR.dataId is None: 

791 if configIR.file: 

792 for configFile in configIR.file: 

793 overrides.addFileOverride(os.path.expandvars(configFile)) 

794 if configIR.python is not None: 

795 overrides.addPythonOverride(configIR.python) 

796 for key, value in configIR.rest.items(): 

797 overrides.addValueOverride(key, value) 

798 overrides.applyTo(config) 

799 return TaskDef(taskName=taskName, config=config, taskClass=taskClass, label=label) 

800 

801 def __iter__(self) -> Generator[TaskDef, None, None]: 

802 return self.toExpandedPipeline() 

803 

804 def __getitem__(self, item: str) -> TaskDef: 

805 return self._buildTaskDef(item) 

806 

807 def __len__(self) -> int: 

808 return len(self._pipelineIR.tasks) 

809 

810 def __eq__(self, other: object) -> bool: 

811 if not isinstance(other, Pipeline): 

812 return False 

813 elif self._pipelineIR == other._pipelineIR: 

814 # Shortcut: if the IR is the same, the expanded pipeline must be 

815 # the same as well. But the converse is not true. 

816 return True 

817 else: 

818 self_expanded = {td.label: (td.taskClass,) for td in self} 

819 other_expanded = {td.label: (td.taskClass,) for td in other} 

820 if self_expanded != other_expanded: 

821 return False 

822 # After DM-27847, we should compare configuration here, or better, 

823 # delegate to TaskDef.__eq__ after making that compare configurations.

824 raise NotImplementedError( 

825 "Pipelines cannot be compared because config instances cannot be compared; see DM-27847." 

826 ) 

827 

828 

829@dataclass(frozen=True) 

830class TaskDatasetTypes: 

831 """An immutable struct that extracts and classifies the dataset types used 

832 by a `PipelineTask` 

833 """ 

834 

835 initInputs: NamedValueSet[DatasetType] 

836 """Dataset types that are needed as inputs in order to construct this Task. 

837 

838 Task-level `initInputs` may be classified as either 

839 `~PipelineDatasetTypes.initInputs` or 

840 `~PipelineDatasetTypes.initIntermediates` at the Pipeline level. 

841 """ 

842 

843 initOutputs: NamedValueSet[DatasetType] 

844 """Dataset types that may be written after constructing this Task. 

845 

846 Task-level `initOutputs` may be classified as either 

847 `~PipelineDatasetTypes.initOutputs` or 

848 `~PipelineDatasetTypes.initIntermediates` at the Pipeline level. 

849 """ 

850 

851 inputs: NamedValueSet[DatasetType] 

852 """Dataset types that are regular inputs to this Task. 

853 

854 If an input dataset needed for a Quantum cannot be found in the input 

855 collection(s) or produced by another Task in the Pipeline, that Quantum 

856 (and all dependent Quanta) will not be produced. 

857 

858 Task-level `inputs` may be classified as either 

859 `~PipelineDatasetTypes.inputs` or `~PipelineDatasetTypes.intermediates` 

860 at the Pipeline level. 

861 """ 

862 

863 prerequisites: NamedValueSet[DatasetType] 

864 """Dataset types that are prerequisite inputs to this Task. 

865 

866 Prerequisite inputs must exist in the input collection(s) before the 

867 pipeline is run, but do not constrain the graph - if a prerequisite is 

868 missing for a Quantum, `PrerequisiteMissingError` is raised. 

869 

870 Prerequisite inputs are not resolved until the second stage of 

871 QuantumGraph generation. 

872 """ 

873 

874 outputs: NamedValueSet[DatasetType] 

875 """Dataset types that are produced by this Task. 

876 

877 Task-level `outputs` may be classified as either 

878 `~PipelineDatasetTypes.outputs` or `~PipelineDatasetTypes.intermediates` 

879 at the Pipeline level. 

880 """ 

881 

882 @classmethod 

883 def fromTaskDef( 

884 cls, 

885 taskDef: TaskDef, 

886 *, 

887 registry: Registry, 

888 include_configs: bool = True, 

889 storage_class_mapping: Optional[Mapping[str, str]] = None, 

890 ) -> TaskDatasetTypes: 

891 """Extract and classify the dataset types from a single `PipelineTask`. 

892 

893 Parameters 

894 ---------- 

895 taskDef: `TaskDef` 

896 An instance of a `TaskDef` class for a particular `PipelineTask`. 

897 registry: `Registry` 

898 Registry used to construct normalized `DatasetType` objects and 

899 retrieve those that are incomplete. 

900 include_configs : `bool`, optional 

901 If `True` (default) include config dataset types as 

902 ``initOutputs``. 

903 storage_class_mapping : `Mapping` of `str` to `StorageClass`, optional 

904 If a taskdef contains a component dataset type that is unknown 

905 to the registry, its parent StorageClass will be looked up in this 

906 mapping if it is supplied. If the mapping does not contain the 

907 composite dataset type, or the mapping is not supplied, an exception

908 will be raised. 

909 

910 Returns 

911 ------- 

912 types: `TaskDatasetTypes` 

913 The dataset types used by this task. 

914 

915 Raises 

916 ------ 

917 ValueError 

918 Raised if dataset type connection definition differs from 

919 registry definition. 

920 LookupError 

921 Raised if component parent StorageClass could not be determined 

922 and storage_class_mapping does not contain the composite type, or 

923 is set to None. 
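
Examples
--------
A sketch assuming ``registry`` is an existing
`lsst.daf.butler.Registry` and ``taskDef`` comes from an expanded
pipeline:

>>> types = TaskDatasetTypes.fromTaskDef(taskDef, registry=registry)
>>> sorted(types.outputs.names)  # doctest: +SKIP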

924 """ 

925 

926 def makeDatasetTypesSet( 

927 connectionType: str, 

928 is_input: bool, 

929 freeze: bool = True, 

930 ) -> NamedValueSet[DatasetType]: 

931 """Constructs a set of true `DatasetType` objects 

932 

933 Parameters 

934 ---------- 

935 connectionType : `str` 

936 Name of the connection type to produce a set for, corresponds 

937 to an attribute of type `list` on the connection class instance 

938 is_input : `bool` 

939 If `True`, these are input dataset types; otherwise they are output

940 dataset types.

941 freeze : `bool`, optional 

942 If `True`, call `NamedValueSet.freeze` on the object returned. 

943 

944 Returns 

945 ------- 

946 datasetTypes : `NamedValueSet` 

947 A set of all datasetTypes which correspond to the

948 connection type specified in the connection class of this 

949 `PipelineTask` 

950 

951 Raises 

952 ------ 

953 ValueError 

954 Raised if dataset type connection definition differs from 

955 registry definition. 

956 LookupError 

957 Raised if component parent StorageClass could not be determined 

958 and storage_class_mapping does not contain the composite type, 

959 or is set to None. 

960 

961 Notes 

962 ----- 

963 This function is a closure over the variables ``registry``,

964 ``taskDef``, and ``storage_class_mapping``.

965 """ 

966 datasetTypes = NamedValueSet[DatasetType]() 

967 for c in iterConnections(taskDef.connections, connectionType): 

968 dimensions = set(getattr(c, "dimensions", set())) 

969 if "skypix" in dimensions: 

970 try: 

971 datasetType = registry.getDatasetType(c.name) 

972 except LookupError as err: 

973 raise LookupError( 

974 f"DatasetType '{c.name}' referenced by " 

975 f"{type(taskDef.connections).__name__} uses 'skypix' as a dimension " 

976 f"placeholder, but does not already exist in the registry. " 

977 f"Note that reference catalog names are now used as the dataset " 

978 f"type name instead of 'ref_cat'." 

979 ) from err 

980 rest1 = set(registry.dimensions.extract(dimensions - set(["skypix"])).names) 

981 rest2 = set( 

982 dim.name for dim in datasetType.dimensions if not isinstance(dim, SkyPixDimension) 

983 ) 

984 if rest1 != rest2: 

985 raise ValueError( 

986 f"Non-skypix dimensions for dataset type {c.name} declared in " 

987 f"connections ({rest1}) are inconsistent with those in " 

988 f"registry's version of this dataset ({rest2})." 

989 ) 

990 else: 

991 # Component dataset types are not explicitly in the 

992 # registry. This complicates consistency checks with 

993 # registry and requires we work out the composite storage 

994 # class. 

995 registryDatasetType = None 

996 try: 

997 registryDatasetType = registry.getDatasetType(c.name) 

998 except KeyError: 

999 compositeName, componentName = DatasetType.splitDatasetTypeName(c.name) 

1000 if componentName: 

1001 if storage_class_mapping is None or compositeName not in storage_class_mapping: 

1002 raise LookupError( 

1003 "Component parent class cannot be determined, and " 

1004 "composite name was not in storage class mapping, or no " 

1005 "storage_class_mapping was supplied" 

1006 ) 

1007 else: 

1008 parentStorageClass = storage_class_mapping[compositeName] 

1009 else: 

1010 parentStorageClass = None 

1011 datasetType = c.makeDatasetType( 

1012 registry.dimensions, parentStorageClass=parentStorageClass 

1013 ) 

1014 registryDatasetType = datasetType 

1015 else: 

1016 datasetType = c.makeDatasetType( 

1017 registry.dimensions, parentStorageClass=registryDatasetType.parentStorageClass 

1018 ) 

1019 

1020 if registryDatasetType and datasetType != registryDatasetType: 

1021 # The dataset types differ but first check to see if 

1022 # they are compatible before raising. 

1023 if is_input: 

1024 # This DatasetType must be compatible on get. 

1025 is_compatible = datasetType.is_compatible_with(registryDatasetType) 

1026 else: 

1027 # Has to be able to be converted to the expected type

1028 # on put. 

1029 is_compatible = registryDatasetType.is_compatible_with(datasetType) 

1030 if is_compatible: 

1031 # For inputs we want the pipeline to use the 

1032 # pipeline definition, for outputs it should use 

1033 # the registry definition. 

1034 if not is_input: 

1035 datasetType = registryDatasetType 

1036 _LOG.debug( 

1037 "Dataset types differ (task %s != registry %s) but are compatible" 

1038 " for %s in %s.", 

1039 datasetType, 

1040 registryDatasetType, 

1041 "input" if is_input else "output", 

1042 taskDef.label, 

1043 ) 

1044 else: 

1045 try: 

1046 # Explicitly check for storage class just to 

1047 # make more specific message. 

1048 _ = datasetType.storageClass 

1049 except KeyError: 

1050 raise ValueError( 

1051 "Storage class does not exist for supplied dataset type " 

1052 f"{datasetType} for {taskDef.label}." 

1053 ) from None 

1054 raise ValueError( 

1055 f"Supplied dataset type ({datasetType}) inconsistent with " 

1056 f"registry definition ({registryDatasetType}) " 

1057 f"for {taskDef.label}." 

1058 ) 

1059 datasetTypes.add(datasetType) 

1060 if freeze: 

1061 datasetTypes.freeze() 

1062 return datasetTypes 

1063 

1064 # optionally add initOutput dataset for config 

1065 initOutputs = makeDatasetTypesSet("initOutputs", is_input=False, freeze=False) 

1066 if include_configs: 

1067 initOutputs.add( 

1068 DatasetType( 

1069 taskDef.configDatasetName, 

1070 registry.dimensions.empty, 

1071 storageClass="Config", 

1072 ) 

1073 ) 

1074 initOutputs.freeze() 

1075 

1076 # optionally add output dataset for metadata 

1077 outputs = makeDatasetTypesSet("outputs", is_input=False, freeze=False) 

1078 if taskDef.metadataDatasetName is not None: 

1079 # Metadata is supposed to be of the TaskMetadata type, its 

1080 # dimensions correspond to a task quantum. 

1081 dimensions = registry.dimensions.extract(taskDef.connections.dimensions) 

1082 

1083 # Allow the storage class definition to be read from the existing 

1084 # dataset type definition if present. 

1085 try: 

1086 current = registry.getDatasetType(taskDef.metadataDatasetName) 

1087 except KeyError: 

1088 # No previous definition so use the default. 

1089 storageClass = "TaskMetadata" if _TASK_METADATA_TYPE is TaskMetadata else "PropertySet" 

1090 else: 

1091 storageClass = current.storageClass.name 

1092 

1093 outputs.update({DatasetType(taskDef.metadataDatasetName, dimensions, storageClass)}) 

1094 if taskDef.logOutputDatasetName is not None: 

1095 # Log output dimensions correspond to a task quantum. 

1096 dimensions = registry.dimensions.extract(taskDef.connections.dimensions) 

1097 outputs.update({DatasetType(taskDef.logOutputDatasetName, dimensions, "ButlerLogRecords")}) 

1098 

1099 outputs.freeze() 

1100 

1101 return cls( 

1102 initInputs=makeDatasetTypesSet("initInputs", is_input=True), 

1103 initOutputs=initOutputs, 

1104 inputs=makeDatasetTypesSet("inputs", is_input=True), 

1105 prerequisites=makeDatasetTypesSet("prerequisiteInputs", is_input=True), 

1106 outputs=outputs, 

1107 ) 

1108 

1109 

1110@dataclass(frozen=True) 

1111class PipelineDatasetTypes: 

1112 """An immutable struct that classifies the dataset types used in a 

1113 `Pipeline`. 

1114 """ 

1115 

1116 packagesDatasetName: ClassVar[str] = "packages" 

1117 """Name of a dataset type used to save package versions. 

1118 """ 

1119 

1120 initInputs: NamedValueSet[DatasetType] 

1121 """Dataset types that are needed as inputs in order to construct the Tasks 

1122 in this Pipeline. 

1123 

1124 This does not include dataset types that are produced when constructing 

1125 other Tasks in the Pipeline (these are classified as `initIntermediates`). 

1126 """ 

1127 

1128 initOutputs: NamedValueSet[DatasetType] 

1129 """Dataset types that may be written after constructing the Tasks in this 

1130 Pipeline. 

1131 

1132 This does not include dataset types that are also used as inputs when 

1133 constructing other Tasks in the Pipeline (these are classified as 

1134 `initIntermediates`). 

1135 """ 

1136 

1137 initIntermediates: NamedValueSet[DatasetType] 

1138 """Dataset types that are both used when constructing one or more Tasks 

1139 in the Pipeline and produced as a side-effect of constructing another 

1140 Task in the Pipeline. 

1141 """ 

1142 

1143 inputs: NamedValueSet[DatasetType] 

1144 """Dataset types that are regular inputs for the full pipeline. 

1145 

1146 If an input dataset needed for a Quantum cannot be found in the input 

1147 collection(s), that Quantum (and all dependent Quanta) will not be 

1148 produced. 

1149 """ 

1150 

1151 prerequisites: NamedValueSet[DatasetType] 

1152 """Dataset types that are prerequisite inputs for the full Pipeline. 

1153 

1154 Prerequisite inputs must exist in the input collection(s) before the 

1155 pipeline is run, but do not constrain the graph - if a prerequisite is 

1156 missing for a Quantum, `PrerequisiteMissingError` is raised. 

1157 

1158 Prerequisite inputs are not resolved until the second stage of 

1159 QuantumGraph generation. 

1160 """ 

1161 

1162 intermediates: NamedValueSet[DatasetType] 

1163 """Dataset types that are output by one Task in the Pipeline and consumed 

1164 as inputs by one or more other Tasks in the Pipeline. 

1165 """ 

1166 

1167 outputs: NamedValueSet[DatasetType] 

1168 """Dataset types that are output by a Task in the Pipeline and not consumed 

1169 by any other Task in the Pipeline. 

1170 """ 

1171 

1172 byTask: Mapping[str, TaskDatasetTypes] 

1173 """Per-Task dataset types, keyed by label in the `Pipeline`. 

1174 

1175 This is guaranteed to be zip-iterable with the `Pipeline` itself (assuming 

1176 neither has been modified since the dataset types were extracted, of 

1177 course). 

1178 """ 

1179 

1180 @classmethod 

1181 def fromPipeline( 

1182 cls, 

1183 pipeline: Union[Pipeline, Iterable[TaskDef]], 

1184 *, 

1185 registry: Registry, 

1186 include_configs: bool = True, 

1187 include_packages: bool = True, 

1188 ) -> PipelineDatasetTypes: 

1189 """Extract and classify the dataset types from all tasks in a 

1190 `Pipeline`. 

1191 

1192 Parameters 

1193 ---------- 

1194 pipeline: `Pipeline` or `Iterable` [ `TaskDef` ] 

1195 A collection of tasks that can be run together. 

1196 registry: `Registry` 

1197 Registry used to construct normalized `DatasetType` objects and 

1198 retrieve those that are incomplete. 

1199 include_configs : `bool`, optional 

1200 If `True` (default) include config dataset types as 

1201 ``initOutputs``. 

1202 include_packages : `bool`, optional 

1203 If `True` (default) include the dataset type for software package 

1204 versions in ``initOutputs``. 

1205 

1206 Returns 

1207 ------- 

1208 types: `PipelineDatasetTypes` 

1209 The dataset types used by this `Pipeline`. 

1210 

1211 Raises 

1212 ------ 

1213 ValueError 

1214 Raised if Tasks are inconsistent about which datasets are marked 

1215 prerequisite. This indicates that the Tasks cannot be run as part 

1216 of the same `Pipeline`. 
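
Examples
--------
A sketch assuming ``pipeline`` is a `Pipeline` and ``registry`` is an
existing `lsst.daf.butler.Registry`:

>>> dataset_types = PipelineDatasetTypes.fromPipeline(pipeline, registry=registry)
>>> sorted(dataset_types.inputs.names)  # doctest: +SKIP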

1217 """ 

1218 allInputs = NamedValueSet[DatasetType]() 

1219 allOutputs = NamedValueSet[DatasetType]() 

1220 allInitInputs = NamedValueSet[DatasetType]() 

1221 allInitOutputs = NamedValueSet[DatasetType]() 

1222 prerequisites = NamedValueSet[DatasetType]() 

1223 byTask = dict() 

1224 if include_packages: 

1225 allInitOutputs.add( 

1226 DatasetType( 

1227 cls.packagesDatasetName, 

1228 registry.dimensions.empty, 

1229 storageClass="Packages", 

1230 ) 

1231 ) 

1232 # create a list of TaskDefs in case the input is a generator 

1233 pipeline = list(pipeline) 

1234 

1235 # collect all the output dataset types 

1236 typeStorageclassMap: Dict[str, str] = {} 

1237 for taskDef in pipeline: 

1238 for outConnection in iterConnections(taskDef.connections, "outputs"): 

1239 typeStorageclassMap[outConnection.name] = outConnection.storageClass 

1240 

1241 for taskDef in pipeline: 

1242 thisTask = TaskDatasetTypes.fromTaskDef( 

1243 taskDef, 

1244 registry=registry, 

1245 include_configs=include_configs, 

1246 storage_class_mapping=typeStorageclassMap, 

1247 ) 

1248 allInitInputs.update(thisTask.initInputs) 

1249 allInitOutputs.update(thisTask.initOutputs) 

1250 allInputs.update(thisTask.inputs) 

1251 prerequisites.update(thisTask.prerequisites) 

1252 allOutputs.update(thisTask.outputs) 

1253 byTask[taskDef.label] = thisTask 

1254 if not prerequisites.isdisjoint(allInputs): 

1255 raise ValueError( 

1256 "{} marked as both prerequisites and regular inputs".format( 

1257 {dt.name for dt in allInputs & prerequisites} 

1258 ) 

1259 ) 

1260 if not prerequisites.isdisjoint(allOutputs): 

1261 raise ValueError( 

1262 "{} marked as both prerequisites and outputs".format( 

1263 {dt.name for dt in allOutputs & prerequisites} 

1264 ) 

1265 ) 

1266 # Make sure that components which are marked as inputs get treated as 

1267 # intermediates if there is an output which produces the composite 

1268 # containing the component 

1269 intermediateComponents = NamedValueSet[DatasetType]() 

1270 intermediateComposites = NamedValueSet[DatasetType]() 

1271 outputNameMapping = {dsType.name: dsType for dsType in allOutputs} 

1272 for dsType in allInputs: 

1273 # get the name of a possible component 

1274 name, component = dsType.nameAndComponent() 

1275 # if there is a component name, that means this is a component 

1276 # DatasetType, if there is an output which produces the parent of 

1277 # this component, treat this input as an intermediate 

1278 if component is not None: 

1279 # This needs to be in this if block, because someone might have 

1280 # a composite that is a pure input from existing data 

1281 if name in outputNameMapping: 

1282 intermediateComponents.add(dsType) 

1283 intermediateComposites.add(outputNameMapping[name]) 

1284 

1285 def checkConsistency(a: NamedValueSet, b: NamedValueSet) -> None: 

1286 common = a.names & b.names 

1287 for name in common: 

1288 # Any compatibility is allowed. This function does not know 

1289 # if a dataset type is to be used for input or output. 

1290 if not (a[name].is_compatible_with(b[name]) or b[name].is_compatible_with(a[name])): 

1291 raise ValueError(f"Conflicting definitions for dataset type: {a[name]} != {b[name]}.") 

1292 

1293 checkConsistency(allInitInputs, allInitOutputs) 

1294 checkConsistency(allInputs, allOutputs) 

1295 checkConsistency(allInputs, intermediateComposites) 

1296 checkConsistency(allOutputs, intermediateComposites) 

1297 

1298 def frozen(s: AbstractSet[DatasetType]) -> NamedValueSet[DatasetType]: 

1299 assert isinstance(s, NamedValueSet) 

1300 s.freeze() 

1301 return s 

1302 

1303 return cls( 

1304 initInputs=frozen(allInitInputs - allInitOutputs), 

1305 initIntermediates=frozen(allInitInputs & allInitOutputs), 

1306 initOutputs=frozen(allInitOutputs - allInitInputs), 

1307 inputs=frozen(allInputs - allOutputs - intermediateComponents), 

1308 # If there are storage class differences in inputs and outputs 

1309 # the intermediates have to choose priority. Here choose that 

1311 # inputs to tasks must match the requested storage class by

1311 # applying the inputs over the top of the outputs. 

1312 intermediates=frozen(allOutputs & allInputs | intermediateComponents), 

1313 outputs=frozen(allOutputs - allInputs - intermediateComposites), 

1314 prerequisites=frozen(prerequisites), 

1315 byTask=MappingProxyType(byTask), # MappingProxyType -> frozen view of dict for immutability 

1316 ) 

1317 

1318 @classmethod 

1319 def initOutputNames( 

1320 cls, 

1321 pipeline: Union[Pipeline, Iterable[TaskDef]], 

1322 *, 

1323 include_configs: bool = True, 

1324 include_packages: bool = True, 

1325 ) -> Iterator[str]: 

1326 """Return the names of dataset types of task initOutputs, Configs,

1327 and package versions for a pipeline. 

1328 

1329 Parameters 

1330 ---------- 

1331 pipeline: `Pipeline` or `Iterable` [ `TaskDef` ] 

1332 A `Pipeline` instance or collection of `TaskDef` instances. 

1333 include_configs : `bool`, optional 

1334 If `True` (default) include config dataset types. 

1335 include_packages : `bool`, optional 

1336 If `True` (default) include the dataset type for package versions. 

1337 

1338 Yields 

1339 ------ 

1340 datasetTypeName : `str` 

1341 Name of the dataset type. 
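
Examples
--------
A sketch assuming ``pipeline`` is an existing `Pipeline`; the package
versions dataset type is always yielded first by default:

>>> names = list(PipelineDatasetTypes.initOutputNames(pipeline))
>>> "packages" in names
True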

1342 """ 

1343 if include_packages: 

1344 # Package versions dataset type 

1345 yield cls.packagesDatasetName 

1346 

1347 if isinstance(pipeline, Pipeline): 

1348 pipeline = pipeline.toExpandedPipeline() 

1349 

1350 for taskDef in pipeline: 

1351 

1352 # all task InitOutputs 

1353 for name in taskDef.connections.initOutputs: 

1354 attribute = getattr(taskDef.connections, name) 

1355 yield attribute.name 

1356 

1357 # config dataset name 

1358 if include_configs: 

1359 yield taskDef.configDatasetName