Coverage for python/lsst/pipe/base/pipeline.py: 19%


1# This file is part of pipe_base. 

2# 

3# Developed for the LSST Data Management System. 

4# This product includes software developed by the LSST Project 

5# (http://www.lsst.org). 

6# See the COPYRIGHT file at the top-level directory of this distribution 

7# for details of code ownership. 

8# 

9# This program is free software: you can redistribute it and/or modify 

10# it under the terms of the GNU General Public License as published by 

11# the Free Software Foundation, either version 3 of the License, or 

12# (at your option) any later version. 

13# 

14# This program is distributed in the hope that it will be useful, 

15# but WITHOUT ANY WARRANTY; without even the implied warranty of 

16# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the 

17# GNU General Public License for more details. 

18# 

19# You should have received a copy of the GNU General Public License 

20# along with this program. If not, see <http://www.gnu.org/licenses/>. 

21from __future__ import annotations 

22 

23"""Module defining Pipeline class and related methods. 

24""" 

25 

26__all__ = ["Pipeline", "TaskDef", "TaskDatasetTypes", "PipelineDatasetTypes", "LabelSpecifier"] 

27 

28import copy 

29import logging 

30import os 

31import re 

32import urllib.parse 

33import warnings 

34 

35# ------------------------------- 

36# Imports of standard modules -- 

37# ------------------------------- 

38from dataclasses import dataclass 

39from types import MappingProxyType 

40from typing import ( 

41 TYPE_CHECKING, 

42 AbstractSet, 

43 Callable, 

44 ClassVar, 

45 Dict, 

46 Generator, 

47 Iterable, 

48 Iterator, 

49 Mapping, 

50 Optional, 

51 Set, 

52 Tuple, 

53 Type, 

54 Union, 

55 cast, 

56) 

57 

58# ----------------------------- 

59# Imports for other modules -- 

60from lsst.daf.butler import DatasetType, NamedValueSet, Registry, SkyPixDimension 

61from lsst.resources import ResourcePath, ResourcePathExpression 

62from lsst.utils import doImportType 

63from lsst.utils.introspection import get_full_type_name 

64 

65from . import pipelineIR, pipeTools 

66from ._task_metadata import TaskMetadata 

67from .config import PipelineTaskConfig 

68from .configOverrides import ConfigOverrides 

69from .connections import iterConnections 

70from .pipelineTask import PipelineTask 

71from .task import _TASK_METADATA_TYPE 

72 

73if TYPE_CHECKING:  # Imports needed only for type annotations; may be circular.

74 from lsst.obs.base import Instrument 

75 from lsst.pex.config import Config 

76 

77# ---------------------------------- 

78# Local non-exported definitions -- 

79# ---------------------------------- 

80 

81_LOG = logging.getLogger(__name__) 

82 

83# ------------------------ 

84# Exported definitions -- 

85# ------------------------ 

86 

87 

88@dataclass 

89class LabelSpecifier: 

90 """A structure to specify a subset of labels to load 

91 

92 This structure may contain a set of labels to be used in subsetting a 

93 pipeline, or a beginning and end point. Beginning or end may be empty, 

94 in which case the range will be a half open interval. Unlike python 

95 iteration bounds, end bounds are *INCLUDED*. Note that range based 

96 selection is not well defined for pipelines that are not linear in nature, 

97 and correct behavior is not guaranteed, or may vary from run to run. 
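
Examples
--------
A minimal illustrative sketch; the task labels used here are
hypothetical::

    # Select an explicit set of labels.
    spec = LabelSpecifier(labels={"isr", "characterizeImage"})

    # Select an inclusive range of labels.
    spec = LabelSpecifier(begin="isr", end="calibrate")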

98 """ 

99 

100 labels: Optional[Set[str]] = None 

101 begin: Optional[str] = None 

102 end: Optional[str] = None 

103 

104 def __post_init__(self) -> None: 

105 if self.labels is not None and (self.begin or self.end): 

106 raise ValueError( 

107 "This struct can only be initialized with a labels set or a begin (and/or) end specifier" 

108 ) 

109 

110 

111class TaskDef: 

112 """TaskDef is a collection of information about task needed by Pipeline. 

113 

114 The information includes task name, configuration object and optional 

115 task class. This class is just a collection of attributes and it exposes 

116 all of them so that attributes could potentially be modified in place 

117 (e.g. if configuration needs extra overrides). 

118 

119 Attributes 

120 ---------- 

121 taskName : `str`, optional 

122 The fully-qualified `PipelineTask` class name. If not provided, 

123 ``taskClass`` must be. 

124 config : `lsst.pipe.base.config.PipelineTaskConfig`, optional 

125 Instance of the configuration class corresponding to this task class, 

126 usually with all overrides applied. This config will be frozen. If 

127 not provided, ``taskClass`` must be provided and 

128 ``taskClass.ConfigClass()`` will be used. 

129 taskClass : `type`, optional 

130 `PipelineTask` class object; if provided and ``taskName`` is as well, 

131 the caller guarantees that they are consistent. If not provided, 

132 ``taskName`` is used to import the type. 

133 label : `str`, optional 

134 Task label, usually a short string unique in a pipeline. If not 

135 provided, ``taskClass`` must be, and ``taskClass._DefaultName`` will 

136 be used. 
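
Examples
--------
An illustrative sketch; ``MyTask`` stands in for any concrete
`PipelineTask` subclass and is not defined in this module::

    taskDef = TaskDef(taskClass=MyTask, label="myTask")
    taskDef.configDatasetName  # "myTask_config"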

137 """ 

138 

139 def __init__( 

140 self, 

141 taskName: Optional[str] = None, 

142 config: Optional[PipelineTaskConfig] = None, 

143 taskClass: Optional[Type[PipelineTask]] = None, 

144 label: Optional[str] = None, 

145 ): 

146 if taskName is None: 

147 if taskClass is None: 

148 raise ValueError("At least one of `taskName` and `taskClass` must be provided.") 

149 taskName = get_full_type_name(taskClass) 

150 elif taskClass is None: 

151 taskClass = doImportType(taskName) 

152 if config is None: 

153 if taskClass is None: 

154 raise ValueError("`taskClass` must be provided if `config` is not.") 

155 config = taskClass.ConfigClass() 

156 if label is None: 

157 if taskClass is None: 

158 raise ValueError("`taskClass` must be provided if `label` is not.") 

159 label = taskClass._DefaultName 

160 self.taskName = taskName 

161 try: 

162 config.validate() 

163 except Exception: 

164 _LOG.error("Configuration validation failed for task %s (%s)", label, taskName) 

165 raise 

166 config.freeze() 

167 self.config = config 

168 self.taskClass = taskClass 

169 self.label = label 

170 self.connections = config.connections.ConnectionsClass(config=config) 

171 

172 @property 

173 def configDatasetName(self) -> str: 

174 """Name of a dataset type for configuration of this task (`str`)""" 

175 return self.label + "_config" 

176 

177 @property 

178 def metadataDatasetName(self) -> Optional[str]: 

179 """Name of a dataset type for metadata of this task, `None` if 

180 metadata is not to be saved (`str`) 

181 """ 

182 if self.config.saveMetadata: 

183 return self.makeMetadataDatasetName(self.label) 

184 else: 

185 return None 

186 

187 @classmethod 

188 def makeMetadataDatasetName(cls, label: str) -> str: 

189 """Construct the name of the dataset type for metadata for a task. 

190 

191 Parameters 

192 ---------- 

193 label : `str` 

194 Label for the task within its pipeline. 

195 

196 Returns 

197 ------- 

198 name : `str` 

199 Name of the task's metadata dataset type. 
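
Examples
--------
With a hypothetical task label ``isr``::

    TaskDef.makeMetadataDatasetName("isr")  # returns "isr_metadata"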

200 """ 

201 return f"{label}_metadata" 

202 

203 @property 

204 def logOutputDatasetName(self) -> Optional[str]: 

205 """Name of a dataset type for log output from this task, `None` if 

206 logs are not to be saved (`str`) 

207 """ 

208 if cast(PipelineTaskConfig, self.config).saveLogOutput: 

209 return self.label + "_log" 

210 else: 

211 return None 

212 

213 def __str__(self) -> str: 

214 rep = "TaskDef(" + self.taskName 

215 if self.label: 

216 rep += ", label=" + self.label 

217 rep += ")" 

218 return rep 

219 

220 def __eq__(self, other: object) -> bool: 

221 if not isinstance(other, TaskDef): 

222 return False 

223 # This does not consider equality of configs when determining equality 

224 # as config equality is a difficult thing to define. Should be updated 

225 # after DM-27847 

226 return self.taskClass == other.taskClass and self.label == other.label 

227 

228 def __hash__(self) -> int: 

229 return hash((self.taskClass, self.label)) 

230 

231 @classmethod 

232 def _unreduce(cls, taskName: str, config: PipelineTaskConfig, label: str) -> TaskDef: 

233 """Custom callable for unpickling. 

234 

235 All arguments are forwarded directly to the constructor; this 

236 trampoline is only needed because ``__reduce__`` callables can't be 

237 called with keyword arguments. 

238 """ 

239 return cls(taskName=taskName, config=config, label=label) 

240 

241 def __reduce__(self) -> Tuple[Callable[[str, PipelineTaskConfig, str], TaskDef], Tuple[str, Config, str]]: 

242 return (self._unreduce, (self.taskName, self.config, self.label)) 

243 

244 

245class Pipeline: 

246 """A `Pipeline` is a representation of a series of tasks to run, and the 

247 configuration for those tasks. 

248 

249 Parameters 

250 ---------- 

251 description : `str` 

252 A description of what this pipeline does.

253 """ 

254 

255 def __init__(self, description: str): 

256 pipeline_dict = {"description": description, "tasks": {}} 

257 self._pipelineIR = pipelineIR.PipelineIR(pipeline_dict) 

258 

259 @classmethod 

260 def fromFile(cls, filename: str) -> Pipeline: 

261 """Load a pipeline defined in a pipeline yaml file. 

262 

263 Parameters 

264 ---------- 

265 filename: `str` 

266 A path that points to a pipeline defined in yaml format. This 

267 filename may also supply additional labels to be used in 

268 subsetting the loaded Pipeline. These labels are separated from 

269 the path by a \\#, and may be specified as a comma separated 

270 list, or a range denoted as beginning..end. Beginning or end may 

271 be empty, in which case the range will be a half open interval. 

272 Unlike python iteration bounds, end bounds are *INCLUDED*. Note 

273 that range based selection is not well defined for pipelines that 

274 are not linear in nature, and correct behavior is not guaranteed, 

275 or may vary from run to run. 

276 

277 Returns 

278 ------- 

279 pipeline: `Pipeline` 

280 The pipeline loaded from specified location with appropriate (if 

281 any) subsetting 

282 

283 Notes 

284 ----- 

285 This method attempts to prune any contracts that contain labels which 

286 are not in the declared subset of labels. This pruning is done using a 

287 string based matching due to the nature of contracts and may prune more 

288 than it should. 
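
Examples
--------
Illustrative only; the file name and task labels are hypothetical::

    # Load the full pipeline.
    pipeline = Pipeline.fromFile("DRP.yaml")

    # Load only the two named tasks.
    pipeline = Pipeline.fromFile("DRP.yaml#isr,calibrate")

    # Load an inclusive range of tasks.
    pipeline = Pipeline.fromFile("DRP.yaml#isr..calibrate")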

289 """ 

290 return cls.from_uri(filename) 

291 

292 @classmethod 

293 def from_uri(cls, uri: ResourcePathExpression) -> Pipeline: 

294 """Load a pipeline defined in a pipeline yaml file at a location 

295 specified by a URI. 

296 

297 Parameters 

298 ---------- 

299 uri: convertible to `ResourcePath` 

300 If a string is supplied this should be a URI path that points to a 

301 pipeline defined in yaml format, either as a direct path to the 

302 yaml file, or as a directory containing a "pipeline.yaml" file (the 

303 form used by `write_to_uri` with ``expand=True``). This uri may 

304 also supply additional labels to be used in subsetting the loaded 

305 Pipeline. These labels are separated from the path by a \\#, and 

306 may be specified as a comma separated list, or a range denoted as 

307 beginning..end. Beginning or end may be empty, in which case the 

308 range will be a half open interval. Unlike python iteration bounds, 

309 end bounds are *INCLUDED*. Note that range based selection is not 

310 well defined for pipelines that are not linear in nature, and 

311 correct behavior is not guaranteed, or may vary from run to run. 

312 The same specifiers can be used with a `ResourcePath` object, by 

313 being the sole contents in the fragments attribute. 

314 

315 Returns 

316 ------- 

317 pipeline: `Pipeline` 

318 The pipeline loaded from specified location with appropriate (if 

319 any) subsetting 

320 

321 Notes 

322 ----- 

323 This method attempts to prune any contracts that contain labels which 

324 are not in the declared subset of labels. This pruning is done using a 

325 string based matching due to the nature of contracts and may prune more 

326 than it should. 
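
Examples
--------
Illustrative only; the URI and task labels are hypothetical::

    pipeline = Pipeline.from_uri("s3://bucket/DRP.yaml#isr..calibrate")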

327 """ 

328 # Split up the uri and any labels that were supplied 

329 uri, label_specifier = cls._parse_file_specifier(uri) 

330 pipeline: Pipeline = cls.fromIR(pipelineIR.PipelineIR.from_uri(uri)) 

331 

332 # If there are labels supplied, only keep those 

333 if label_specifier is not None: 

334 pipeline = pipeline.subsetFromLabels(label_specifier) 

335 return pipeline 

336 

337 def subsetFromLabels(self, labelSpecifier: LabelSpecifier) -> Pipeline: 

338 """Subset a pipeline to contain only labels specified in labelSpecifier 

339 

340 Parameters 

341 ---------- 

342 labelSpecifier : `LabelSpecifier`

343 Object containing labels that describe how to subset a pipeline.

344 

345 Returns 

346 ------- 

347 pipeline : `Pipeline` 

348 A new pipeline object that is a subset of the old pipeline 

349 

350 Raises 

351 ------ 

352 ValueError 

353 Raised if there is an issue with specified labels 

354 

355 Notes 

356 ----- 

357 This method attempts to prune any contracts that contain labels which 

358 are not in the declared subset of labels. This pruning is done using a 

359 string based matching due to the nature of contracts and may prune more 

360 than it should. 
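
Examples
--------
Illustrative sketch using hypothetical task labels::

    subset = pipeline.subsetFromLabels(LabelSpecifier(labels={"isr"}))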

361 """ 

362 # Labels supplied as a set 

363 if labelSpecifier.labels: 

364 labelSet = labelSpecifier.labels 

365 # Labels supplied as a range, first create a list of all the labels 

366 # in the pipeline sorted according to task dependency. Then only 

367 # keep labels that lie between the supplied bounds 

368 else: 

369 # Create a copy of the pipeline to use when assessing the label 

370 # ordering. Use a dict for fast searching while preserving order. 

371 # Remove contracts so they do not fail in the expansion step. This 

372 # is needed because a user may only configure the tasks they intend 

373 # to run, which may cause some contracts to fail if they will later 

374 # be dropped 

375 pipeline = copy.deepcopy(self) 

376 pipeline._pipelineIR.contracts = [] 

377 labels = {taskdef.label: True for taskdef in pipeline.toExpandedPipeline()} 

378 

379 # Verify the bounds are in the labels 

380 if labelSpecifier.begin is not None: 

381 if labelSpecifier.begin not in labels: 

382 raise ValueError( 

383 f"Beginning of range subset, {labelSpecifier.begin}, not found in pipeline definition" 

384 ) 

385 if labelSpecifier.end is not None: 

386 if labelSpecifier.end not in labels: 

387 raise ValueError( 

388 f"End of range subset, {labelSpecifier.end}, not found in pipeline definition" 

389 ) 

390 

391 labelSet = set() 

392 for label in labels: 

393 if labelSpecifier.begin is not None: 

394 if label != labelSpecifier.begin: 

395 continue 

396 else: 

397 labelSpecifier.begin = None 

398 labelSet.add(label) 

399 if labelSpecifier.end is not None and label == labelSpecifier.end: 

400 break 

401 return Pipeline.fromIR(self._pipelineIR.subset_from_labels(labelSet)) 

402 

403 @staticmethod 

404 def _parse_file_specifier(uri: ResourcePathExpression) -> Tuple[ResourcePath, Optional[LabelSpecifier]]: 

405 """Split appart a uri and any possible label subsets""" 

406 if isinstance(uri, str): 

407 # This is to support legacy pipelines during transition 

408 uri, num_replace = re.subn("[:](?!\\/\\/)", "#", uri) 

409 if num_replace: 

410 warnings.warn( 

411 f"The pipeline file {uri} seems to use the legacy : to separate " 

412 "labels, this is deprecated and will be removed after June 2021, please use " 

413 "# instead.", 

414 category=FutureWarning, 

415 ) 

416 if uri.count("#") > 1: 

417 raise ValueError("Only one set of labels is allowed when specifying a pipeline to load") 

418 # Everything else can be converted directly to ResourcePath. 

419 uri = ResourcePath(uri) 

420 label_subset = uri.fragment or None 

421 

422 specifier: Optional[LabelSpecifier] 

423 if label_subset is not None: 

424 label_subset = urllib.parse.unquote(label_subset) 

425 args: Dict[str, Union[Set[str], str, None]] 

426 # labels supplied as a list 

427 if "," in label_subset: 

428 if ".." in label_subset: 

429 raise ValueError( 

430 "Can only specify a list of labels or a rangewhen loading a Pipline not both" 

431 ) 

432 args = {"labels": set(label_subset.split(","))} 

433 # labels supplied as a range 

434 elif ".." in label_subset: 

435 # Try to de-structure the labelSubset, this will fail if more 

436 # than one range is specified 

437 begin, end, *rest = label_subset.split("..") 

438 if rest: 

439 raise ValueError("Only one range can be specified when loading a pipeline") 

440 args = {"begin": begin if begin else None, "end": end if end else None} 

441 # Assume anything else is a single label 

442 else: 

443 args = {"labels": {label_subset}} 

444 

445 # MyPy doesn't like how cavalier kwarg construction is with types. 

446 specifier = LabelSpecifier(**args) # type: ignore 

447 else: 

448 specifier = None 

449 

450 return uri, specifier 

451 

452 @classmethod 

453 def fromString(cls, pipeline_string: str) -> Pipeline: 

454 """Create a pipeline from string formatted as a pipeline document. 

455 

456 Parameters 

457 ---------- 

458 pipeline_string : `str` 

459 A string that is formatted like a pipeline document.

460 

461 Returns 

462 ------- 

463 pipeline: `Pipeline` 
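
Examples
--------
A minimal sketch; the ``description``/``tasks`` layout follows the
structure used by `Pipeline.__init__`, while the per-task ``class``
key and the task path are assumptions for illustration::

    yaml_doc = (
        "description: A trivial pipeline\n"
        "tasks:\n"
        "  myTask:\n"
        "    class: mypackage.mymodule.MyTask\n"
    )
    pipeline = Pipeline.fromString(yaml_doc)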

464 """ 

465 pipeline = cls.fromIR(pipelineIR.PipelineIR.from_string(pipeline_string)) 

466 return pipeline 

467 

468 @classmethod 

469 def fromIR(cls, deserialized_pipeline: pipelineIR.PipelineIR) -> Pipeline: 

470 """Create a pipeline from an already created `PipelineIR` object. 

471 

472 Parameters 

473 ---------- 

474 deserialized_pipeline: `PipelineIR` 

475 An already created pipeline intermediate representation object 

476 

477 Returns 

478 ------- 

479 pipeline: `Pipeline` 

480 """ 

481 pipeline = cls.__new__(cls) 

482 pipeline._pipelineIR = deserialized_pipeline 

483 return pipeline 

484 

485 @classmethod 

486 def fromPipeline(cls, pipeline: Pipeline) -> Pipeline: 

487 """Create a new pipeline by copying an already existing `Pipeline`. 

488 

489 Parameters 

490 ---------- 

491 pipeline: `Pipeline` 

492 The `Pipeline` object to copy.

493 

494 Returns 

495 ------- 

496 pipeline: `Pipeline` 

497 """ 

498 return cls.fromIR(copy.deepcopy(pipeline._pipelineIR)) 

499 

500 def __str__(self) -> str: 

501 return str(self._pipelineIR) 

502 

503 def mergePipeline(self, pipeline: Pipeline) -> None: 

504 """Merge another in-memory `Pipeline` object into this one. 

505 

506 This merges another pipeline into this object, as if it were declared 

507 in the import block of the yaml definition of this pipeline. This 

508 modifies this pipeline in place. 

509 

510 Parameters 

511 ---------- 

512 pipeline : `Pipeline` 

513 The `Pipeline` object that is to be merged into this object. 

514 """ 

515 self._pipelineIR.merge_pipelines((pipeline._pipelineIR,)) 

516 

517 def addLabelToSubset(self, subset: str, label: str) -> None: 

518 """Add a task label from the specified subset. 

519 

520 Parameters 

521 ---------- 

522 subset : `str` 

523 The labeled subset to modify 

524 label : `str` 

525 The task label to add to the specified subset. 

526 

527 Raises 

528 ------ 

529 ValueError 

530 Raised if the specified subset does not exist within the pipeline. 

531 Raised if the specified label does not exist within the pipeline. 

532 """ 

533 if label not in self._pipelineIR.tasks: 

534 raise ValueError(f"Label {label} does not appear within the pipeline") 

535 if subset not in self._pipelineIR.labeled_subsets: 

536 raise ValueError(f"Subset {subset} does not appear within the pipeline") 

537 self._pipelineIR.labeled_subsets[subset].subset.add(label) 

538 

539 def removeLabelFromSubset(self, subset: str, label: str) -> None: 

540 """Remove a task label from the specified subset. 

541 

542 Parameters 

543 ---------- 

544 subset : `str` 

545 The labeled subset to modify 

546 label : `str` 

547 The task label to remove from the specified subset. 

548 

549 Raises 

550 ------ 

551 ValueError 

552 Raised if the specified subset does not exist in the pipeline. 

553 Raised if the specified label does not exist within the specified 

554 subset. 

555 """ 

556 if subset not in self._pipelineIR.labeled_subsets: 

557 raise ValueError(f"Subset {subset} does not appear within the pipeline") 

558 if label not in self._pipelineIR.labeled_subsets[subset].subset: 

559 raise ValueError(f"Label {label} does not appear within the subset {subset}")

560 self._pipelineIR.labeled_subsets[subset].subset.remove(label) 

561 

562 def findSubsetsWithLabel(self, label: str) -> set[str]: 

563 """Find any subsets which may contain the specified label. 

564 

565 This function returns the names of subsets which contain the specified

566 label. May return an empty set if there are no subsets, or no subsets 

567 containing the specified label. 

568 

569 Parameters 

570 ---------- 

571 label : `str` 

572 The task label to use in membership check 

573 

574 Returns 

575 ------- 

576 subsets : `set` of `str` 

577 Returns a set (possibly empty) of subset names which contain the

578 specified label. 

579 

580 Raises 

581 ------ 

582 ValueError 

583 Raised if the specified label does not exist within this pipeline. 

584 """ 

585 results = set() 

586 if label not in self._pipelineIR.tasks: 

587 raise ValueError(f"Label {label} does not appear within the pipeline") 

588 for subset in self._pipelineIR.labeled_subsets.values(): 

589 if label in subset.subset: 

590 results.add(subset.label) 

591 return results 

592 

593 def addInstrument(self, instrument: Union[Instrument, str]) -> None: 

594 """Add an instrument to the pipeline, or replace an instrument that is 

595 already defined. 

596 

597 Parameters 

598 ---------- 

599 instrument : `~lsst.obs.base.Instrument` or `str`

600 Either a derived class object of `lsst.obs.base.Instrument` or

601 a string corresponding to a fully qualified

602 `lsst.obs.base.Instrument` name.

603 """ 

604 if isinstance(instrument, str): 

605 pass 

606 else: 

607 # TODO: assume that this is a subclass of Instrument, no type 

608 # checking 

609 instrument = get_full_type_name(instrument) 

610 self._pipelineIR.instrument = instrument 

611 

612 def getInstrument(self) -> Optional[str]: 

613 """Get the instrument from the pipeline. 

614 

615 Returns 

616 ------- 

617 instrument : `str`, or None 

618 The fully qualified name of a `lsst.obs.base.Instrument` subclass, 

619 or `None` if the pipeline does not have an instrument.

620 """ 

621 return self._pipelineIR.instrument 

622 

623 def addTask(self, task: Union[Type[PipelineTask], str], label: str) -> None: 

624 """Add a new task to the pipeline, or replace a task that is already 

625 associated with the supplied label. 

626 

627 Parameters 

628 ---------- 

629 task: `PipelineTask` or `str` 

630 Either a derived class object of a `PipelineTask` or a string 

631 corresponding to a fully qualified `PipelineTask` name. 

632 label: `str` 

633 A label that is used to identify the `PipelineTask` being added 
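
Examples
--------
Illustrative sketch; the task class path and label are hypothetical::

    pipeline.addTask("mypackage.mymodule.MyTask", "myTask")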

634 """ 

635 if isinstance(task, str): 

636 taskName = task 

637 elif issubclass(task, PipelineTask): 

638 taskName = get_full_type_name(task) 

639 else: 

640 raise ValueError( 

641 "task must be either a child class of PipelineTask or a string containing" 

642 " a fully qualified name to one" 

643 ) 

644 if not label: 

645 # in some cases (with command line-generated pipeline) tasks can 

646 # be defined without label which is not acceptable, use task 

647 # _DefaultName in that case 

648 if isinstance(task, str): 

649 task_class = doImportType(task) 

650 label = task_class._DefaultName 

651 self._pipelineIR.tasks[label] = pipelineIR.TaskIR(label, taskName) 

652 

653 def removeTask(self, label: str) -> None: 

654 """Remove a task from the pipeline. 

655 

656 Parameters 

657 ---------- 

658 label : `str` 

659 The label used to identify the task that is to be removed 

660 

661 Raises 

662 ------ 

663 KeyError 

664 If no task with that label exists in the pipeline 

665 

666 """ 

667 self._pipelineIR.tasks.pop(label) 

668 

669 def addConfigOverride(self, label: str, key: str, value: object) -> None: 

670 """Apply single config override. 

671 

672 Parameters 

673 ---------- 

674 label : `str` 

675 Label of the task. 

676 key: `str` 

677 Fully-qualified field name. 

678 value : object 

679 Value to be given to a field. 
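
Examples
--------
Illustrative sketch; the label and config field are hypothetical::

    pipeline.addConfigOverride("myTask", "doWrite", False)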

680 """ 

681 self._addConfigImpl(label, pipelineIR.ConfigIR(rest={key: value})) 

682 

683 def addConfigFile(self, label: str, filename: str) -> None: 

684 """Add overrides from a specified file. 

685 

686 Parameters 

687 ---------- 

688 label : `str` 

689 The label used to identify the task associated with config to 

690 modify 

691 filename : `str` 

692 Path to the override file. 

693 """ 

694 self._addConfigImpl(label, pipelineIR.ConfigIR(file=[filename])) 

695 

696 def addConfigPython(self, label: str, pythonString: str) -> None: 

697 """Add Overrides by running a snippet of python code against a config. 

698 

699 Parameters 

700 ---------- 

701 label : `str` 

702 The label used to identify the task associated with config to

703 modify. 

704 pythonString: `str` 

705 A string which is valid python code to be executed. This is done 

706 with config as the only local accessible value. 
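
Examples
--------
Illustrative sketch; the label and config field are hypothetical::

    pipeline.addConfigPython("myTask", "config.doWrite = False")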

707 """ 

708 self._addConfigImpl(label, pipelineIR.ConfigIR(python=pythonString)) 

709 

710 def _addConfigImpl(self, label: str, newConfig: pipelineIR.ConfigIR) -> None: 

711 if label == "parameters": 

712 if newConfig.rest.keys() - self._pipelineIR.parameters.mapping.keys(): 

713 raise ValueError("Cannot override parameters that are not defined in pipeline") 

714 self._pipelineIR.parameters.mapping.update(newConfig.rest) 

715 if newConfig.file: 

716 raise ValueError("Setting parameters section with config file is not supported") 

717 if newConfig.python: 

718 raise ValueError("Setting parameters section using python block in unsupported") 

719 return 

720 if label not in self._pipelineIR.tasks: 

721 raise LookupError(f"There are no tasks labeled '{label}' in the pipeline") 

722 self._pipelineIR.tasks[label].add_or_update_config(newConfig) 

723 

724 def write_to_uri(self, uri: ResourcePathExpression) -> None: 

725 """Write the pipeline to a file or directory. 

726 

727 Parameters 

728 ---------- 

729 uri : convertible to `ResourcePath` 

730 URI to write to; may have any scheme with `ResourcePath` write 

731 support or no scheme for a local file/directory. Should have a 

732 ``.yaml`` extension.

733 """ 

734 self._pipelineIR.write_to_uri(uri) 

735 

736 def toExpandedPipeline(self) -> Generator[TaskDef, None, None]: 

737 """Returns a generator of TaskDefs which can be used to create quantum 

738 graphs. 

739 

740 Returns 

741 ------- 

742 generator : generator of `TaskDef` 

743 The generator returned will be the sorted iterator of tasks which 

744 are to be used in constructing a quantum graph. 

745 

746 Raises 

747 ------ 

748 NotImplementedError 

749 If a dataId is supplied in a config block. This is in place for 

750 future use.
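
Examples
--------
Illustrative sketch::

    for taskDef in pipeline.toExpandedPipeline():
        print(taskDef.label, taskDef.taskName)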

751 """ 

752 taskDefs = [] 

753 for label in self._pipelineIR.tasks: 

754 taskDefs.append(self._buildTaskDef(label)) 

755 

756 # lets evaluate the contracts 

757 if self._pipelineIR.contracts is not None: 

758 label_to_config = {x.label: x.config for x in taskDefs} 

759 for contract in self._pipelineIR.contracts: 

760 # execute this on its own line so it can raise a good error

761 # message if there were problems with the eval

762 success = eval(contract.contract, None, label_to_config) 

763 if not success: 

764 extra_info = f": {contract.msg}" if contract.msg is not None else "" 

765 raise pipelineIR.ContractError( 

766 f"Contract(s) '{contract.contract}' were not satisfied{extra_info}" 

767 ) 

768 

769 taskDefs = sorted(taskDefs, key=lambda x: x.label) 

770 yield from pipeTools.orderPipeline(taskDefs) 

771 

772 def _buildTaskDef(self, label: str) -> TaskDef: 

773 if (taskIR := self._pipelineIR.tasks.get(label)) is None: 

774 raise NameError(f"Label {label} does not appear in this pipeline") 

775 taskClass: Type[PipelineTask] = doImportType(taskIR.klass) 

776 taskName = get_full_type_name(taskClass) 

777 config = taskClass.ConfigClass() 

778 overrides = ConfigOverrides() 

779 if self._pipelineIR.instrument is not None: 

780 overrides.addInstrumentOverride(self._pipelineIR.instrument, taskClass._DefaultName) 

781 if taskIR.config is not None: 

782 for configIR in (configIr.formatted(self._pipelineIR.parameters) for configIr in taskIR.config): 

783 if configIR.dataId is not None: 

784 raise NotImplementedError( 

785 "Specializing a config on a partial data id is not yet " 

786 "supported in Pipeline definition" 

787 ) 

788 # only apply override if it applies to everything 

789 if configIR.dataId is None: 

790 if configIR.file: 

791 for configFile in configIR.file: 

792 overrides.addFileOverride(os.path.expandvars(configFile)) 

793 if configIR.python is not None: 

794 overrides.addPythonOverride(configIR.python) 

795 for key, value in configIR.rest.items(): 

796 overrides.addValueOverride(key, value) 

797 overrides.applyTo(config) 

798 return TaskDef(taskName=taskName, config=config, taskClass=taskClass, label=label) 

799 

800 def __iter__(self) -> Generator[TaskDef, None, None]: 

801 return self.toExpandedPipeline() 

802 

803 def __getitem__(self, item: str) -> TaskDef: 

804 return self._buildTaskDef(item) 

805 

806 def __len__(self) -> int: 

807 return len(self._pipelineIR.tasks) 

808 

809 def __eq__(self, other: object) -> bool: 

810 if not isinstance(other, Pipeline): 

811 return False 

812 elif self._pipelineIR == other._pipelineIR: 

813 # Shortcut: if the IR is the same, the expanded pipeline must be 

814 # the same as well. But the converse is not true. 

815 return True 

816 else: 

817 self_expanded = {td.label: (td.taskClass,) for td in self} 

818 other_expanded = {td.label: (td.taskClass,) for td in other} 

819 if self_expanded != other_expanded: 

820 return False 

821 # After DM-27847, we should compare configuration here, or better, 

822 # delegate to TaskDef.__eq__ after making that compare configurations.

823 raise NotImplementedError( 

824 "Pipelines cannot be compared because config instances cannot be compared; see DM-27847." 

825 ) 

826 

827 

828@dataclass(frozen=True) 

829class TaskDatasetTypes: 

830 """An immutable struct that extracts and classifies the dataset types used 

831 by a `PipelineTask` 

832 """ 

833 

834 initInputs: NamedValueSet[DatasetType] 

835 """Dataset types that are needed as inputs in order to construct this Task. 

836 

837 Task-level `initInputs` may be classified as either 

838 `~PipelineDatasetTypes.initInputs` or 

839 `~PipelineDatasetTypes.initIntermediates` at the Pipeline level. 

840 """ 

841 

842 initOutputs: NamedValueSet[DatasetType] 

843 """Dataset types that may be written after constructing this Task. 

844 

845 Task-level `initOutputs` may be classified as either 

846 `~PipelineDatasetTypes.initOutputs` or 

847 `~PipelineDatasetTypes.initIntermediates` at the Pipeline level. 

848 """ 

849 

850 inputs: NamedValueSet[DatasetType] 

851 """Dataset types that are regular inputs to this Task. 

852 

853 If an input dataset needed for a Quantum cannot be found in the input 

854 collection(s) or produced by another Task in the Pipeline, that Quantum 

855 (and all dependent Quanta) will not be produced. 

856 

857 Task-level `inputs` may be classified as either 

858 `~PipelineDatasetTypes.inputs` or `~PipelineDatasetTypes.intermediates` 

859 at the Pipeline level. 

860 """ 

861 

862 prerequisites: NamedValueSet[DatasetType] 

863 """Dataset types that are prerequisite inputs to this Task. 

864 

865 Prerequisite inputs must exist in the input collection(s) before the 

866 pipeline is run, but do not constrain the graph - if a prerequisite is 

867 missing for a Quantum, `PrerequisiteMissingError` is raised. 

868 

869 Prerequisite inputs are not resolved until the second stage of 

870 QuantumGraph generation. 

871 """ 

872 

873 outputs: NamedValueSet[DatasetType] 

874 """Dataset types that are produced by this Task. 

875 

876 Task-level `outputs` may be classified as either 

877 `~PipelineDatasetTypes.outputs` or `~PipelineDatasetTypes.intermediates` 

878 at the Pipeline level. 

879 """ 

880 

881 @classmethod 

882 def fromTaskDef( 

883 cls, 

884 taskDef: TaskDef, 

885 *, 

886 registry: Registry, 

887 include_configs: bool = True, 

888 storage_class_mapping: Optional[Mapping[str, str]] = None, 

889 ) -> TaskDatasetTypes: 

890 """Extract and classify the dataset types from a single `PipelineTask`. 

891 

892 Parameters 

893 ---------- 

894 taskDef: `TaskDef` 

895 An instance of a `TaskDef` class for a particular `PipelineTask`. 

896 registry: `Registry` 

897 Registry used to construct normalized `DatasetType` objects and 

898 retrieve those that are incomplete. 

899 include_configs : `bool`, optional 

900 If `True` (default) include config dataset types as 

901 ``initOutputs``. 

902 storage_class_mapping : `Mapping` of `str` to `StorageClass`, optional 

903 If a taskdef contains a component dataset type that is unknown 

904 to the registry, its parent StorageClass will be looked up in this 

905 mapping if it is supplied. If the mapping does not contain the 

906 composite dataset type, or the mapping is not supplied, an exception

907 will be raised. 

908 

909 Returns 

910 ------- 

911 types: `TaskDatasetTypes` 

912 The dataset types used by this task. 

913 

914 Raises 

915 ------ 

916 ValueError 

917 Raised if dataset type connection definition differs from 

918 registry definition. 

919 LookupError 

920 Raised if component parent StorageClass could not be determined 

921 and storage_class_mapping does not contain the composite type, or 

922 is set to None. 
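
Examples
--------
Illustrative sketch; ``registry`` is assumed to be an existing
`~lsst.daf.butler.Registry` and ``taskDef`` an existing `TaskDef`::

    types = TaskDatasetTypes.fromTaskDef(taskDef, registry=registry)
    print(types.inputs.names)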

923 """ 

924 

925 def makeDatasetTypesSet( 

926 connectionType: str, 

927 is_input: bool, 

928 freeze: bool = True, 

929 ) -> NamedValueSet[DatasetType]: 

930 """Constructs a set of true `DatasetType` objects 

931 

932 Parameters 

933 ---------- 

934 connectionType : `str` 

935 Name of the connection type to produce a set for, corresponds 

936 to an attribute of type `list` on the connection class instance 

937 is_input : `bool` 

938 If `True`, these are input dataset types, else they are output dataset

939 types. 

940 freeze : `bool`, optional 

941 If `True`, call `NamedValueSet.freeze` on the object returned. 

942 

943 Returns 

944 ------- 

945 datasetTypes : `NamedValueSet` 

946 A set of all datasetTypes which correspond to the input 

947 connection type specified in the connection class of this 

948 `PipelineTask` 

949 

950 Raises 

951 ------ 

952 ValueError 

953 Raised if dataset type connection definition differs from 

954 registry definition. 

955 LookupError 

956 Raised if component parent StorageClass could not be determined 

957 and storage_class_mapping does not contain the composite type, 

958 or is set to None. 

959 

960 Notes 

961 ----- 

962 This function is a closure over the variables ``registry``,

963 ``taskDef``, and ``storage_class_mapping``. 

964 """ 

965 datasetTypes = NamedValueSet[DatasetType]() 

966 for c in iterConnections(taskDef.connections, connectionType): 

967 dimensions = set(getattr(c, "dimensions", set())) 

968 if "skypix" in dimensions: 

969 try: 

970 datasetType = registry.getDatasetType(c.name) 

971 except LookupError as err: 

972 raise LookupError( 

973 f"DatasetType '{c.name}' referenced by " 

974 f"{type(taskDef.connections).__name__} uses 'skypix' as a dimension " 

975 "placeholder, but does not already exist in the registry. " 

976 "Note that reference catalog names are now used as the dataset " 

977 "type name instead of 'ref_cat'." 

978 ) from err 

979 rest1 = set(registry.dimensions.extract(dimensions - set(["skypix"])).names) 

980 rest2 = set( 

981 dim.name for dim in datasetType.dimensions if not isinstance(dim, SkyPixDimension) 

982 ) 

983 if rest1 != rest2: 

984 raise ValueError( 

985 f"Non-skypix dimensions for dataset type {c.name} declared in " 

986 f"connections ({rest1}) are inconsistent with those in " 

987 f"registry's version of this dataset ({rest2})." 

988 ) 

989 else: 

990 # Component dataset types are not explicitly in the 

991 # registry. This complicates consistency checks with 

992 # registry and requires we work out the composite storage 

993 # class. 

994 registryDatasetType = None 

995 try: 

996 registryDatasetType = registry.getDatasetType(c.name) 

997 except KeyError: 

998 compositeName, componentName = DatasetType.splitDatasetTypeName(c.name) 

999 if componentName: 

1000 if storage_class_mapping is None or compositeName not in storage_class_mapping: 

1001 raise LookupError( 

1002 "Component parent class cannot be determined, and " 

1003 "composite name was not in storage class mapping, or no " 

1004 "storage_class_mapping was supplied" 

1005 ) 

1006 else: 

1007 parentStorageClass = storage_class_mapping[compositeName] 

1008 else: 

1009 parentStorageClass = None 

1010 datasetType = c.makeDatasetType( 

1011 registry.dimensions, parentStorageClass=parentStorageClass 

1012 ) 

1013 registryDatasetType = datasetType 

1014 else: 

1015 datasetType = c.makeDatasetType( 

1016 registry.dimensions, parentStorageClass=registryDatasetType.parentStorageClass 

1017 ) 

1018 

1019 if registryDatasetType and datasetType != registryDatasetType: 

1020 # The dataset types differ but first check to see if 

1021 # they are compatible before raising. 

1022 if is_input: 

1023 # This DatasetType must be compatible on get. 

1024 is_compatible = datasetType.is_compatible_with(registryDatasetType) 

1025 else: 

1026 # Has to be able to be converted to the expected type

1027 # on put. 

1028 is_compatible = registryDatasetType.is_compatible_with(datasetType) 

1029 if is_compatible: 

1030 # For inputs we want the pipeline to use the 

1031 # pipeline definition, for outputs it should use 

1032 # the registry definition. 

1033 if not is_input: 

1034 datasetType = registryDatasetType 

1035 _LOG.debug( 

1036 "Dataset types differ (task %s != registry %s) but are compatible" 

1037 " for %s in %s.", 

1038 datasetType, 

1039 registryDatasetType, 

1040 "input" if is_input else "output", 

1041 taskDef.label, 

1042 ) 

1043 else: 

1044 try: 

1045 # Explicitly check for storage class just to 

1046 # make more specific message. 

1047 _ = datasetType.storageClass 

1048 except KeyError: 

1049 raise ValueError( 

1050 "Storage class does not exist for supplied dataset type " 

1051 f"{datasetType} for {taskDef.label}." 

1052 ) from None 

1053 raise ValueError( 

1054 f"Supplied dataset type ({datasetType}) inconsistent with " 

1055 f"registry definition ({registryDatasetType}) " 

1056 f"for {taskDef.label}." 

1057 ) 

1058 datasetTypes.add(datasetType) 

1059 if freeze: 

1060 datasetTypes.freeze() 

1061 return datasetTypes 

1062 

1063 # optionally add initOutput dataset for config 

1064 initOutputs = makeDatasetTypesSet("initOutputs", is_input=False, freeze=False) 

1065 if include_configs: 

1066 initOutputs.add( 

1067 DatasetType( 

1068 taskDef.configDatasetName, 

1069 registry.dimensions.empty, 

1070 storageClass="Config", 

1071 ) 

1072 ) 

1073 initOutputs.freeze() 

1074 

1075 # optionally add output dataset for metadata 

1076 outputs = makeDatasetTypesSet("outputs", is_input=False, freeze=False) 

1077 if taskDef.metadataDatasetName is not None: 

1078 # Metadata is supposed to be of the TaskMetadata type, its 

1079 # dimensions correspond to a task quantum. 

1080 dimensions = registry.dimensions.extract(taskDef.connections.dimensions) 

1081 

1082 # Allow the storage class definition to be read from the existing 

1083 # dataset type definition if present. 

1084 try: 

1085 current = registry.getDatasetType(taskDef.metadataDatasetName) 

1086 except KeyError: 

1087 # No previous definition so use the default. 

1088 storageClass = "TaskMetadata" if _TASK_METADATA_TYPE is TaskMetadata else "PropertySet" 

1089 else: 

1090 storageClass = current.storageClass.name 

1091 

1092 outputs.update({DatasetType(taskDef.metadataDatasetName, dimensions, storageClass)}) 

1093 if taskDef.logOutputDatasetName is not None: 

1094 # Log output dimensions correspond to a task quantum. 

1095 dimensions = registry.dimensions.extract(taskDef.connections.dimensions) 

1096 outputs.update({DatasetType(taskDef.logOutputDatasetName, dimensions, "ButlerLogRecords")}) 

1097 

1098 outputs.freeze() 

1099 

1100 return cls( 

1101 initInputs=makeDatasetTypesSet("initInputs", is_input=True), 

1102 initOutputs=initOutputs, 

1103 inputs=makeDatasetTypesSet("inputs", is_input=True), 

1104 prerequisites=makeDatasetTypesSet("prerequisiteInputs", is_input=True), 

1105 outputs=outputs, 

1106 ) 

1107 

1108 

1109@dataclass(frozen=True) 

1110class PipelineDatasetTypes: 

1111 """An immutable struct that classifies the dataset types used in a 

1112 `Pipeline`. 

1113 """ 

1114 

1115 packagesDatasetName: ClassVar[str] = "packages" 

1116 """Name of a dataset type used to save package versions. 

1117 """ 

1118 

1119 initInputs: NamedValueSet[DatasetType] 

1120 """Dataset types that are needed as inputs in order to construct the Tasks 

1121 in this Pipeline. 

1122 

1123 This does not include dataset types that are produced when constructing 

1124 other Tasks in the Pipeline (these are classified as `initIntermediates`). 

1125 """ 

1126 

1127 initOutputs: NamedValueSet[DatasetType] 

1128 """Dataset types that may be written after constructing the Tasks in this 

1129 Pipeline. 

1130 

1131 This does not include dataset types that are also used as inputs when 

1132 constructing other Tasks in the Pipeline (these are classified as 

1133 `initIntermediates`). 

1134 """ 

1135 

1136 initIntermediates: NamedValueSet[DatasetType] 

1137 """Dataset types that are both used when constructing one or more Tasks 

1138 in the Pipeline and produced as a side-effect of constructing another 

1139 Task in the Pipeline. 

1140 """ 

1141 

1142 inputs: NamedValueSet[DatasetType] 

1143 """Dataset types that are regular inputs for the full pipeline. 

1144 

1145 If an input dataset needed for a Quantum cannot be found in the input 

1146 collection(s), that Quantum (and all dependent Quanta) will not be 

1147 produced. 

1148 """ 

1149 

1150 prerequisites: NamedValueSet[DatasetType] 

1151 """Dataset types that are prerequisite inputs for the full Pipeline. 

1152 

1153 Prerequisite inputs must exist in the input collection(s) before the 

1154 pipeline is run, but do not constrain the graph - if a prerequisite is 

1155 missing for a Quantum, `PrerequisiteMissingError` is raised. 

1156 

1157 Prerequisite inputs are not resolved until the second stage of 

1158 QuantumGraph generation. 

1159 """ 

1160 

1161 intermediates: NamedValueSet[DatasetType] 

1162 """Dataset types that are output by one Task in the Pipeline and consumed 

1163 as inputs by one or more other Tasks in the Pipeline. 

1164 """ 

1165 

1166 outputs: NamedValueSet[DatasetType] 

1167 """Dataset types that are output by a Task in the Pipeline and not consumed 

1168 by any other Task in the Pipeline. 

1169 """ 

1170 

1171 byTask: Mapping[str, TaskDatasetTypes] 

1172 """Per-Task dataset types, keyed by label in the `Pipeline`. 

1173 

1174 This is guaranteed to be zip-iterable with the `Pipeline` itself (assuming 

1175 neither has been modified since the dataset types were extracted, of 

1176 course). 

1177 """ 

1178 

1179 @classmethod 

1180 def fromPipeline( 

1181 cls, 

1182 pipeline: Union[Pipeline, Iterable[TaskDef]], 

1183 *, 

1184 registry: Registry, 

1185 include_configs: bool = True, 

1186 include_packages: bool = True, 

1187 ) -> PipelineDatasetTypes: 

1188 """Extract and classify the dataset types from all tasks in a 

1189 `Pipeline`. 

1190 

1191 Parameters 

1192 ---------- 

1193 pipeline: `Pipeline` or `Iterable` [ `TaskDef` ] 

1194 A collection of tasks that can be run together. 

1195 registry: `Registry` 

1196 Registry used to construct normalized `DatasetType` objects and 

1197 retrieve those that are incomplete. 

1198 include_configs : `bool`, optional 

1199 If `True` (default) include config dataset types as 

1200 ``initOutputs``. 

1201 include_packages : `bool`, optional 

1202 If `True` (default) include the dataset type for software package 

1203 versions in ``initOutputs``. 

1204 

1205 Returns 

1206 ------- 

1207 types: `PipelineDatasetTypes` 

1208 The dataset types used by this `Pipeline`. 

1209 

1210 Raises 

1211 ------ 

1212 ValueError 

1213 Raised if Tasks are inconsistent about which datasets are marked 

1214 prerequisite. This indicates that the Tasks cannot be run as part 

1215 of the same `Pipeline`. 
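
Examples
--------
Illustrative sketch; ``butler`` is assumed to be an existing
`~lsst.daf.butler.Butler`::

    dataset_types = PipelineDatasetTypes.fromPipeline(
        pipeline, registry=butler.registry
    )
    print(dataset_types.outputs.names)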

1216 """ 

1217 allInputs = NamedValueSet[DatasetType]() 

1218 allOutputs = NamedValueSet[DatasetType]() 

1219 allInitInputs = NamedValueSet[DatasetType]() 

1220 allInitOutputs = NamedValueSet[DatasetType]() 

1221 prerequisites = NamedValueSet[DatasetType]() 

1222 byTask = dict() 

1223 if include_packages: 

1224 allInitOutputs.add( 

1225 DatasetType( 

1226 cls.packagesDatasetName, 

1227 registry.dimensions.empty, 

1228 storageClass="Packages", 

1229 ) 

1230 ) 

1231 # create a list of TaskDefs in case the input is a generator 

1232 pipeline = list(pipeline) 

1233 

1234 # collect all the output dataset types 

1235 typeStorageclassMap: Dict[str, str] = {} 

1236 for taskDef in pipeline: 

1237 for outConnection in iterConnections(taskDef.connections, "outputs"): 

1238 typeStorageclassMap[outConnection.name] = outConnection.storageClass 

1239 

1240 for taskDef in pipeline: 

1241 thisTask = TaskDatasetTypes.fromTaskDef( 

1242 taskDef, 

1243 registry=registry, 

1244 include_configs=include_configs, 

1245 storage_class_mapping=typeStorageclassMap, 

1246 ) 

1247 allInitInputs.update(thisTask.initInputs) 

1248 allInitOutputs.update(thisTask.initOutputs) 

1249 allInputs.update(thisTask.inputs) 

1250 prerequisites.update(thisTask.prerequisites) 

1251 allOutputs.update(thisTask.outputs) 

1252 byTask[taskDef.label] = thisTask 

1253 if not prerequisites.isdisjoint(allInputs): 

1254 raise ValueError( 

1255 "{} marked as both prerequisites and regular inputs".format( 

1256 {dt.name for dt in allInputs & prerequisites} 

1257 ) 

1258 ) 

1259 if not prerequisites.isdisjoint(allOutputs): 

1260 raise ValueError( 

1261 "{} marked as both prerequisites and outputs".format( 

1262 {dt.name for dt in allOutputs & prerequisites} 

1263 ) 

1264 ) 

1265 # Make sure that components which are marked as inputs get treated as 

1266 # intermediates if there is an output which produces the composite 

1267 # containing the component 

1268 intermediateComponents = NamedValueSet[DatasetType]() 

1269 intermediateComposites = NamedValueSet[DatasetType]() 

1270 outputNameMapping = {dsType.name: dsType for dsType in allOutputs} 

1271 for dsType in allInputs: 

1272 # get the name of a possible component 

1273 name, component = dsType.nameAndComponent() 

1274 # if there is a component name, that means this is a component 

1275 # DatasetType, if there is an output which produces the parent of 

1276 # this component, treat this input as an intermediate 

1277 if component is not None: 

1278 # This needs to be in this if block, because someone might have 

1279 # a composite that is a pure input from existing data 

1280 if name in outputNameMapping: 

1281 intermediateComponents.add(dsType) 

1282 intermediateComposites.add(outputNameMapping[name]) 

1283 

1284 def checkConsistency(a: NamedValueSet, b: NamedValueSet) -> None: 

1285 common = a.names & b.names 

1286 for name in common: 

1287 # Any compatibility is allowed. This function does not know 

1288 # if a dataset type is to be used for input or output. 

1289 if not (a[name].is_compatible_with(b[name]) or b[name].is_compatible_with(a[name])): 

1290 raise ValueError(f"Conflicting definitions for dataset type: {a[name]} != {b[name]}.") 

1291 

1292 checkConsistency(allInitInputs, allInitOutputs) 

1293 checkConsistency(allInputs, allOutputs) 

1294 checkConsistency(allInputs, intermediateComposites) 

1295 checkConsistency(allOutputs, intermediateComposites) 

1296 

1297 def frozen(s: AbstractSet[DatasetType]) -> NamedValueSet[DatasetType]: 

1298 assert isinstance(s, NamedValueSet) 

1299 s.freeze() 

1300 return s 

1301 

1302 return cls( 

1303 initInputs=frozen(allInitInputs - allInitOutputs), 

1304 initIntermediates=frozen(allInitInputs & allInitOutputs), 

1305 initOutputs=frozen(allInitOutputs - allInitInputs), 

1306 inputs=frozen(allInputs - allOutputs - intermediateComponents), 

1307 # If there are storage class differences in inputs and outputs 

1308 # the intermediates have to choose a priority. Here we choose that

1309 # inputs to tasks must match the requested storage class by

1310 # applying the inputs over the top of the outputs. 

1311 intermediates=frozen(allOutputs & allInputs | intermediateComponents), 

1312 outputs=frozen(allOutputs - allInputs - intermediateComposites), 

1313 prerequisites=frozen(prerequisites), 

1314 byTask=MappingProxyType(byTask), # MappingProxyType -> frozen view of dict for immutability 

1315 ) 

1316 

1317 @classmethod 

1318 def initOutputNames( 

1319 cls, 

1320 pipeline: Union[Pipeline, Iterable[TaskDef]], 

1321 *, 

1322 include_configs: bool = True, 

1323 include_packages: bool = True, 

1324 ) -> Iterator[str]: 

1325 """Return the names of dataset types ot task initOutputs, Configs, 

1326 and package versions for a pipeline. 

1327 

1328 Parameters 

1329 ---------- 

1330 pipeline: `Pipeline` or `Iterable` [ `TaskDef` ] 

1331 A `Pipeline` instance or collection of `TaskDef` instances. 

1332 include_configs : `bool`, optional 

1333 If `True` (default) include config dataset types. 

1334 include_packages : `bool`, optional 

1335 If `True` (default) include the dataset type for package versions. 

1336 

1337 Yields 

1338 ------ 

1339 datasetTypeName : `str` 

1340 Name of the dataset type. 
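
Examples
--------
Illustrative sketch::

    for name in PipelineDatasetTypes.initOutputNames(pipeline):
        print(name)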

1341 """ 

1342 if include_packages: 

1343 # Package versions dataset type 

1344 yield cls.packagesDatasetName 

1345 

1346 if isinstance(pipeline, Pipeline): 

1347 pipeline = pipeline.toExpandedPipeline() 

1348 

1349 for taskDef in pipeline: 

1350 # all task InitOutputs 

1351 for name in taskDef.connections.initOutputs: 

1352 attribute = getattr(taskDef.connections, name) 

1353 yield attribute.name 

1354 

1355 # config dataset name 

1356 if include_configs: 

1357 yield taskDef.configDatasetName