Coverage for python/lsst/pipe/base/pipeline.py: 21%

437 statements  

coverage.py v7.2.5, created at 2023-05-11 03:12 -0700

1# This file is part of pipe_base. 

2# 

3# Developed for the LSST Data Management System. 

4# This product includes software developed by the LSST Project 

5# (http://www.lsst.org). 

6# See the COPYRIGHT file at the top-level directory of this distribution 

7# for details of code ownership. 

8# 

9# This program is free software: you can redistribute it and/or modify 

10# it under the terms of the GNU General Public License as published by 

11# the Free Software Foundation, either version 3 of the License, or 

12# (at your option) any later version. 

13# 

14# This program is distributed in the hope that it will be useful, 

15# but WITHOUT ANY WARRANTY; without even the implied warranty of 

16# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the 

17# GNU General Public License for more details. 

18# 

19# You should have received a copy of the GNU General Public License 

20# along with this program. If not, see <http://www.gnu.org/licenses/>. 

21from __future__ import annotations 

22 

23"""Module defining Pipeline class and related methods. 

24""" 

25 

26__all__ = ["Pipeline", "TaskDef", "TaskDatasetTypes", "PipelineDatasetTypes", "LabelSpecifier"] 

27 

28import copy 

29import logging 

30import re 

31import urllib.parse 

32 

33# ------------------------------- 

34# Imports of standard modules -- 

35# ------------------------------- 

36from dataclasses import dataclass 

37from types import MappingProxyType 

38from typing import ( 

39 TYPE_CHECKING, 

40 AbstractSet, 

41 Callable, 

42 ClassVar, 

43 Dict, 

44 Generator, 

45 Iterable, 

46 Iterator, 

47 Mapping, 

48 Optional, 

49 Set, 

50 Tuple, 

51 Type, 

52 Union, 

53 cast, 

54) 

55 

56# ----------------------------- 

57# Imports for other modules -- 

58from lsst.daf.butler import DatasetType, NamedValueSet, Registry, SkyPixDimension 

59from lsst.resources import ResourcePath, ResourcePathExpression 

60from lsst.utils import doImportType 

61from lsst.utils.introspection import get_full_type_name 

62 

63from . import pipelineIR, pipeTools 

64from ._instrument import Instrument as PipeBaseInstrument 

65from ._task_metadata import TaskMetadata 

66from .config import PipelineTaskConfig 

67from .connections import iterConnections 

68from .connectionTypes import Input 

69from .pipelineTask import PipelineTask 

70from .task import _TASK_METADATA_TYPE 

71 

72if TYPE_CHECKING:  # Imports needed only for type annotations; may be circular.

73 from lsst.obs.base import Instrument 

74 from lsst.pex.config import Config 

75 

76# ---------------------------------- 

77# Local non-exported definitions -- 

78# ---------------------------------- 

79 

80_LOG = logging.getLogger(__name__) 

81 

82# ------------------------ 

83# Exported definitions -- 

84# ------------------------ 

85 

86 

87@dataclass 

88class LabelSpecifier: 

89 """A structure to specify a subset of labels to load 

90 

91 This structure may contain a set of labels to be used in subsetting a 

92 pipeline, or a beginning and end point. Beginning or end may be empty, 

93 in which case the range will be a half open interval. Unlike python 

94 iteration bounds, end bounds are *INCLUDED*. Note that range based 

95 selection is not well defined for pipelines that are not linear in nature, 

96 and correct behavior is not guaranteed, or may vary from run to run. 

97 """ 

98 

99 labels: Optional[Set[str]] = None 

100 begin: Optional[str] = None 

101 end: Optional[str] = None 

102 

103 def __post_init__(self) -> None: 

104 if self.labels is not None and (self.begin or self.end): 

105 raise ValueError( 

106 "This struct can only be initialized with a labels set or a begin (and/or) end specifier" 

107 ) 

108 

109 
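# Illustrative sketch (not part of the original module): the two mutually
# exclusive ways to build a LabelSpecifier; the task labels below are
# hypothetical.
#
#     >>> LabelSpecifier(labels={"isr", "characterizeImage"})   # explicit set
#     >>> LabelSpecifier(begin="isr", end="calibrate")          # inclusive range
#     >>> LabelSpecifier(labels={"isr"}, end="calibrate")       # raises ValueError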

110class TaskDef: 

111 """TaskDef is a collection of information about task needed by Pipeline. 

112 

113 The information includes task name, configuration object and optional 

114 task class. This class is just a collection of attributes and it exposes 

115 all of them so that attributes could potentially be modified in place 

116 (e.g. if configuration needs extra overrides). 

117 

118 Attributes 

119 ---------- 

120 taskName : `str`, optional 

121 The fully-qualified `PipelineTask` class name. If not provided, 

122 ``taskClass`` must be. 

123 config : `lsst.pipe.base.config.PipelineTaskConfig`, optional 

124 Instance of the configuration class corresponding to this task class, 

125 usually with all overrides applied. This config will be frozen. If 

126 not provided, ``taskClass`` must be provided and 

127 ``taskClass.ConfigClass()`` will be used. 

128 taskClass : `type`, optional 

129 `PipelineTask` class object; if provided and ``taskName`` is as well, 

130 the caller guarantees that they are consistent. If not provided, 

131 ``taskName`` is used to import the type. 

132 label : `str`, optional 

133 Task label, usually a short string unique in a pipeline. If not 

134 provided, ``taskClass`` must be, and ``taskClass._DefaultName`` will 

135 be used. 

136 """ 

137 

138 def __init__( 

139 self, 

140 taskName: Optional[str] = None, 

141 config: Optional[PipelineTaskConfig] = None, 

142 taskClass: Optional[Type[PipelineTask]] = None, 

143 label: Optional[str] = None, 

144 ): 

145 if taskName is None: 

146 if taskClass is None: 

147 raise ValueError("At least one of `taskName` and `taskClass` must be provided.") 

148 taskName = get_full_type_name(taskClass) 

149 elif taskClass is None: 

150 taskClass = doImportType(taskName) 

151 if config is None: 

152 if taskClass is None: 

153 raise ValueError("`taskClass` must be provided if `config` is not.") 

154 config = taskClass.ConfigClass() 

155 if label is None: 

156 if taskClass is None: 

157 raise ValueError("`taskClass` must be provided if `label` is not.") 

158 label = taskClass._DefaultName 

159 self.taskName = taskName 

160 try: 

161 config.validate() 

162 except Exception: 

163 _LOG.error("Configuration validation failed for task %s (%s)", label, taskName) 

164 raise 

165 config.freeze() 

166 self.config = config 

167 self.taskClass = taskClass 

168 self.label = label 

169 self.connections = config.connections.ConnectionsClass(config=config) 

170 

171 @property 

172 def configDatasetName(self) -> str: 

173 """Name of a dataset type for configuration of this task (`str`)""" 

174 return self.label + "_config" 

175 

176 @property 

177 def metadataDatasetName(self) -> Optional[str]: 

178 """Name of a dataset type for metadata of this task, `None` if 

179 metadata is not to be saved (`str`) 

180 """ 

181 if self.config.saveMetadata: 

182 return self.makeMetadataDatasetName(self.label) 

183 else: 

184 return None 

185 

186 @classmethod 

187 def makeMetadataDatasetName(cls, label: str) -> str: 

188 """Construct the name of the dataset type for metadata for a task. 

189 

190 Parameters 

191 ---------- 

192 label : `str` 

193 Label for the task within its pipeline. 

194 

195 Returns 

196 ------- 

197 name : `str` 

198 Name of the task's metadata dataset type. 

199 """ 

200 return f"{label}_metadata" 

201 

202 @property 

203 def logOutputDatasetName(self) -> Optional[str]: 

204 """Name of a dataset type for log output from this task, `None` if 

205 logs are not to be saved (`str`) 

206 """ 

207 if cast(PipelineTaskConfig, self.config).saveLogOutput: 

208 return self.label + "_log" 

209 else: 

210 return None 

211 

212 def __str__(self) -> str: 

213 rep = "TaskDef(" + self.taskName 

214 if self.label: 

215 rep += ", label=" + self.label 

216 rep += ")" 

217 return rep 

218 

219 def __eq__(self, other: object) -> bool: 

220 if not isinstance(other, TaskDef): 

221 return False 

222 # This does not consider equality of configs when determining equality 

223 # as config equality is a difficult thing to define. Should be updated 

224 # after DM-27847 

225 return self.taskClass == other.taskClass and self.label == other.label 

226 

227 def __hash__(self) -> int: 

228 return hash((self.taskClass, self.label)) 

229 

230 @classmethod 

231 def _unreduce(cls, taskName: str, config: PipelineTaskConfig, label: str) -> TaskDef: 

232 """Custom callable for unpickling. 

233 

234 All arguments are forwarded directly to the constructor; this 

235 trampoline is only needed because ``__reduce__`` callables can't be 

236 called with keyword arguments. 

237 """ 

238 return cls(taskName=taskName, config=config, label=label) 

239 

240 def __reduce__(self) -> Tuple[Callable[[str, PipelineTaskConfig, str], TaskDef], Tuple[str, Config, str]]: 

241 return (self._unreduce, (self.taskName, self.config, self.label)) 

242 

243 
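# Illustrative sketch (not part of the original module): constructing a
# TaskDef by hand. The task class path is hypothetical; TaskDefs are normally
# produced by Pipeline.toExpandedPipeline() rather than built directly.
#
#     >>> taskDef = TaskDef(taskName="lsst.example.tasks.ExampleTask", label="example")
#     >>> taskDef.configDatasetName
#     'example_config'
#     >>> taskDef.metadataDatasetName  # None when config.saveMetadata is False
#     'example_metadata'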

244class Pipeline: 

245 """A `Pipeline` is a representation of a series of tasks to run, and the 

246 configuration for those tasks. 

247 

248 Parameters 

249 ---------- 

250 description : `str` 

251 A description of what this pipeline does.

252 """ 

253 

254 def __init__(self, description: str): 

255 pipeline_dict = {"description": description, "tasks": {}} 

256 self._pipelineIR = pipelineIR.PipelineIR(pipeline_dict) 

257 

258 @classmethod 

259 def fromFile(cls, filename: str) -> Pipeline: 

260 """Load a pipeline defined in a pipeline yaml file. 

261 

262 Parameters 

263 ---------- 

264 filename: `str` 

265 A path that points to a pipeline defined in yaml format. This 

266 filename may also supply additional labels to be used in 

267 subsetting the loaded Pipeline. These labels are separated from 

268 the path by a \\#, and may be specified as a comma separated 

269 list, or a range denoted as beginning..end. Beginning or end may 

270 be empty, in which case the range will be a half open interval. 

271 Unlike python iteration bounds, end bounds are *INCLUDED*. Note 

272 that range based selection is not well defined for pipelines that 

273 are not linear in nature, and correct behavior is not guaranteed, 

274 or may vary from run to run. 

275 

276 Returns 

277 ------- 

278 pipeline: `Pipeline` 

279 The pipeline loaded from specified location with appropriate (if 

280 any) subsetting 

281 

282 Notes 

283 ----- 

284 This method attempts to prune any contracts that contain labels which 

285 are not in the declared subset of labels. This pruning is done using a 

286 string based matching due to the nature of contracts and may prune more 

287 than it should. 

288 """ 

289 return cls.from_uri(filename) 

290 

291 @classmethod 

292 def from_uri(cls, uri: ResourcePathExpression) -> Pipeline: 

293 """Load a pipeline defined in a pipeline yaml file at a location 

294 specified by a URI. 

295 

296 Parameters 

297 ---------- 

298 uri: convertible to `ResourcePath` 

299 If a string is supplied this should be a URI path that points to a 

300 pipeline defined in yaml format, either as a direct path to the 

301 yaml file, or as a directory containing a "pipeline.yaml" file (the 

302 form used by `write_to_uri` with ``expand=True``). This uri may 

303 also supply additional labels to be used in subsetting the loaded 

304 Pipeline. These labels are separated from the path by a \\#, and 

305 may be specified as a comma separated list, or a range denoted as 

306 beginning..end. Beginning or end may be empty, in which case the 

307 range will be a half open interval. Unlike python iteration bounds, 

308 end bounds are *INCLUDED*. Note that range based selection is not 

309 well defined for pipelines that are not linear in nature, and 

310 correct behavior is not guaranteed, or may vary from run to run. 

311 The same specifiers can be used with a `ResourcePath` object, by 

312 being the sole contents in the fragments attribute. 

313 

314 Returns 

315 ------- 

316 pipeline: `Pipeline` 

317 The pipeline loaded from specified location with appropriate (if 

318 any) subsetting 

319 

320 Notes 

321 ----- 

322 This method attempts to prune any contracts that contain labels which 

323 are not in the declared subset of labels. This pruning is done using a 

324 string based matching due to the nature of contracts and may prune more 

325 than it should. 

326 """ 

327 # Split up the uri and any labels that were supplied 

328 uri, label_specifier = cls._parse_file_specifier(uri) 

329 pipeline: Pipeline = cls.fromIR(pipelineIR.PipelineIR.from_uri(uri)) 

330 

331 # If there are labels supplied, only keep those 

332 if label_specifier is not None: 

333 pipeline = pipeline.subsetFromLabels(label_specifier) 

334 return pipeline 

335 
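# Illustrative sketch (not part of the original module): the label fragment
# forms accepted by from_uri/fromFile. The file name and labels are
# hypothetical.
#
#     >>> Pipeline.from_uri("example.yaml")                 # whole pipeline
#     >>> Pipeline.from_uri("example.yaml#isr,calibrate")   # comma separated list
#     >>> Pipeline.from_uri("example.yaml#isr..calibrate")  # inclusive range
#     >>> Pipeline.from_uri("example.yaml#..calibrate")     # half open range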

336 def subsetFromLabels(self, labelSpecifier: LabelSpecifier) -> Pipeline: 

337 """Subset a pipeline to contain only labels specified in labelSpecifier 

338 

339 Parameters 

340 ---------- 

341 labelSpecifier : `labelSpecifier` 

342 Object containing labels that describes how to subset a pipeline. 

343 

344 Returns 

345 ------- 

346 pipeline : `Pipeline` 

347 A new pipeline object that is a subset of the old pipeline 

348 

349 Raises 

350 ------ 

351 ValueError 

352 Raised if there is an issue with specified labels 

353 

354 Notes 

355 ----- 

356 This method attempts to prune any contracts that contain labels which 

357 are not in the declared subset of labels. This pruning is done using a 

358 string based matching due to the nature of contracts and may prune more 

359 than it should. 

360 """ 

361 # Labels supplied as a set 

362 if labelSpecifier.labels: 

363 labelSet = labelSpecifier.labels 

364 # Labels supplied as a range, first create a list of all the labels 

365 # in the pipeline sorted according to task dependency. Then only 

366 # keep labels that lie between the supplied bounds 

367 else: 

368 # Create a copy of the pipeline to use when assessing the label 

369 # ordering. Use a dict for fast searching while preserving order. 

370 # Remove contracts so they do not fail in the expansion step. This 

371 # is needed because a user may only configure the tasks they intend 

372 # to run, which may cause some contracts to fail if they will later 

373 # be dropped 

374 pipeline = copy.deepcopy(self) 

375 pipeline._pipelineIR.contracts = [] 

376 labels = {taskdef.label: True for taskdef in pipeline.toExpandedPipeline()} 

377 

378 # Verify the bounds are in the labels 

379 if labelSpecifier.begin is not None: 

380 if labelSpecifier.begin not in labels: 

381 raise ValueError( 

382 f"Beginning of range subset, {labelSpecifier.begin}, not found in pipeline definition" 

383 ) 

384 if labelSpecifier.end is not None: 

385 if labelSpecifier.end not in labels: 

386 raise ValueError( 

387 f"End of range subset, {labelSpecifier.end}, not found in pipeline definition" 

388 ) 

389 

390 labelSet = set() 

391 for label in labels: 

392 if labelSpecifier.begin is not None: 

393 if label != labelSpecifier.begin: 

394 continue 

395 else: 

396 labelSpecifier.begin = None 

397 labelSet.add(label) 

398 if labelSpecifier.end is not None and label == labelSpecifier.end: 

399 break 

400 return Pipeline.fromIR(self._pipelineIR.subset_from_labels(labelSet)) 

401 
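# Illustrative sketch (not part of the original module): subsetting an
# in-memory pipeline with a LabelSpecifier range; "full_pipeline" and the
# labels are hypothetical.
#
#     >>> spec = LabelSpecifier(begin="isr", end="calibrate")
#     >>> subset = full_pipeline.subsetFromLabels(spec)
#     >>> len(subset) <= len(full_pipeline)
#     True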

402 @staticmethod 

403 def _parse_file_specifier(uri: ResourcePathExpression) -> Tuple[ResourcePath, Optional[LabelSpecifier]]: 

404 """Split appart a uri and any possible label subsets""" 

405 if isinstance(uri, str): 

406 # This is to support legacy pipelines during transition 

407 uri, num_replace = re.subn("[:](?!\\/\\/)", "#", uri) 

408 if num_replace: 

409 raise ValueError( 

410 f"The pipeline file {uri} seems to use the legacy :" 

411 " to separate labels, please use # instead." 

412 ) 

413 if uri.count("#") > 1: 

414 raise ValueError("Only one set of labels is allowed when specifying a pipeline to load") 

415 # Everything else can be converted directly to ResourcePath. 

416 uri = ResourcePath(uri) 

417 label_subset = uri.fragment or None 

418 

419 specifier: Optional[LabelSpecifier] 

420 if label_subset is not None: 

421 label_subset = urllib.parse.unquote(label_subset) 

422 args: Dict[str, Union[Set[str], str, None]] 

423 # labels supplied as a list 

424 if "," in label_subset: 

425 if ".." in label_subset: 

426 raise ValueError( 

427 "Can only specify a list of labels or a rangewhen loading a Pipline not both" 

428 ) 

429 args = {"labels": set(label_subset.split(","))} 

430 # labels supplied as a range 

431 elif ".." in label_subset: 

432 # Try to de-structure the labelSubset, this will fail if more 

433 # than one range is specified 

434 begin, end, *rest = label_subset.split("..") 

435 if rest: 

436 raise ValueError("Only one range can be specified when loading a pipeline") 

437 args = {"begin": begin if begin else None, "end": end if end else None} 

438 # Assume anything else is a single label 

439 else: 

440 args = {"labels": {label_subset}} 

441 

442 # MyPy doesn't like how cavalier kwarg construction is with types. 

443 specifier = LabelSpecifier(**args) # type: ignore 

444 else: 

445 specifier = None 

446 

447 return uri, specifier 

448 

449 @classmethod 

450 def fromString(cls, pipeline_string: str) -> Pipeline: 

451 """Create a pipeline from string formatted as a pipeline document. 

452 

453 Parameters 

454 ---------- 

455 pipeline_string : `str` 

456 A string that is formatted like a pipeline document.

457 

458 Returns 

459 ------- 

460 pipeline: `Pipeline` 

461 """ 

462 pipeline = cls.fromIR(pipelineIR.PipelineIR.from_string(pipeline_string)) 

463 return pipeline 

464 
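# Illustrative sketch (not part of the original module): building a pipeline
# from an in-memory YAML document. The task class path is hypothetical and
# the exact document schema is defined by pipelineIR.PipelineIR.
#
#     >>> pipeline = Pipeline.fromString(
#     ...     "description: A one-task example\n"
#     ...     "tasks:\n"
#     ...     "  example:\n"
#     ...     "    class: lsst.example.tasks.ExampleTask\n"
#     ... )
#     >>> len(pipeline)
#     1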

465 @classmethod 

466 def fromIR(cls, deserialized_pipeline: pipelineIR.PipelineIR) -> Pipeline: 

467 """Create a pipeline from an already created `PipelineIR` object. 

468 

469 Parameters 

470 ---------- 

471 deserialized_pipeline: `PipelineIR` 

472 An already created pipeline intermediate representation object 

473 

474 Returns 

475 ------- 

476 pipeline: `Pipeline` 

477 """ 

478 pipeline = cls.__new__(cls) 

479 pipeline._pipelineIR = deserialized_pipeline 

480 return pipeline 

481 

482 @classmethod 

483 def fromPipeline(cls, pipeline: Pipeline) -> Pipeline: 

484 """Create a new pipeline by copying an already existing `Pipeline`. 

485 

486 Parameters 

487 ---------- 

488 pipeline: `Pipeline` 

489 An already created pipeline intermediate representation object 

490 

491 Returns 

492 ------- 

493 pipeline: `Pipeline` 

494 """ 

495 return cls.fromIR(copy.deepcopy(pipeline._pipelineIR)) 

496 

497 def __str__(self) -> str: 

498 return str(self._pipelineIR) 

499 

500 def mergePipeline(self, pipeline: Pipeline) -> None: 

501 """Merge another in-memory `Pipeline` object into this one. 

502 

503 This merges another pipeline into this object, as if it were declared 

504 in the import block of the yaml definition of this pipeline. This 

505 modifies this pipeline in place. 

506 

507 Parameters 

508 ---------- 

509 pipeline : `Pipeline` 

510 The `Pipeline` object that is to be merged into this object. 

511 """ 

512 self._pipelineIR.merge_pipelines((pipeline._pipelineIR,)) 

513 

514 def addLabelToSubset(self, subset: str, label: str) -> None: 

515 """Add a task label from the specified subset. 

516 

517 Parameters 

518 ---------- 

519 subset : `str` 

520 The labeled subset to modify 

521 label : `str` 

522 The task label to add to the specified subset. 

523 

524 Raises 

525 ------ 

526 ValueError 

527 Raised if the specified subset does not exist within the pipeline. 

528 Raised if the specified label does not exist within the pipeline. 

529 """ 

530 if label not in self._pipelineIR.tasks: 

531 raise ValueError(f"Label {label} does not appear within the pipeline") 

532 if subset not in self._pipelineIR.labeled_subsets: 

533 raise ValueError(f"Subset {subset} does not appear within the pipeline") 

534 self._pipelineIR.labeled_subsets[subset].subset.add(label) 

535 

536 def removeLabelFromSubset(self, subset: str, label: str) -> None: 

537 """Remove a task label from the specified subset. 

538 

539 Parameters 

540 ---------- 

541 subset : `str` 

542 The labeled subset to modify 

543 label : `str` 

544 The task label to remove from the specified subset. 

545 

546 Raises 

547 ------ 

548 ValueError 

549 Raised if the specified subset does not exist in the pipeline. 

550 Raised if the specified label does not exist within the specified 

551 subset. 

552 """ 

553 if subset not in self._pipelineIR.labeled_subsets: 

554 raise ValueError(f"Subset {subset} does not appear within the pipeline") 

555 if label not in self._pipelineIR.labeled_subsets[subset].subset: 

556 raise ValueError(f"Label {label} does not appear within the pipeline") 

557 self._pipelineIR.labeled_subsets[subset].subset.remove(label) 

558 

559 def findSubsetsWithLabel(self, label: str) -> set[str]: 

560 """Find any subsets which may contain the specified label. 

561 

562 This function returns the names of subsets which contain the specified

563 label. It may return an empty set if there are no subsets, or no subsets

564 containing the specified label.

565 

566 Parameters 

567 ---------- 

568 label : `str` 

569 The task label to use in membership check 

570 

571 Returns 

572 ------- 

573 subsets : `set` of `str` 

574 Returns a set (possibly empty) of subset names which contain the

575 specified label. 

576 

577 Raises 

578 ------ 

579 ValueError 

580 Raised if the specified label does not exist within this pipeline. 

581 """ 

582 results = set() 

583 if label not in self._pipelineIR.tasks: 

584 raise ValueError(f"Label {label} does not appear within the pipeline") 

585 for subset in self._pipelineIR.labeled_subsets.values(): 

586 if label in subset.subset: 

587 results.add(subset.label) 

588 return results 

589 
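# Illustrative sketch (not part of the original module): managing labeled
# subsets on a pipeline that already defines a subset named "step1" and a
# task labeled "isr" (both hypothetical).
#
#     >>> pipeline.addLabelToSubset("step1", "isr")
#     >>> pipeline.findSubsetsWithLabel("isr")
#     {'step1'}
#     >>> pipeline.removeLabelFromSubset("step1", "isr")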

590 def addInstrument(self, instrument: Union[Instrument, str]) -> None: 

591 """Add an instrument to the pipeline, or replace an instrument that is 

592 already defined. 

593 

594 Parameters 

595 ---------- 

596 instrument : `~lsst.obs.base.Instrument` or `str`

597 Either a derived class object of `lsst.obs.base.Instrument` or

598 a string corresponding to a fully qualified

599 `lsst.obs.base.Instrument` name.

600 """ 

601 if isinstance(instrument, str): 

602 pass 

603 else: 

604 # TODO: assume that this is a subclass of Instrument, no type 

605 # checking 

606 instrument = get_full_type_name(instrument) 

607 self._pipelineIR.instrument = instrument 

608 

609 def getInstrument(self) -> Optional[str]: 

610 """Get the instrument from the pipeline. 

611 

612 Returns 

613 ------- 

614 instrument : `str`, or `None`

615 The fully qualified name of a `lsst.obs.base.Instrument` subclass,

616 or `None` if the pipeline does not have an instrument.

617 """ 

618 return self._pipelineIR.instrument 

619 

620 def addTask(self, task: Union[Type[PipelineTask], str], label: str) -> None: 

621 """Add a new task to the pipeline, or replace a task that is already 

622 associated with the supplied label. 

623 

624 Parameters 

625 ---------- 

626 task: `PipelineTask` or `str` 

627 Either a derived class object of a `PipelineTask` or a string 

628 corresponding to a fully qualified `PipelineTask` name. 

629 label: `str` 

630 A label that is used to identify the `PipelineTask` being added 

631 """ 

632 if isinstance(task, str): 

633 taskName = task 

634 elif issubclass(task, PipelineTask): 

635 taskName = get_full_type_name(task) 

636 else: 

637 raise ValueError( 

638 "task must be either a child class of PipelineTask or a string containing" 

639 " a fully qualified name to one" 

640 ) 

641 if not label: 

642 # in some cases (with a command line-generated pipeline) tasks can

643 # be defined without a label, which is not acceptable; use the task's

644 # _DefaultName in that case

645 if isinstance(task, str): 

646 task_class = doImportType(task) 

647 label = task_class._DefaultName 

648 self._pipelineIR.tasks[label] = pipelineIR.TaskIR(label, taskName) 

649 
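# Illustrative sketch (not part of the original module): assembling a pipeline
# in memory. The instrument and task class paths are hypothetical.
#
#     >>> pipeline = Pipeline("An example pipeline")
#     >>> pipeline.addInstrument("lsst.example.ExampleInstrument")
#     >>> pipeline.addTask("lsst.example.tasks.ExampleTask", "example")
#     >>> pipeline.removeTask("example")  # KeyError if the label is absent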

650 def removeTask(self, label: str) -> None: 

651 """Remove a task from the pipeline. 

652 

653 Parameters 

654 ---------- 

655 label : `str` 

656 The label used to identify the task that is to be removed 

657 

658 Raises 

659 ------ 

660 KeyError 

661 If no task with that label exists in the pipeline 

662 

663 """ 

664 self._pipelineIR.tasks.pop(label) 

665 

666 def addConfigOverride(self, label: str, key: str, value: object) -> None: 

667 """Apply single config override. 

668 

669 Parameters 

670 ---------- 

671 label : `str` 

672 Label of the task. 

673 key: `str` 

674 Fully-qualified field name. 

675 value : object 

676 Value to be given to a field. 

677 """ 

678 self._addConfigImpl(label, pipelineIR.ConfigIR(rest={key: value})) 

679 

680 def addConfigFile(self, label: str, filename: str) -> None: 

681 """Add overrides from a specified file. 

682 

683 Parameters 

684 ---------- 

685 label : `str` 

686 The label used to identify the task associated with config to 

687 modify 

688 filename : `str` 

689 Path to the override file. 

690 """ 

691 self._addConfigImpl(label, pipelineIR.ConfigIR(file=[filename])) 

692 

693 def addConfigPython(self, label: str, pythonString: str) -> None: 

694 """Add Overrides by running a snippet of python code against a config. 

695 

696 Parameters 

697 ---------- 

698 label : `str` 

699 The label used to identify the task associated with config to

700 modify. 

701 pythonString: `str` 

702 A string which is valid python code to be executed. This is done 

703 with config as the only local accessible value. 

704 """ 

705 self._addConfigImpl(label, pipelineIR.ConfigIR(python=pythonString)) 

706 
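# Illustrative sketch (not part of the original module): the three override
# flavors plus the special "parameters" label handled by _addConfigImpl. The
# field names and file path are hypothetical.
#
#     >>> pipeline.addConfigOverride("example", "doWrite", False)
#     >>> pipeline.addConfigFile("example", "overrides/example.py")
#     >>> pipeline.addConfigPython("example", "config.doWrite = False")
#     >>> pipeline.addConfigOverride("parameters", "threshold", 5.0)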

707 def _addConfigImpl(self, label: str, newConfig: pipelineIR.ConfigIR) -> None: 

708 if label == "parameters": 

709 self._pipelineIR.parameters.mapping.update(newConfig.rest) 

710 if newConfig.file: 

711 raise ValueError("Setting parameters section with config file is not supported") 

712 if newConfig.python: 

713 raise ValueError("Setting parameters section using python block is unsupported")

714 return 

715 if label not in self._pipelineIR.tasks: 

716 raise LookupError(f"There are no tasks labeled '{label}' in the pipeline") 

717 self._pipelineIR.tasks[label].add_or_update_config(newConfig) 

718 

719 def write_to_uri(self, uri: ResourcePathExpression) -> None: 

720 """Write the pipeline to a file or directory. 

721 

722 Parameters 

723 ---------- 

724 uri : convertible to `ResourcePath` 

725 URI to write to; may have any scheme with `ResourcePath` write 

726 support or no scheme for a local file/directory. Should have a 

727 ``.yaml`` extension.

728 """ 

729 self._pipelineIR.write_to_uri(uri) 

730 

731 def toExpandedPipeline(self) -> Generator[TaskDef, None, None]: 

732 """Returns a generator of TaskDefs which can be used to create quantum 

733 graphs. 

734 

735 Returns 

736 ------- 

737 generator : generator of `TaskDef` 

738 The generator returned will be the sorted iterator of tasks which 

739 are to be used in constructing a quantum graph. 

740 

741 Raises 

742 ------ 

743 NotImplementedError 

744 If a dataId is supplied in a config block. This is in place for 

745 future use 

746 """ 

747 taskDefs = [] 

748 for label in self._pipelineIR.tasks: 

749 taskDefs.append(self._buildTaskDef(label)) 

750 

751 # let's evaluate the contracts

752 if self._pipelineIR.contracts is not None: 

753 label_to_config = {x.label: x.config for x in taskDefs} 

754 for contract in self._pipelineIR.contracts: 

755 # execute this in its own line so it can raise a good error 

756 # message if there were problems with the eval

757 success = eval(contract.contract, None, label_to_config) 

758 if not success: 

759 extra_info = f": {contract.msg}" if contract.msg is not None else "" 

760 raise pipelineIR.ContractError( 

761 f"Contract(s) '{contract.contract}' were not satisfied{extra_info}" 

762 ) 

763 

764 taskDefs = sorted(taskDefs, key=lambda x: x.label) 

765 yield from pipeTools.orderPipeline(taskDefs) 

766 
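# Illustrative note (not part of the original module): contracts are plain
# Python expressions evaluated with every task label bound to its expanded
# config, so a hypothetical contract "example.doWrite == other.doWrite"
# succeeds only if
#
#     eval("example.doWrite == other.doWrite", None,
#          {"example": example_config, "other": other_config})
#
# is truthy; otherwise toExpandedPipeline raises pipelineIR.ContractError.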

767 def _buildTaskDef(self, label: str) -> TaskDef: 

768 if (taskIR := self._pipelineIR.tasks.get(label)) is None: 

769 raise NameError(f"Label {label} does not appear in this pipeline") 

770 taskClass: Type[PipelineTask] = doImportType(taskIR.klass) 

771 taskName = get_full_type_name(taskClass) 

772 config = taskClass.ConfigClass() 

773 instrument: PipeBaseInstrument | None = None 

774 if (instrumentName := self._pipelineIR.instrument) is not None: 

775 instrument_cls: type = doImportType(instrumentName) 

776 instrument = instrument_cls() 

777 config.applyConfigOverrides( 

778 instrument, 

779 getattr(taskClass, "_DefaultName", ""), 

780 taskIR.config, 

781 self._pipelineIR.parameters, 

782 label, 

783 ) 

784 return TaskDef(taskName=taskName, config=config, taskClass=taskClass, label=label) 

785 

786 def __iter__(self) -> Generator[TaskDef, None, None]: 

787 return self.toExpandedPipeline() 

788 

789 def __getitem__(self, item: str) -> TaskDef: 

790 return self._buildTaskDef(item) 

791 

792 def __len__(self) -> int: 

793 return len(self._pipelineIR.tasks) 

794 

795 def __eq__(self, other: object) -> bool: 

796 if not isinstance(other, Pipeline): 

797 return False 

798 elif self._pipelineIR == other._pipelineIR: 

799 # Shortcut: if the IR is the same, the expanded pipeline must be 

800 # the same as well. But the converse is not true. 

801 return True 

802 else: 

803 self_expanded = {td.label: (td.taskClass,) for td in self} 

804 other_expanded = {td.label: (td.taskClass,) for td in other} 

805 if self_expanded != other_expanded: 

806 return False 

807 # After DM-27847, we should compare configuration here, or better, 

808 # delegate to TaskDef.__eq__ after making that compare configurations.

809 raise NotImplementedError( 

810 "Pipelines cannot be compared because config instances cannot be compared; see DM-27847." 

811 ) 

812 

813 

814@dataclass(frozen=True) 

815class TaskDatasetTypes: 

816 """An immutable struct that extracts and classifies the dataset types used 

817 by a `PipelineTask` 

818 """ 

819 

820 initInputs: NamedValueSet[DatasetType] 

821 """Dataset types that are needed as inputs in order to construct this Task. 

822 

823 Task-level `initInputs` may be classified as either 

824 `~PipelineDatasetTypes.initInputs` or 

825 `~PipelineDatasetTypes.initIntermediates` at the Pipeline level. 

826 """ 

827 

828 initOutputs: NamedValueSet[DatasetType] 

829 """Dataset types that may be written after constructing this Task. 

830 

831 Task-level `initOutputs` may be classified as either 

832 `~PipelineDatasetTypes.initOutputs` or 

833 `~PipelineDatasetTypes.initIntermediates` at the Pipeline level. 

834 """ 

835 

836 inputs: NamedValueSet[DatasetType] 

837 """Dataset types that are regular inputs to this Task. 

838 

839 If an input dataset needed for a Quantum cannot be found in the input 

840 collection(s) or produced by another Task in the Pipeline, that Quantum 

841 (and all dependent Quanta) will not be produced. 

842 

843 Task-level `inputs` may be classified as either 

844 `~PipelineDatasetTypes.inputs` or `~PipelineDatasetTypes.intermediates` 

845 at the Pipeline level. 

846 """ 

847 

848 queryConstraints: NamedValueSet[DatasetType] 

849 """Regular inputs that should not be used as constraints on the initial 

850 QuantumGraph generation data ID query, according to their tasks 

851 (`NamedValueSet`). 

852 """ 

853 

854 prerequisites: NamedValueSet[DatasetType] 

855 """Dataset types that are prerequisite inputs to this Task. 

856 

857 Prerequisite inputs must exist in the input collection(s) before the 

858 pipeline is run, but do not constrain the graph - if a prerequisite is 

859 missing for a Quantum, `PrerequisiteMissingError` is raised. 

860 

861 Prerequisite inputs are not resolved until the second stage of 

862 QuantumGraph generation. 

863 """ 

864 

865 outputs: NamedValueSet[DatasetType] 

866 """Dataset types that are produced by this Task. 

867 

868 Task-level `outputs` may be classified as either 

869 `~PipelineDatasetTypes.outputs` or `~PipelineDatasetTypes.intermediates` 

870 at the Pipeline level. 

871 """ 

872 

873 @classmethod 

874 def fromTaskDef( 

875 cls, 

876 taskDef: TaskDef, 

877 *, 

878 registry: Registry, 

879 include_configs: bool = True, 

880 storage_class_mapping: Optional[Mapping[str, str]] = None, 

881 ) -> TaskDatasetTypes: 

882 """Extract and classify the dataset types from a single `PipelineTask`. 

883 

884 Parameters 

885 ---------- 

886 taskDef: `TaskDef` 

887 An instance of a `TaskDef` class for a particular `PipelineTask`. 

888 registry: `Registry` 

889 Registry used to construct normalized `DatasetType` objects and 

890 retrieve those that are incomplete. 

891 include_configs : `bool`, optional 

892 If `True` (default) include config dataset types as 

893 ``initOutputs``. 

894 storage_class_mapping : `Mapping` of `str` to `StorageClass`, optional 

895 If a taskdef contains a component dataset type that is unknown 

896 to the registry, its parent StorageClass will be looked up in this 

897 mapping if it is supplied. If the mapping does not contain the 

898 composite dataset type, or the mapping is not supplied, an exception

899 will be raised. 

900 

901 Returns 

902 ------- 

903 types: `TaskDatasetTypes` 

904 The dataset types used by this task. 

905 

906 Raises 

907 ------ 

908 ValueError 

909 Raised if dataset type connection definition differs from 

910 registry definition. 

911 LookupError 

912 Raised if component parent StorageClass could not be determined 

913 and storage_class_mapping does not contain the composite type, or 

914 is set to None. 

915 """ 

916 

917 def makeDatasetTypesSet( 

918 connectionType: str, 

919 is_input: bool, 

920 freeze: bool = True, 

921 ) -> NamedValueSet[DatasetType]: 

922 """Constructs a set of true `DatasetType` objects 

923 

924 Parameters 

925 ---------- 

926 connectionType : `str` 

927 Name of the connection type to produce a set for, corresponds 

928 to an attribute of type `list` on the connection class instance 

929 is_input : `bool` 

930 If `True`, these are input dataset types; otherwise they are

931 output dataset types.

932 freeze : `bool`, optional 

933 If `True`, call `NamedValueSet.freeze` on the object returned. 

934 

935 Returns 

936 ------- 

937 datasetTypes : `NamedValueSet` 

938 A set of all datasetTypes which correspond to the input 

939 connection type specified in the connection class of this 

940 `PipelineTask` 

941 

942 Raises 

943 ------ 

944 ValueError 

945 Raised if dataset type connection definition differs from 

946 registry definition. 

947 LookupError 

948 Raised if component parent StorageClass could not be determined 

949 and storage_class_mapping does not contain the composite type, 

950 or is set to None. 

951 

952 Notes 

953 ----- 

954 This function is a closure over the variables ``registry``,

955 ``taskDef``, and ``storage_class_mapping``. 

956 """ 

957 datasetTypes = NamedValueSet[DatasetType]() 

958 for c in iterConnections(taskDef.connections, connectionType): 

959 dimensions = set(getattr(c, "dimensions", set())) 

960 if "skypix" in dimensions: 

961 try: 

962 datasetType = registry.getDatasetType(c.name) 

963 except LookupError as err: 

964 raise LookupError( 

965 f"DatasetType '{c.name}' referenced by " 

966 f"{type(taskDef.connections).__name__} uses 'skypix' as a dimension " 

967 "placeholder, but does not already exist in the registry. " 

968 "Note that reference catalog names are now used as the dataset " 

969 "type name instead of 'ref_cat'." 

970 ) from err 

971 rest1 = set(registry.dimensions.extract(dimensions - set(["skypix"])).names) 

972 rest2 = set( 

973 dim.name for dim in datasetType.dimensions if not isinstance(dim, SkyPixDimension) 

974 ) 

975 if rest1 != rest2: 

976 raise ValueError( 

977 f"Non-skypix dimensions for dataset type {c.name} declared in " 

978 f"connections ({rest1}) are inconsistent with those in " 

979 f"registry's version of this dataset ({rest2})." 

980 ) 

981 else: 

982 # Component dataset types are not explicitly in the 

983 # registry. This complicates consistency checks with 

984 # registry and requires we work out the composite storage 

985 # class. 

986 registryDatasetType = None 

987 try: 

988 registryDatasetType = registry.getDatasetType(c.name) 

989 except KeyError: 

990 compositeName, componentName = DatasetType.splitDatasetTypeName(c.name) 

991 if componentName: 

992 if storage_class_mapping is None or compositeName not in storage_class_mapping: 

993 raise LookupError( 

994 "Component parent class cannot be determined, and " 

995 "composite name was not in storage class mapping, or no " 

996 "storage_class_mapping was supplied" 

997 ) 

998 else: 

999 parentStorageClass = storage_class_mapping[compositeName] 

1000 else: 

1001 parentStorageClass = None 

1002 datasetType = c.makeDatasetType( 

1003 registry.dimensions, parentStorageClass=parentStorageClass 

1004 ) 

1005 registryDatasetType = datasetType 

1006 else: 

1007 datasetType = c.makeDatasetType( 

1008 registry.dimensions, parentStorageClass=registryDatasetType.parentStorageClass 

1009 ) 

1010 

1011 if registryDatasetType and datasetType != registryDatasetType: 

1012 # The dataset types differ but first check to see if 

1013 # they are compatible before raising. 

1014 if is_input: 

1015 # This DatasetType must be compatible on get. 

1016 is_compatible = datasetType.is_compatible_with(registryDatasetType) 

1017 else: 

1018 # Has to be able to be converted to the expected type

1019 # on put. 

1020 is_compatible = registryDatasetType.is_compatible_with(datasetType) 

1021 if is_compatible: 

1022 # For inputs we want the pipeline to use the 

1023 # pipeline definition, for outputs it should use 

1024 # the registry definition. 

1025 if not is_input: 

1026 datasetType = registryDatasetType 

1027 _LOG.debug( 

1028 "Dataset types differ (task %s != registry %s) but are compatible" 

1029 " for %s in %s.", 

1030 datasetType, 

1031 registryDatasetType, 

1032 "input" if is_input else "output", 

1033 taskDef.label, 

1034 ) 

1035 else: 

1036 try: 

1037 # Explicitly check for storage class just to 

1038 # make more specific message. 

1039 _ = datasetType.storageClass 

1040 except KeyError: 

1041 raise ValueError( 

1042 "Storage class does not exist for supplied dataset type " 

1043 f"{datasetType} for {taskDef.label}." 

1044 ) from None 

1045 raise ValueError( 

1046 f"Supplied dataset type ({datasetType}) inconsistent with " 

1047 f"registry definition ({registryDatasetType}) " 

1048 f"for {taskDef.label}." 

1049 ) 

1050 datasetTypes.add(datasetType) 

1051 if freeze: 

1052 datasetTypes.freeze() 

1053 return datasetTypes 

1054 

1055 # optionally add initOutput dataset for config 

1056 initOutputs = makeDatasetTypesSet("initOutputs", is_input=False, freeze=False) 

1057 if include_configs: 

1058 initOutputs.add( 

1059 DatasetType( 

1060 taskDef.configDatasetName, 

1061 registry.dimensions.empty, 

1062 storageClass="Config", 

1063 ) 

1064 ) 

1065 initOutputs.freeze() 

1066 

1067 # optionally add output dataset for metadata 

1068 outputs = makeDatasetTypesSet("outputs", is_input=False, freeze=False) 

1069 if taskDef.metadataDatasetName is not None: 

1070 # Metadata is supposed to be of the TaskMetadata type, its 

1071 # dimensions correspond to a task quantum. 

1072 dimensions = registry.dimensions.extract(taskDef.connections.dimensions) 

1073 

1074 # Allow the storage class definition to be read from the existing 

1075 # dataset type definition if present. 

1076 try: 

1077 current = registry.getDatasetType(taskDef.metadataDatasetName) 

1078 except KeyError: 

1079 # No previous definition so use the default. 

1080 storageClass = "TaskMetadata" if _TASK_METADATA_TYPE is TaskMetadata else "PropertySet" 

1081 else: 

1082 storageClass = current.storageClass.name 

1083 

1084 outputs.update({DatasetType(taskDef.metadataDatasetName, dimensions, storageClass)}) 

1085 if taskDef.logOutputDatasetName is not None: 

1086 # Log output dimensions correspond to a task quantum. 

1087 dimensions = registry.dimensions.extract(taskDef.connections.dimensions) 

1088 outputs.update({DatasetType(taskDef.logOutputDatasetName, dimensions, "ButlerLogRecords")}) 

1089 

1090 outputs.freeze() 

1091 

1092 inputs = makeDatasetTypesSet("inputs", is_input=True) 

1093 queryConstraints = NamedValueSet( 

1094 inputs[c.name] 

1095 for c in cast(Iterable[Input], iterConnections(taskDef.connections, "inputs")) 

1096 if not c.deferGraphConstraint 

1097 ) 

1098 

1099 return cls( 

1100 initInputs=makeDatasetTypesSet("initInputs", is_input=True), 

1101 initOutputs=initOutputs, 

1102 inputs=inputs, 

1103 queryConstraints=queryConstraints, 

1104 prerequisites=makeDatasetTypesSet("prerequisiteInputs", is_input=True), 

1105 outputs=outputs, 

1106 ) 

1107 
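# Illustrative sketch (not part of the original module): classifying one
# task's dataset types against a butler registry. The repository path is
# hypothetical and "pipeline" is assumed to be a Pipeline instance.
#
#     >>> from lsst.daf.butler import Butler
#     >>> registry = Butler("/path/to/repo").registry
#     >>> taskDef = next(iter(pipeline))
#     >>> types = TaskDatasetTypes.fromTaskDef(taskDef, registry=registry)
#     >>> sorted(t.name for t in types.outputs)  # includes metadata/log outputs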

1108 

1109@dataclass(frozen=True) 

1110class PipelineDatasetTypes: 

1111 """An immutable struct that classifies the dataset types used in a 

1112 `Pipeline`. 

1113 """ 

1114 

1115 packagesDatasetName: ClassVar[str] = "packages" 

1116 """Name of a dataset type used to save package versions. 

1117 """ 

1118 

1119 initInputs: NamedValueSet[DatasetType] 

1120 """Dataset types that are needed as inputs in order to construct the Tasks 

1121 in this Pipeline. 

1122 

1123 This does not include dataset types that are produced when constructing 

1124 other Tasks in the Pipeline (these are classified as `initIntermediates`). 

1125 """ 

1126 

1127 initOutputs: NamedValueSet[DatasetType] 

1128 """Dataset types that may be written after constructing the Tasks in this 

1129 Pipeline. 

1130 

1131 This does not include dataset types that are also used as inputs when 

1132 constructing other Tasks in the Pipeline (these are classified as 

1133 `initIntermediates`). 

1134 """ 

1135 

1136 initIntermediates: NamedValueSet[DatasetType] 

1137 """Dataset types that are both used when constructing one or more Tasks 

1138 in the Pipeline and produced as a side-effect of constructing another 

1139 Task in the Pipeline. 

1140 """ 

1141 

1142 inputs: NamedValueSet[DatasetType] 

1143 """Dataset types that are regular inputs for the full pipeline. 

1144 

1145 If an input dataset needed for a Quantum cannot be found in the input 

1146 collection(s), that Quantum (and all dependent Quanta) will not be 

1147 produced. 

1148 """ 

1149 

1150 queryConstraints: NamedValueSet[DatasetType] 

1151 """Regular inputs that should be used as constraints on the initial 

1152 QuantumGraph generation data ID query, according to their tasks 

1153 (`NamedValueSet`). 

1154 """ 

1155 

1156 prerequisites: NamedValueSet[DatasetType] 

1157 """Dataset types that are prerequisite inputs for the full Pipeline. 

1158 

1159 Prerequisite inputs must exist in the input collection(s) before the 

1160 pipeline is run, but do not constrain the graph - if a prerequisite is 

1161 missing for a Quantum, `PrerequisiteMissingError` is raised. 

1162 

1163 Prerequisite inputs are not resolved until the second stage of 

1164 QuantumGraph generation. 

1165 """ 

1166 

1167 intermediates: NamedValueSet[DatasetType] 

1168 """Dataset types that are output by one Task in the Pipeline and consumed 

1169 as inputs by one or more other Tasks in the Pipeline. 

1170 """ 

1171 

1172 outputs: NamedValueSet[DatasetType] 

1173 """Dataset types that are output by a Task in the Pipeline and not consumed 

1174 by any other Task in the Pipeline. 

1175 """ 

1176 

1177 byTask: Mapping[str, TaskDatasetTypes] 

1178 """Per-Task dataset types, keyed by label in the `Pipeline`. 

1179 

1180 This is guaranteed to be zip-iterable with the `Pipeline` itself (assuming 

1181 neither has been modified since the dataset types were extracted, of 

1182 course). 

1183 """ 

1184 

1185 @classmethod 

1186 def fromPipeline( 

1187 cls, 

1188 pipeline: Union[Pipeline, Iterable[TaskDef]], 

1189 *, 

1190 registry: Registry, 

1191 include_configs: bool = True, 

1192 include_packages: bool = True, 

1193 ) -> PipelineDatasetTypes: 

1194 """Extract and classify the dataset types from all tasks in a 

1195 `Pipeline`. 

1196 

1197 Parameters 

1198 ---------- 

1199 pipeline: `Pipeline` or `Iterable` [ `TaskDef` ] 

1200 A collection of tasks that can be run together. 

1201 registry: `Registry` 

1202 Registry used to construct normalized `DatasetType` objects and 

1203 retrieve those that are incomplete. 

1204 include_configs : `bool`, optional 

1205 If `True` (default) include config dataset types as 

1206 ``initOutputs``. 

1207 include_packages : `bool`, optional 

1208 If `True` (default) include the dataset type for software package 

1209 versions in ``initOutputs``. 

1210 

1211 Returns 

1212 ------- 

1213 types: `PipelineDatasetTypes` 

1214 The dataset types used by this `Pipeline`. 

1215 

1216 Raises 

1217 ------ 

1218 ValueError 

1219 Raised if Tasks are inconsistent about which datasets are marked 

1220 prerequisite. This indicates that the Tasks cannot be run as part 

1221 of the same `Pipeline`. 

1222 """ 

1223 allInputs = NamedValueSet[DatasetType]() 

1224 allOutputs = NamedValueSet[DatasetType]() 

1225 allInitInputs = NamedValueSet[DatasetType]() 

1226 allInitOutputs = NamedValueSet[DatasetType]() 

1227 prerequisites = NamedValueSet[DatasetType]() 

1228 queryConstraints = NamedValueSet[DatasetType]() 

1229 byTask = dict() 

1230 if include_packages: 

1231 allInitOutputs.add( 

1232 DatasetType( 

1233 cls.packagesDatasetName, 

1234 registry.dimensions.empty, 

1235 storageClass="Packages", 

1236 ) 

1237 ) 

1238 # create a list of TaskDefs in case the input is a generator 

1239 pipeline = list(pipeline) 

1240 

1241 # collect all the output dataset types 

1242 typeStorageclassMap: Dict[str, str] = {} 

1243 for taskDef in pipeline: 

1244 for outConnection in iterConnections(taskDef.connections, "outputs"): 

1245 typeStorageclassMap[outConnection.name] = outConnection.storageClass 

1246 

1247 for taskDef in pipeline: 

1248 thisTask = TaskDatasetTypes.fromTaskDef( 

1249 taskDef, 

1250 registry=registry, 

1251 include_configs=include_configs, 

1252 storage_class_mapping=typeStorageclassMap, 

1253 ) 

1254 allInitInputs.update(thisTask.initInputs) 

1255 allInitOutputs.update(thisTask.initOutputs) 

1256 allInputs.update(thisTask.inputs) 

1257 # Inputs are query constraints if any task considers them a query 

1258 # constraint. 

1259 queryConstraints.update(thisTask.queryConstraints) 

1260 prerequisites.update(thisTask.prerequisites) 

1261 allOutputs.update(thisTask.outputs) 

1262 byTask[taskDef.label] = thisTask 

1263 if not prerequisites.isdisjoint(allInputs): 

1264 raise ValueError( 

1265 "{} marked as both prerequisites and regular inputs".format( 

1266 {dt.name for dt in allInputs & prerequisites} 

1267 ) 

1268 ) 

1269 if not prerequisites.isdisjoint(allOutputs): 

1270 raise ValueError( 

1271 "{} marked as both prerequisites and outputs".format( 

1272 {dt.name for dt in allOutputs & prerequisites} 

1273 ) 

1274 ) 

1275 # Make sure that components which are marked as inputs get treated as 

1276 # intermediates if there is an output which produces the composite 

1277 # containing the component 

1278 intermediateComponents = NamedValueSet[DatasetType]() 

1279 intermediateComposites = NamedValueSet[DatasetType]() 

1280 outputNameMapping = {dsType.name: dsType for dsType in allOutputs} 

1281 for dsType in allInputs: 

1282 # get the name of a possible component 

1283 name, component = dsType.nameAndComponent() 

1284 # if there is a component name, that means this is a component 

1285 # DatasetType, if there is an output which produces the parent of 

1286 # this component, treat this input as an intermediate 

1287 if component is not None: 

1288 # This needs to be in this if block, because someone might have 

1289 # a composite that is a pure input from existing data 

1290 if name in outputNameMapping: 

1291 intermediateComponents.add(dsType) 

1292 intermediateComposites.add(outputNameMapping[name]) 

1293 

1294 def checkConsistency(a: NamedValueSet, b: NamedValueSet) -> None: 

1295 common = a.names & b.names 

1296 for name in common: 

1297 # Any compatibility is allowed. This function does not know 

1298 # if a dataset type is to be used for input or output. 

1299 if not (a[name].is_compatible_with(b[name]) or b[name].is_compatible_with(a[name])): 

1300 raise ValueError(f"Conflicting definitions for dataset type: {a[name]} != {b[name]}.") 

1301 

1302 checkConsistency(allInitInputs, allInitOutputs) 

1303 checkConsistency(allInputs, allOutputs) 

1304 checkConsistency(allInputs, intermediateComposites) 

1305 checkConsistency(allOutputs, intermediateComposites) 

1306 

1307 def frozen(s: AbstractSet[DatasetType]) -> NamedValueSet[DatasetType]: 

1308 assert isinstance(s, NamedValueSet) 

1309 s.freeze() 

1310 return s 

1311 

1312 inputs = frozen(allInputs - allOutputs - intermediateComponents) 

1313 

1314 return cls( 

1315 initInputs=frozen(allInitInputs - allInitOutputs), 

1316 initIntermediates=frozen(allInitInputs & allInitOutputs), 

1317 initOutputs=frozen(allInitOutputs - allInitInputs), 

1318 inputs=inputs, 

1319 queryConstraints=frozen(queryConstraints & inputs), 

1320 # If there are storage class differences in inputs and outputs 

1321 # the intermediates have to choose priority. Here choose that 

1322 # inputs to tasks must match the requested storage class by

1323 # applying the inputs over the top of the outputs. 

1324 intermediates=frozen(allOutputs & allInputs | intermediateComponents), 

1325 outputs=frozen(allOutputs - allInputs - intermediateComposites), 

1326 prerequisites=frozen(prerequisites), 

1327 byTask=MappingProxyType(byTask), # MappingProxyType -> frozen view of dict for immutability 

1328 ) 

1329 
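# Illustrative sketch (not part of the original module): classifying every
# dataset type in a pipeline at once; "pipeline" and "registry" are assumed
# to exist as a Pipeline and an lsst.daf.butler Registry.
#
#     >>> dataset_types = PipelineDatasetTypes.fromPipeline(pipeline, registry=registry)
#     >>> dataset_types.packagesDatasetName
#     'packages'
#     >>> set(dataset_types.byTask) == {td.label for td in pipeline}
#     True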

1330 @classmethod 

1331 def initOutputNames( 

1332 cls, 

1333 pipeline: Union[Pipeline, Iterable[TaskDef]], 

1334 *, 

1335 include_configs: bool = True, 

1336 include_packages: bool = True, 

1337 ) -> Iterator[str]: 

1338 """Return the names of dataset types ot task initOutputs, Configs, 

1339 and package versions for a pipeline. 

1340 

1341 Parameters 

1342 ---------- 

1343 pipeline: `Pipeline` or `Iterable` [ `TaskDef` ] 

1344 A `Pipeline` instance or collection of `TaskDef` instances. 

1345 include_configs : `bool`, optional 

1346 If `True` (default) include config dataset types. 

1347 include_packages : `bool`, optional 

1348 If `True` (default) include the dataset type for package versions. 

1349 

1350 Yields 

1351 ------ 

1352 datasetTypeName : `str` 

1353 Name of the dataset type. 

1354 """ 

1355 if include_packages: 

1356 # Package versions dataset type 

1357 yield cls.packagesDatasetName 

1358 

1359 if isinstance(pipeline, Pipeline): 

1360 pipeline = pipeline.toExpandedPipeline() 

1361 

1362 for taskDef in pipeline: 

1363 # all task InitOutputs 

1364 for name in taskDef.connections.initOutputs: 

1365 attribute = getattr(taskDef.connections, name) 

1366 yield attribute.name 

1367 

1368 # config dataset name 

1369 if include_configs: 

1370 yield taskDef.configDatasetName
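
# Illustrative sketch (not part of the original module): listing init-output
# dataset type names without needing a registry; "pipeline" is assumed to be
# a Pipeline instance.
#
#     >>> names = list(PipelineDatasetTypes.initOutputNames(pipeline))
#     >>> "packages" in names   # software versions dataset, unless disabled
#     True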