Coverage for python/lsst/pipe/base/pipeline.py: 21%

439 statements  

coverage.py v7.2.7, created at 2023-07-12 11:14 -0700

1# This file is part of pipe_base. 

2# 

3# Developed for the LSST Data Management System. 

4# This product includes software developed by the LSST Project 

5# (http://www.lsst.org). 

6# See the COPYRIGHT file at the top-level directory of this distribution 

7# for details of code ownership. 

8# 

9# This program is free software: you can redistribute it and/or modify 

10# it under the terms of the GNU General Public License as published by 

11# the Free Software Foundation, either version 3 of the License, or 

12# (at your option) any later version. 

13# 

14# This program is distributed in the hope that it will be useful, 

15# but WITHOUT ANY WARRANTY; without even the implied warranty of 

16# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the 

17# GNU General Public License for more details. 

18# 

19# You should have received a copy of the GNU General Public License 

20# along with this program. If not, see <http://www.gnu.org/licenses/>. 

21 

22"""Module defining Pipeline class and related methods. 

23""" 

24 

25from __future__ import annotations 

26 

27__all__ = ["Pipeline", "TaskDef", "TaskDatasetTypes", "PipelineDatasetTypes", "LabelSpecifier"] 

28 

29import copy 

30import logging 

31import re 

32import urllib.parse 

33 

34# ------------------------------- 

35# Imports of standard modules -- 

36# ------------------------------- 

37from collections.abc import Callable, Generator, Iterable, Iterator, Mapping, Set 

38from dataclasses import dataclass 

39from types import MappingProxyType 

40from typing import TYPE_CHECKING, ClassVar, cast 

41 

42# ----------------------------- 

43# Imports for other modules -- 

44from lsst.daf.butler import ( 

45 DataCoordinate, 

46 DatasetType, 

47 DimensionUniverse, 

48 NamedValueSet, 

49 Registry, 

50 SkyPixDimension, 

51) 

52from lsst.resources import ResourcePath, ResourcePathExpression 

53from lsst.utils import doImportType 

54from lsst.utils.introspection import get_full_type_name 

55 

56from . import automatic_connection_constants as acc 

57from . import pipelineIR, pipeTools 

58from ._instrument import Instrument as PipeBaseInstrument 

59from ._task_metadata import TaskMetadata 

60from .config import PipelineTaskConfig 

61from .connections import iterConnections 

62from .connectionTypes import Input 

63from .pipelineTask import PipelineTask 

64from .task import _TASK_METADATA_TYPE 

65 

66if TYPE_CHECKING: # Imports needed only for type annotations; may be circular. 

67 from lsst.obs.base import Instrument 

68 from lsst.pex.config import Config 

69 

70# ---------------------------------- 

71# Local non-exported definitions -- 

72# ---------------------------------- 

73 

74_LOG = logging.getLogger(__name__) 

75 

76# ------------------------ 

77# Exported definitions -- 

78# ------------------------ 

79 

80 

81@dataclass 

82class LabelSpecifier: 

83 """A structure to specify a subset of labels to load 

84 

85 This structure may contain a set of labels to be used in subsetting a 

86 pipeline, or a beginning and end point. Beginning or end may be empty, 

87 in which case the range will be a half open interval. Unlike python 

88 iteration bounds, end bounds are *INCLUDED*. Note that range based 

89 selection is not well defined for pipelines that are not linear in nature, 

90 and correct behavior is not guaranteed, or may vary from run to run. 
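
Examples
--------
Illustrative sketches; the task labels are hypothetical:

>>> LabelSpecifier(labels={"isr"})
LabelSpecifier(labels={'isr'}, begin=None, end=None)
>>> LabelSpecifier(begin="isr", end="calibrate")
LabelSpecifier(labels=None, begin='isr', end='calibrate')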

91 """ 

92 

93 labels: set[str] | None = None 

94 begin: str | None = None 

95 end: str | None = None 

96 

97 def __post_init__(self) -> None: 

98 if self.labels is not None and (self.begin or self.end): 

99 raise ValueError( 

100 "This struct can only be initialized with a labels set or a begin (and/or) end specifier" 

101 ) 

102 

103 

104class TaskDef: 

105 """TaskDef is a collection of information about task needed by Pipeline. 

106 

107 The information includes the task name, configuration object, and optional 

108 task class. This class is just a collection of attributes, and it exposes 

109 all of them so that they can be modified in place 

110 (e.g. if configuration needs extra overrides). 

111 

112 Attributes 

113 ---------- 

114 taskName : `str`, optional 

115 The fully-qualified `PipelineTask` class name. If not provided, 

116 ``taskClass`` must be. 

117 config : `lsst.pipe.base.config.PipelineTaskConfig`, optional 

118 Instance of the configuration class corresponding to this task class, 

119 usually with all overrides applied. This config will be frozen. If 

120 not provided, ``taskClass`` must be provided and 

121 ``taskClass.ConfigClass()`` will be used. 

122 taskClass : `type`, optional 

123 `PipelineTask` class object; if provided and ``taskName`` is as well, 

124 the caller guarantees that they are consistent. If not provided, 

125 ``taskName`` is used to import the type. 

126 label : `str`, optional 

127 Task label, usually a short string unique in a pipeline. If not 

128 provided, ``taskClass`` must be, and ``taskClass._DefaultName`` will 

129 be used. 
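
Examples
--------
A minimal sketch, assuming ``MyTask`` is a `PipelineTask` subclass
(hypothetical):

>>> task_def = TaskDef(taskClass=MyTask, label="my_label")
>>> task_def.label
'my_label'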

130 """ 

131 

132 def __init__( 

133 self, 

134 taskName: str | None = None, 

135 config: PipelineTaskConfig | None = None, 

136 taskClass: type[PipelineTask] | None = None, 

137 label: str | None = None, 

138 ): 

139 if taskName is None: 

140 if taskClass is None: 

141 raise ValueError("At least one of `taskName` and `taskClass` must be provided.") 

142 taskName = get_full_type_name(taskClass) 

143 elif taskClass is None: 

144 taskClass = doImportType(taskName) 

145 if config is None: 

146 if taskClass is None: 

147 raise ValueError("`taskClass` must be provided if `config` is not.") 

148 config = taskClass.ConfigClass() 

149 if label is None: 

150 if taskClass is None: 

151 raise ValueError("`taskClass` must be provided if `label` is not.") 

152 label = taskClass._DefaultName 

153 self.taskName = taskName 

154 try: 

155 config.validate() 

156 except Exception: 

157 _LOG.error("Configuration validation failed for task %s (%s)", label, taskName) 

158 raise 

159 config.freeze() 

160 self.config = config 

161 self.taskClass = taskClass 

162 self.label = label 

163 self.connections = config.connections.ConnectionsClass(config=config) 

164 

165 @property 

166 def configDatasetName(self) -> str: 

167 """Name of a dataset type for configuration of this task (`str`)""" 

168 return acc.CONFIG_INIT_OUTPUT_TEMPLATE.format(label=self.label) 

169 

170 @property 

171 def metadataDatasetName(self) -> str: 

172 """Name of a dataset type for metadata of this task (`str`)""" 

173 return self.makeMetadataDatasetName(self.label) 

174 

175 @classmethod 

176 def makeMetadataDatasetName(cls, label: str) -> str: 

177 """Construct the name of the dataset type for metadata for a task. 

178 

179 Parameters 

180 ---------- 

181 label : `str` 

182 Label for the task within its pipeline. 

183 

184 Returns 

185 ------- 

186 name : `str` 

187 Name of the task's metadata dataset type. 
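
Examples
--------
A sketch, assuming the default ``{label}_metadata`` template:

>>> TaskDef.makeMetadataDatasetName("isr")
'isr_metadata'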

188 """ 

189 return acc.METADATA_OUTPUT_TEMPLATE.format(label=label) 

190 

191 @property 

192 def logOutputDatasetName(self) -> str | None: 

193 """Name of a dataset type for log output from this task, `None` if 

194 logs are not to be saved (`str`) 

195 """ 

196 if self.config.saveLogOutput: 

197 return acc.LOG_OUTPUT_TEMPLATE.format(label=self.label) 

198 else: 

199 return None 

200 

201 def __str__(self) -> str: 

202 rep = "TaskDef(" + self.taskName 

203 if self.label: 

204 rep += ", label=" + self.label 

205 rep += ")" 

206 return rep 

207 

208 def __eq__(self, other: object) -> bool: 

209 if not isinstance(other, TaskDef): 

210 return False 

211 # This does not consider equality of configs when determining equality 

212 # as config equality is a difficult thing to define. Should be updated 

213 # after DM-27847 

214 return self.taskClass == other.taskClass and self.label == other.label 

215 

216 def __hash__(self) -> int: 

217 return hash((self.taskClass, self.label)) 

218 

219 @classmethod 

220 def _unreduce(cls, taskName: str, config: PipelineTaskConfig, label: str) -> TaskDef: 

221 """Unpickle pickle. Custom callable for unpickling. 

222 

223 All arguments are forwarded directly to the constructor; this 

224 trampoline is only needed because ``__reduce__`` callables can't be 

225 called with keyword arguments. 

226 """ 

227 return cls(taskName=taskName, config=config, label=label) 

228 

229 def __reduce__(self) -> tuple[Callable[[str, PipelineTaskConfig, str], TaskDef], tuple[str, Config, str]]: 

230 return (self._unreduce, (self.taskName, self.config, self.label)) 

231 

232 

233class Pipeline: 

234 """A `Pipeline` is a representation of a series of tasks to run, and the 

235 configuration for those tasks. 

236 

237 Parameters 

238 ---------- 

239 description : `str` 

240 A description of what this pipeline does. 

241 """ 

242 

243 def __init__(self, description: str): 

244 pipeline_dict = {"description": description, "tasks": {}} 

245 self._pipelineIR = pipelineIR.PipelineIR(pipeline_dict) 

246 

247 @classmethod 

248 def fromFile(cls, filename: str) -> Pipeline: 

249 """Load a pipeline defined in a pipeline yaml file. 

250 

251 Parameters 

252 ---------- 

253 filename: `str` 

254 A path that points to a pipeline defined in yaml format. This 

255 filename may also supply additional labels to be used in 

256 subsetting the loaded Pipeline. These labels are separated from 

257 the path by a ``#``, and may be specified as a comma separated 

258 list, or a range denoted as beginning..end. Beginning or end may 

259 be empty, in which case the range will be a half open interval. 

260 Unlike python iteration bounds, end bounds are *INCLUDED*. Note 

261 that range based selection is not well defined for pipelines that 

262 are not linear in nature, and correct behavior is not guaranteed, 

263 or may vary from run to run. 

264 

265 Returns 

266 ------- 

267 pipeline: `Pipeline` 

268 The pipeline loaded from specified location with appropriate (if 

269 any) subsetting. 

270 

271 Notes 

272 ----- 

273 This method attempts to prune any contracts that contain labels which 

274 are not in the declared subset of labels. This pruning is done using a 

275 string based matching due to the nature of contracts and may prune more 

276 than it should. 

277 """ 

278 return cls.from_uri(filename) 

279 

280 @classmethod 

281 def from_uri(cls, uri: ResourcePathExpression) -> Pipeline: 

282 """Load a pipeline defined in a pipeline yaml file at a location 

283 specified by a URI. 

284 

285 Parameters 

286 ---------- 

287 uri : convertible to `~lsst.resources.ResourcePath` 

288 If a string is supplied this should be a URI path that points to a 

289 pipeline defined in yaml format, either as a direct path to the 

290 yaml file, or as a directory containing a ``pipeline.yaml`` file 

291 (the form used by `write_to_uri` with ``expand=True``). This URI may 

292 also supply additional labels to be used in subsetting the loaded 

293 `Pipeline`. These labels are separated from the path by a ``#``, 

294 and may be specified as a comma separated list, or a range denoted 

295 as beginning..end. Beginning or end may be empty, in which case the 

296 range will be a half open interval. Unlike python iteration bounds, 

297 end bounds are *INCLUDED*. Note that range based selection is not 

298 well defined for pipelines that are not linear in nature, and 

299 correct behavior is not guaranteed, or may vary from run to run. 

300 The same specifiers can be used with a 

301 `~lsst.resources.ResourcePath` object, by being the sole contents 

302 of the fragment attribute. 

303 

304 Returns 

305 ------- 

306 pipeline : `Pipeline` 

307 The pipeline loaded from specified location with appropriate (if 

308 any) subsetting. 

309 

310 Notes 

311 ----- 

312 This method attempts to prune any contracts that contain labels which 

313 are not in the declared subset of labels. This pruning is done using a 

314 string based matching due to the nature of contracts and may prune more 

315 than it should. 
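
Examples
--------
Illustrative only; the pipeline URI and labels are hypothetical:

>>> p = Pipeline.from_uri("pipelines/DRP.yaml")  # full pipeline
>>> p = Pipeline.from_uri("pipelines/DRP.yaml#isr")  # a single label
>>> p = Pipeline.from_uri("pipelines/DRP.yaml#isr..calibrate")  # a range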

316 """ 

317 # Split up the uri and any labels that were supplied 

318 uri, label_specifier = cls._parse_file_specifier(uri) 

319 pipeline: Pipeline = cls.fromIR(pipelineIR.PipelineIR.from_uri(uri)) 

320 

321 # If there are labels supplied, only keep those 

322 if label_specifier is not None: 

323 pipeline = pipeline.subsetFromLabels(label_specifier) 

324 return pipeline 

325 

326 def subsetFromLabels(self, labelSpecifier: LabelSpecifier) -> Pipeline: 

327 """Subset a pipeline to contain only labels specified in labelSpecifier 

328 

329 Parameters 

330 ---------- 

331 labelSpecifier : `LabelSpecifier` 

332 Object containing labels that describes how to subset a pipeline. 

333 

334 Returns 

335 ------- 

336 pipeline : `Pipeline` 

337 A new pipeline object that is a subset of the old pipeline. 

338 

339 Raises 

340 ------ 

341 ValueError 

342 Raised if there is an issue with the specified labels. 

343 

344 Notes 

345 ----- 

346 This method attempts to prune any contracts that contain labels which 

347 are not in the declared subset of labels. This pruning is done using a 

348 string based matching due to the nature of contracts and may prune more 

349 than it should. 
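
Examples
--------
A sketch using a hypothetical label:

>>> subset = pipeline.subsetFromLabels(LabelSpecifier(labels={"isr"}))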

350 """ 

351 # Labels supplied as a set 

352 if labelSpecifier.labels: 

353 labelSet = labelSpecifier.labels 

354 # Labels supplied as a range, first create a list of all the labels 

355 # in the pipeline sorted according to task dependency. Then only 

356 # keep labels that lie between the supplied bounds 

357 else: 

358 # Create a copy of the pipeline to use when assessing the label 

359 # ordering. Use a dict for fast searching while preserving order. 

360 # Remove contracts so they do not fail in the expansion step. This 

361 # is needed because a user may only configure the tasks they intend 

362 # to run, which may cause some contracts to fail if they will later 

363 # be dropped 

364 pipeline = copy.deepcopy(self) 

365 pipeline._pipelineIR.contracts = [] 

366 labels = {taskdef.label: True for taskdef in pipeline.toExpandedPipeline()} 

367 

368 # Verify the bounds are in the labels 

369 if labelSpecifier.begin is not None: 

370 if labelSpecifier.begin not in labels: 

371 raise ValueError( 

372 f"Beginning of range subset, {labelSpecifier.begin}, not found in pipeline definition" 

373 ) 

374 if labelSpecifier.end is not None: 

375 if labelSpecifier.end not in labels: 

376 raise ValueError( 

377 f"End of range subset, {labelSpecifier.end}, not found in pipeline definition" 

378 ) 

379 

380 labelSet = set() 

381 for label in labels: 

382 if labelSpecifier.begin is not None: 

383 if label != labelSpecifier.begin: 

384 continue 

385 else: 

386 labelSpecifier.begin = None 

387 labelSet.add(label) 

388 if labelSpecifier.end is not None and label == labelSpecifier.end: 

389 break 

390 return Pipeline.fromIR(self._pipelineIR.subset_from_labels(labelSet)) 

391 

392 @staticmethod 

393 def _parse_file_specifier(uri: ResourcePathExpression) -> tuple[ResourcePath, LabelSpecifier | None]: 

394 """Split appart a uri and any possible label subsets""" 

395 if isinstance(uri, str): 

396 # This is to support legacy pipelines during transition 

397 uri, num_replace = re.subn("[:](?!\\/\\/)", "#", uri) 

398 if num_replace: 

399 raise ValueError( 

400 f"The pipeline file {uri} seems to use the legacy :" 

401 " to separate labels, please use # instead." 

402 ) 

403 if uri.count("#") > 1: 

404 raise ValueError("Only one set of labels is allowed when specifying a pipeline to load") 

405 # Everything else can be converted directly to ResourcePath. 

406 uri = ResourcePath(uri) 

407 label_subset = uri.fragment or None 

408 

409 specifier: LabelSpecifier | None 

410 if label_subset is not None: 

411 label_subset = urllib.parse.unquote(label_subset) 

412 args: dict[str, set[str] | str | None] 

413 # labels supplied as a list 

414 if "," in label_subset: 

415 if ".." in label_subset: 

416 raise ValueError( 

417 "Can only specify a list of labels or a rangewhen loading a Pipline not both" 

418 ) 

419 args = {"labels": set(label_subset.split(","))} 

420 # labels supplied as a range 

421 elif ".." in label_subset: 

422 # Try to de-structure the labelSubset, this will fail if more 

423 # than one range is specified 

424 begin, end, *rest = label_subset.split("..") 

425 if rest: 

426 raise ValueError("Only one range can be specified when loading a pipeline") 

427 args = {"begin": begin if begin else None, "end": end if end else None} 

428 # Assume anything else is a single label 

429 else: 

430 args = {"labels": {label_subset}} 

431 

432 # MyPy doesn't like how cavalier kwarg construction is with types. 

433 specifier = LabelSpecifier(**args) # type: ignore 

434 else: 

435 specifier = None 

436 

437 return uri, specifier 

438 

439 @classmethod 

440 def fromString(cls, pipeline_string: str) -> Pipeline: 

441 """Create a pipeline from string formatted as a pipeline document. 

442 

443 Parameters 

444 ---------- 

445 pipeline_string : `str` 

446 A string that is formatted like a pipeline document. 

447 

448 Returns 

449 ------- 

450 pipeline: `Pipeline` 
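
Examples
--------
A minimal sketch using flow-style YAML; the task class path is
hypothetical (it is only imported when the pipeline is expanded):

>>> p = Pipeline.fromString(
...     "{description: demo, tasks: {demo: mypkg.tasks.DemoTask}}"
... )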

451 """ 

452 pipeline = cls.fromIR(pipelineIR.PipelineIR.from_string(pipeline_string)) 

453 return pipeline 

454 

455 @classmethod 

456 def fromIR(cls, deserialized_pipeline: pipelineIR.PipelineIR) -> Pipeline: 

457 """Create a pipeline from an already created `PipelineIR` object. 

458 

459 Parameters 

460 ---------- 

461 deserialized_pipeline: `PipelineIR` 

462 An already created pipeline intermediate representation object. 

463 

464 Returns 

465 ------- 

466 pipeline: `Pipeline` 

467 """ 

468 pipeline = cls.__new__(cls) 

469 pipeline._pipelineIR = deserialized_pipeline 

470 return pipeline 

471 

472 @classmethod 

473 def fromPipeline(cls, pipeline: Pipeline) -> Pipeline: 

474 """Create a new pipeline by copying an already existing `Pipeline`. 

475 

476 Parameters 

477 ---------- 

478 pipeline: `Pipeline` 

479 An existing `Pipeline` object to copy. 

480 

481 Returns 

482 ------- 

483 pipeline: `Pipeline` 

484 """ 

485 return cls.fromIR(copy.deepcopy(pipeline._pipelineIR)) 

486 

487 def __str__(self) -> str: 

488 return str(self._pipelineIR) 

489 

490 def mergePipeline(self, pipeline: Pipeline) -> None: 

491 """Merge another in-memory `Pipeline` object into this one. 

492 

493 This merges another pipeline into this object, as if it were declared 

494 in the import block of the yaml definition of this pipeline. This 

495 modifies this pipeline in place. 

496 

497 Parameters 

498 ---------- 

499 pipeline : `Pipeline` 

500 The `Pipeline` object that is to be merged into this object. 
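
Examples
--------
A sketch merging one (here empty) pipeline into another in place:

>>> base = Pipeline("base pipeline")
>>> base.mergePipeline(Pipeline("extra tasks"))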

501 """ 

502 self._pipelineIR.merge_pipelines((pipeline._pipelineIR,)) 

503 

504 def addLabelToSubset(self, subset: str, label: str) -> None: 

505 """Add a task label from the specified subset. 

506 

507 Parameters 

508 ---------- 

509 subset : `str` 

510 The labeled subset to modify. 

511 label : `str` 

512 The task label to add to the specified subset. 

513 

514 Raises 

515 ------ 

516 ValueError 

517 Raised if the specified subset does not exist within the pipeline. 

518 Raised if the specified label does not exist within the pipeline. 

519 """ 

520 if label not in self._pipelineIR.tasks: 

521 raise ValueError(f"Label {label} does not appear within the pipeline") 

522 if subset not in self._pipelineIR.labeled_subsets: 

523 raise ValueError(f"Subset {subset} does not appear within the pipeline") 

524 self._pipelineIR.labeled_subsets[subset].subset.add(label) 

525 

526 def removeLabelFromSubset(self, subset: str, label: str) -> None: 

527 """Remove a task label from the specified subset. 

528 

529 Parameters 

530 ---------- 

531 subset : `str` 

532 The labeled subset to modify. 

533 label : `str` 

534 The task label to remove from the specified subset. 

535 

536 Raises 

537 ------ 

538 ValueError 

539 Raised if the specified subset does not exist in the pipeline. 

540 Raised if the specified label does not exist within the specified 

541 subset. 

542 """ 

543 if subset not in self._pipelineIR.labeled_subsets: 

544 raise ValueError(f"Subset {subset} does not appear within the pipeline") 

545 if label not in self._pipelineIR.labeled_subsets[subset].subset: 

546 raise ValueError(f"Label {label} does not appear within the pipeline") 

547 self._pipelineIR.labeled_subsets[subset].subset.remove(label) 

548 

549 def findSubsetsWithLabel(self, label: str) -> set[str]: 

550 """Find any subsets which may contain the specified label. 

551 

552 This function returns the names of subsets which contain the specified 

553 label. May return an empty set if there are no subsets, or no subsets 

554 containing the specified label. 

555 

556 Parameters 

557 ---------- 

558 label : `str` 

559 The task label to use in the membership check. 

560 

561 Returns 

562 ------- 

563 subsets : `set` of `str` 

564 Returns a set (possibly empty) of subset names which contain the 

565 specified label. 

566 

567 Raises 

568 ------ 

569 ValueError 

570 Raised if the specified label does not exist within this pipeline. 

571 """ 

572 results = set() 

573 if label not in self._pipelineIR.tasks: 

574 raise ValueError(f"Label {label} does not appear within the pipeline") 

575 for subset in self._pipelineIR.labeled_subsets.values(): 

576 if label in subset.subset: 

577 results.add(subset.label) 

578 return results 

579 

580 def addInstrument(self, instrument: Instrument | str) -> None: 

581 """Add an instrument to the pipeline, or replace an instrument that is 

582 already defined. 

583 

584 Parameters 

585 ---------- 

586 instrument : `~lsst.obs.base.Instrument` or `str` 

587 Either an instance of an `lsst.obs.base.Instrument` subclass or 

588 a string corresponding to a fully qualified 

589 `lsst.obs.base.Instrument` subclass name. 

590 """ 

591 if isinstance(instrument, str): 

592 pass 

593 else: 

594 # TODO: assume that this is a subclass of Instrument, no type 

595 # checking 

596 instrument = get_full_type_name(instrument) 

597 self._pipelineIR.instrument = instrument 

598 

599 def getInstrument(self) -> str | None: 

600 """Get the instrument from the pipeline. 

601 

602 Returns 

603 ------- 

604 instrument : `str` or `None` 

605 The fully qualified name of a `lsst.obs.base.Instrument` subclass, 

606 or `None` if the pipeline does not have an instrument. 

607 """ 

608 return self._pipelineIR.instrument 

609 

610 def get_data_id(self, universe: DimensionUniverse) -> DataCoordinate: 

611 """Return a data ID with all dimension constraints embedded in the 

612 pipeline. 

613 

614 Parameters 

615 ---------- 

616 universe : `lsst.daf.butler.DimensionUniverse` 

617 Object that defines all dimensions. 

618 

619 Returns 

620 ------- 

621 data_id : `lsst.daf.butler.DataCoordinate` 

622 Data ID with all dimension constraints embedded in the 

623 pipeline. 
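
Examples
--------
A sketch, assuming ``butler`` is an `lsst.daf.butler.Butler`:

>>> data_id = pipeline.get_data_id(butler.registry.dimensions)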

624 """ 

625 instrument_class_name = self._pipelineIR.instrument 

626 if instrument_class_name is not None: 

627 instrument_class = cast(PipeBaseInstrument, doImportType(instrument_class_name)) 

628 if instrument_class is not None: 

629 return DataCoordinate.standardize(instrument=instrument_class.getName(), universe=universe) 

630 return DataCoordinate.makeEmpty(universe) 

631 

632 def addTask(self, task: type[PipelineTask] | str, label: str) -> None: 

633 """Add a new task to the pipeline, or replace a task that is already 

634 associated with the supplied label. 

635 

636 Parameters 

637 ---------- 

638 task: `PipelineTask` or `str` 

639 Either a derived class object of a `PipelineTask` or a string 

640 corresponding to a fully qualified `PipelineTask` name. 

641 label: `str` 

642 A label that is used to identify the `PipelineTask` being added. 
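
Examples
--------
A sketch with a hypothetical task class path (the class is not
imported as long as a label is supplied):

>>> pipeline.addTask("mypkg.tasks.DemoTask", "demo")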

643 """ 

644 if isinstance(task, str): 

645 taskName = task 

646 elif issubclass(task, PipelineTask): 

647 taskName = get_full_type_name(task) 

648 else: 

649 raise ValueError( 

650 "task must be either a child class of PipelineTask or a string containing" 

651 " a fully qualified name to one" 

652 ) 

653 if not label: 

654 # in some cases (with command line-generated pipeline) tasks can 

655 # be defined without a label, which is not acceptable; use the task 

656 # _DefaultName in that case 

657 if isinstance(task, str): 

658 task_class = cast(PipelineTask, doImportType(task)) 

659 label = task_class._DefaultName 

660 self._pipelineIR.tasks[label] = pipelineIR.TaskIR(label, taskName) 

661 

662 def removeTask(self, label: str) -> None: 

663 """Remove a task from the pipeline. 

664 

665 Parameters 

666 ---------- 

667 label : `str` 

668 The label used to identify the task that is to be removed. 

669 

670 Raises 

671 ------ 

672 KeyError 

673 Raised if no task with that label exists in the pipeline. 

674 

675 """ 

676 self._pipelineIR.tasks.pop(label) 

677 

678 def addConfigOverride(self, label: str, key: str, value: object) -> None: 

679 """Apply single config override. 

680 

681 Parameters 

682 ---------- 

683 label : `str` 

684 Label of the task. 

685 key: `str` 

686 Fully-qualified field name. 

687 value : object 

688 Value to be given to a field. 
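
Examples
--------
A sketch, assuming the pipeline has a task labeled ``demo`` whose
config has a ``doWrite`` field (both hypothetical):

>>> pipeline.addConfigOverride("demo", "doWrite", False)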

689 """ 

690 self._addConfigImpl(label, pipelineIR.ConfigIR(rest={key: value})) 

691 

692 def addConfigFile(self, label: str, filename: str) -> None: 

693 """Add overrides from a specified file. 

694 

695 Parameters 

696 ---------- 

697 label : `str` 

698 The label used to identify the task associated with config to 

699 modify. 

700 filename : `str` 

701 Path to the override file. 

702 """ 

703 self._addConfigImpl(label, pipelineIR.ConfigIR(file=[filename])) 

704 

705 def addConfigPython(self, label: str, pythonString: str) -> None: 

706 """Add Overrides by running a snippet of python code against a config. 

707 

708 Parameters 

709 ---------- 

710 label : `str` 

711 The label used to identify the task associated with config to 

712 modify. 

713 pythonString: `str` 

714 A string which is valid python code to be executed. This is done 

715 with config as the only local accessible value. 
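
Examples
--------
A sketch, assuming a task labeled ``demo`` with a hypothetical
``threshold`` config field:

>>> pipeline.addConfigPython("demo", "config.threshold = 5.0")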

716 """ 

717 self._addConfigImpl(label, pipelineIR.ConfigIR(python=pythonString)) 

718 

719 def _addConfigImpl(self, label: str, newConfig: pipelineIR.ConfigIR) -> None: 

720 if label == "parameters": 

721 self._pipelineIR.parameters.mapping.update(newConfig.rest) 

722 if newConfig.file: 

723 raise ValueError("Setting parameters section with config file is not supported") 

724 if newConfig.python: 

725 raise ValueError("Setting parameters section using python block in unsupported") 

726 return 

727 if label not in self._pipelineIR.tasks: 

728 raise LookupError(f"There are no tasks labeled '{label}' in the pipeline") 

729 self._pipelineIR.tasks[label].add_or_update_config(newConfig) 

730 

731 def write_to_uri(self, uri: ResourcePathExpression) -> None: 

732 """Write the pipeline to a file or directory. 

733 

734 Parameters 

735 ---------- 

736 uri : convertible to `~lsst.resources.ResourcePath` 

737 URI to write to; may have any scheme with 

738 `~lsst.resources.ResourcePath` write support or no scheme for a 

739 local file/directory. Should have a ``.yaml`` extension. 

740 """ 

741 self._pipelineIR.write_to_uri(uri) 

742 

743 def toExpandedPipeline(self) -> Generator[TaskDef, None, None]: 

744 r"""Return a generator of `TaskDef`\s which can be used to create 

745 quantum graphs. 

746 

747 Returns 

748 ------- 

749 generator : generator of `TaskDef` 

750 The generator returned will be the sorted iterator of tasks which 

751 are to be used in constructing a quantum graph. 

752 

753 Raises 

754 ------ 

755 NotImplementedError 

756 Raised if a dataId is supplied in a config block. This is in place 

757 for future use. 
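
Examples
--------
Iterating over the expanded pipeline (equivalent to ``iter(pipeline)``):

>>> for task_def in pipeline.toExpandedPipeline():
...     print(task_def.label)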

758 """ 

759 taskDefs = [] 

760 for label in self._pipelineIR.tasks: 

761 taskDefs.append(self._buildTaskDef(label)) 

762 

763 # let's evaluate the contracts 

764 if self._pipelineIR.contracts is not None: 

765 label_to_config = {x.label: x.config for x in taskDefs} 

766 for contract in self._pipelineIR.contracts: 

767 # execute this in its own line so it can raise a good error 

768 # message if there were problems with the eval 

769 success = eval(contract.contract, None, label_to_config) 

770 if not success: 

771 extra_info = f": {contract.msg}" if contract.msg is not None else "" 

772 raise pipelineIR.ContractError( 

773 f"Contract(s) '{contract.contract}' were not satisfied{extra_info}" 

774 ) 

775 

776 taskDefs = sorted(taskDefs, key=lambda x: x.label) 

777 yield from pipeTools.orderPipeline(taskDefs) 

778 

779 def _buildTaskDef(self, label: str) -> TaskDef: 

780 if (taskIR := self._pipelineIR.tasks.get(label)) is None: 

781 raise NameError(f"Label {label} does not appear in this pipeline") 

782 taskClass: type[PipelineTask] = doImportType(taskIR.klass) 

783 taskName = get_full_type_name(taskClass) 

784 config = taskClass.ConfigClass() 

785 instrument: PipeBaseInstrument | None = None 

786 if (instrumentName := self._pipelineIR.instrument) is not None: 

787 instrument_cls: type = doImportType(instrumentName) 

788 instrument = instrument_cls() 

789 config.applyConfigOverrides( 

790 instrument, 

791 getattr(taskClass, "_DefaultName", ""), 

792 taskIR.config, 

793 self._pipelineIR.parameters, 

794 label, 

795 ) 

796 return TaskDef(taskName=taskName, config=config, taskClass=taskClass, label=label) 

797 

798 def __iter__(self) -> Generator[TaskDef, None, None]: 

799 return self.toExpandedPipeline() 

800 

801 def __getitem__(self, item: str) -> TaskDef: 

802 return self._buildTaskDef(item) 

803 

804 def __len__(self) -> int: 

805 return len(self._pipelineIR.tasks) 

806 

807 def __eq__(self, other: object) -> bool: 

808 if not isinstance(other, Pipeline): 

809 return False 

810 elif self._pipelineIR == other._pipelineIR: 

811 # Shortcut: if the IR is the same, the expanded pipeline must be 

812 # the same as well. But the converse is not true. 

813 return True 

814 else: 

815 self_expanded = {td.label: (td.taskClass,) for td in self} 

816 other_expanded = {td.label: (td.taskClass,) for td in other} 

817 if self_expanded != other_expanded: 

818 return False 

819 # After DM-27847, we should compare configuration here, or better, 

820 # delegated to TaskDef.__eq__ after making that compare configurations. 

821 raise NotImplementedError( 

822 "Pipelines cannot be compared because config instances cannot be compared; see DM-27847." 

823 ) 

824 

825 

826@dataclass(frozen=True) 

827class TaskDatasetTypes: 

828 """An immutable struct that extracts and classifies the dataset types used 

829 by a `PipelineTask`. 

830 """ 

831 

832 initInputs: NamedValueSet[DatasetType] 

833 """Dataset types that are needed as inputs in order to construct this Task. 

834 

835 Task-level `initInputs` may be classified as either 

836 `~PipelineDatasetTypes.initInputs` or 

837 `~PipelineDatasetTypes.initIntermediates` at the Pipeline level. 

838 """ 

839 

840 initOutputs: NamedValueSet[DatasetType] 

841 """Dataset types that may be written after constructing this Task. 

842 

843 Task-level `initOutputs` may be classified as either 

844 `~PipelineDatasetTypes.initOutputs` or 

845 `~PipelineDatasetTypes.initIntermediates` at the Pipeline level. 

846 """ 

847 

848 inputs: NamedValueSet[DatasetType] 

849 """Dataset types that are regular inputs to this Task. 

850 

851 If an input dataset needed for a Quantum cannot be found in the input 

852 collection(s) or produced by another Task in the Pipeline, that Quantum 

853 (and all dependent Quanta) will not be produced. 

854 

855 Task-level `inputs` may be classified as either 

856 `~PipelineDatasetTypes.inputs` or `~PipelineDatasetTypes.intermediates` 

857 at the Pipeline level. 

858 """ 

859 

860 queryConstraints: NamedValueSet[DatasetType] 

861 """Regular inputs that should not be used as constraints on the initial 

862 QuantumGraph generation data ID query, according to their tasks 

863 (`NamedValueSet`). 

864 """ 

865 

866 prerequisites: NamedValueSet[DatasetType] 

867 """Dataset types that are prerequisite inputs to this Task. 

868 

869 Prerequisite inputs must exist in the input collection(s) before the 

870 pipeline is run, but do not constrain the graph - if a prerequisite is 

871 missing for a Quantum, `PrerequisiteMissingError` is raised. 

872 

873 Prerequisite inputs are not resolved until the second stage of 

874 QuantumGraph generation. 

875 """ 

876 

877 outputs: NamedValueSet[DatasetType] 

878 """Dataset types that are produced by this Task. 

879 

880 Task-level `outputs` may be classified as either 

881 `~PipelineDatasetTypes.outputs` or `~PipelineDatasetTypes.intermediates` 

882 at the Pipeline level. 

883 """ 

884 

885 @classmethod 

886 def fromTaskDef( 

887 cls, 

888 taskDef: TaskDef, 

889 *, 

890 registry: Registry, 

891 include_configs: bool = True, 

892 storage_class_mapping: Mapping[str, str] | None = None, 

893 ) -> TaskDatasetTypes: 

894 """Extract and classify the dataset types from a single `PipelineTask`. 

895 

896 Parameters 

897 ---------- 

898 taskDef: `TaskDef` 

899 An instance of a `TaskDef` class for a particular `PipelineTask`. 

900 registry: `Registry` 

901 Registry used to construct normalized 

902 `~lsst.daf.butler.DatasetType` objects and retrieve those that are 

903 incomplete. 

904 include_configs : `bool`, optional 

905 If `True` (default) include config dataset types as 

906 ``initOutputs``. 

907 storage_class_mapping : `~collections.abc.Mapping` of `str` to \ 

908 `~lsst.daf.butler.StorageClass`, optional 

909 If a taskdef contains a component dataset type that is unknown 

910 to the registry, its parent `~lsst.daf.butler.StorageClass` will 

911 be looked up in this mapping if it is supplied. If the mapping does 

912 not contain the composite dataset type, or the mapping is not 

913 supplied, an exception will be raised. 

914 

915 Returns 

916 ------- 

917 types: `TaskDatasetTypes` 

918 The dataset types used by this task. 

919 

920 Raises 

921 ------ 

922 ValueError 

923 Raised if dataset type connection definition differs from 

924 registry definition. 

925 LookupError 

926 Raised if component parent StorageClass could not be determined 

927 and storage_class_mapping does not contain the composite type, or 

928 is set to None. 

929 """ 

930 

931 def makeDatasetTypesSet( 

932 connectionType: str, 

933 is_input: bool, 

934 freeze: bool = True, 

935 ) -> NamedValueSet[DatasetType]: 

936 """Construct a set of true `~lsst.daf.butler.DatasetType` objects. 

937 

938 Parameters 

939 ---------- 

940 connectionType : `str` 

941 Name of the connection type to produce a set for; corresponds 

942 to an attribute of type `list` on the connection class instance. 

943 is_input : `bool` 

944 If `True`, these are input dataset types; otherwise they are 

945 output dataset types. 

946 freeze : `bool`, optional 

947 If `True`, call `NamedValueSet.freeze` on the object returned. 

948 

949 Returns 

950 ------- 

951 datasetTypes : `NamedValueSet` 

952 A set of all datasetTypes which correspond to the connection 

953 type specified in the connection class of this 

954 `PipelineTask`. 

955 

956 Raises 

957 ------ 

958 ValueError 

959 Raised if dataset type connection definition differs from 

960 registry definition. 

961 LookupError 

962 Raised if component parent StorageClass could not be determined 

963 and storage_class_mapping does not contain the composite type, 

964 or is set to None. 

965 

966 Notes 

967 ----- 

968 This function is a closure over the variables ``registry``, 

969 ``taskDef``, and ``storage_class_mapping``. 

970 """ 

971 datasetTypes = NamedValueSet[DatasetType]() 

972 for c in iterConnections(taskDef.connections, connectionType): 

973 dimensions = set(getattr(c, "dimensions", set())) 

974 if "skypix" in dimensions: 

975 try: 

976 datasetType = registry.getDatasetType(c.name) 

977 except LookupError as err: 

978 raise LookupError( 

979 f"DatasetType '{c.name}' referenced by " 

980 f"{type(taskDef.connections).__name__} uses 'skypix' as a dimension " 

981 "placeholder, but does not already exist in the registry. " 

982 "Note that reference catalog names are now used as the dataset " 

983 "type name instead of 'ref_cat'." 

984 ) from err 

985 rest1 = set(registry.dimensions.extract(dimensions - set(["skypix"])).names) 

986 rest2 = set( 

987 dim.name for dim in datasetType.dimensions if not isinstance(dim, SkyPixDimension) 

988 ) 

989 if rest1 != rest2: 

990 raise ValueError( 

991 f"Non-skypix dimensions for dataset type {c.name} declared in " 

992 f"connections ({rest1}) are inconsistent with those in " 

993 f"registry's version of this dataset ({rest2})." 

994 ) 

995 else: 

996 # Component dataset types are not explicitly in the 

997 # registry. This complicates consistency checks with 

998 # registry and requires we work out the composite storage 

999 # class. 

1000 registryDatasetType = None 

1001 try: 

1002 registryDatasetType = registry.getDatasetType(c.name) 

1003 except KeyError: 

1004 compositeName, componentName = DatasetType.splitDatasetTypeName(c.name) 

1005 if componentName: 

1006 if storage_class_mapping is None or compositeName not in storage_class_mapping: 

1007 raise LookupError( 

1008 "Component parent class cannot be determined, and " 

1009 "composite name was not in storage class mapping, or no " 

1010 "storage_class_mapping was supplied" 

1011 ) 

1012 else: 

1013 parentStorageClass = storage_class_mapping[compositeName] 

1014 else: 

1015 parentStorageClass = None 

1016 datasetType = c.makeDatasetType( 

1017 registry.dimensions, parentStorageClass=parentStorageClass 

1018 ) 

1019 registryDatasetType = datasetType 

1020 else: 

1021 datasetType = c.makeDatasetType( 

1022 registry.dimensions, parentStorageClass=registryDatasetType.parentStorageClass 

1023 ) 

1024 

1025 if registryDatasetType and datasetType != registryDatasetType: 

1026 # The dataset types differ but first check to see if 

1027 # they are compatible before raising. 

1028 if is_input: 

1029 # This DatasetType must be compatible on get. 

1030 is_compatible = datasetType.is_compatible_with(registryDatasetType) 

1031 else: 

1032 # Has to be able to be converted to the expected type 

1033 # on put. 

1034 is_compatible = registryDatasetType.is_compatible_with(datasetType) 

1035 if is_compatible: 

1036 # For inputs we want the pipeline to use the 

1037 # pipeline definition, for outputs it should use 

1038 # the registry definition. 

1039 if not is_input: 

1040 datasetType = registryDatasetType 

1041 _LOG.debug( 

1042 "Dataset types differ (task %s != registry %s) but are compatible" 

1043 " for %s in %s.", 

1044 datasetType, 

1045 registryDatasetType, 

1046 "input" if is_input else "output", 

1047 taskDef.label, 

1048 ) 

1049 else: 

1050 try: 

1051 # Explicitly check for storage class just to 

1052 # make more specific message. 

1053 _ = datasetType.storageClass 

1054 except KeyError: 

1055 raise ValueError( 

1056 "Storage class does not exist for supplied dataset type " 

1057 f"{datasetType} for {taskDef.label}." 

1058 ) from None 

1059 raise ValueError( 

1060 f"Supplied dataset type ({datasetType}) inconsistent with " 

1061 f"registry definition ({registryDatasetType}) " 

1062 f"for {taskDef.label}." 

1063 ) 

1064 datasetTypes.add(datasetType) 

1065 if freeze: 

1066 datasetTypes.freeze() 

1067 return datasetTypes 

1068 

1069 # optionally add initOutput dataset for config 

1070 initOutputs = makeDatasetTypesSet("initOutputs", is_input=False, freeze=False) 

1071 if include_configs: 

1072 initOutputs.add( 

1073 DatasetType( 

1074 taskDef.configDatasetName, 

1075 registry.dimensions.empty, 

1076 storageClass="Config", 

1077 ) 

1078 ) 

1079 initOutputs.freeze() 

1080 

1081 # optionally add output dataset for metadata 

1082 outputs = makeDatasetTypesSet("outputs", is_input=False, freeze=False) 

1083 

1084 # Metadata is supposed to be of the TaskMetadata type, its dimensions 

1085 # correspond to a task quantum. 

1086 dimensions = registry.dimensions.extract(taskDef.connections.dimensions) 

1087 

1088 # Allow the storage class definition to be read from the existing 

1089 # dataset type definition if present. 

1090 try: 

1091 current = registry.getDatasetType(taskDef.metadataDatasetName) 

1092 except KeyError: 

1093 # No previous definition so use the default. 

1094 storageClass = "TaskMetadata" if _TASK_METADATA_TYPE is TaskMetadata else "PropertySet" 

1095 else: 

1096 storageClass = current.storageClass.name 

1097 outputs.update({DatasetType(taskDef.metadataDatasetName, dimensions, storageClass)}) 

1098 

1099 if taskDef.logOutputDatasetName is not None: 

1100 # Log output dimensions correspond to a task quantum. 

1101 dimensions = registry.dimensions.extract(taskDef.connections.dimensions) 

1102 outputs.update({DatasetType(taskDef.logOutputDatasetName, dimensions, "ButlerLogRecords")}) 

1103 

1104 outputs.freeze() 

1105 

1106 inputs = makeDatasetTypesSet("inputs", is_input=True) 

1107 queryConstraints = NamedValueSet( 

1108 inputs[c.name] 

1109 for c in cast(Iterable[Input], iterConnections(taskDef.connections, "inputs")) 

1110 if not c.deferGraphConstraint 

1111 ) 

1112 

1113 return cls( 

1114 initInputs=makeDatasetTypesSet("initInputs", is_input=True), 

1115 initOutputs=initOutputs, 

1116 inputs=inputs, 

1117 queryConstraints=queryConstraints, 

1118 prerequisites=makeDatasetTypesSet("prerequisiteInputs", is_input=True), 

1119 outputs=outputs, 

1120 ) 

1121 

1122 

1123@dataclass(frozen=True) 

1124class PipelineDatasetTypes: 

1125 """An immutable struct that classifies the dataset types used in a 

1126 `Pipeline`. 

1127 """ 

1128 

1129 packagesDatasetName: ClassVar[str] = "packages" 

1130 """Name of a dataset type used to save package versions. 

1131 """ 

1132 

1133 initInputs: NamedValueSet[DatasetType] 

1134 """Dataset types that are needed as inputs in order to construct the Tasks 

1135 in this Pipeline. 

1136 

1137 This does not include dataset types that are produced when constructing 

1138 other Tasks in the Pipeline (these are classified as `initIntermediates`). 

1139 """ 

1140 

1141 initOutputs: NamedValueSet[DatasetType] 

1142 """Dataset types that may be written after constructing the Tasks in this 

1143 Pipeline. 

1144 

1145 This does not include dataset types that are also used as inputs when 

1146 constructing other Tasks in the Pipeline (these are classified as 

1147 `initIntermediates`). 

1148 """ 

1149 

1150 initIntermediates: NamedValueSet[DatasetType] 

1151 """Dataset types that are both used when constructing one or more Tasks 

1152 in the Pipeline and produced as a side-effect of constructing another 

1153 Task in the Pipeline. 

1154 """ 

1155 

1156 inputs: NamedValueSet[DatasetType] 

1157 """Dataset types that are regular inputs for the full pipeline. 

1158 

1159 If an input dataset needed for a Quantum cannot be found in the input 

1160 collection(s), that Quantum (and all dependent Quanta) will not be 

1161 produced. 

1162 """ 

1163 

1164 queryConstraints: NamedValueSet[DatasetType] 

1165 """Regular inputs that should be used as constraints on the initial 

1166 QuantumGraph generation data ID query, according to their tasks 

1167 (`NamedValueSet`). 

1168 """ 

1169 

1170 prerequisites: NamedValueSet[DatasetType] 

1171 """Dataset types that are prerequisite inputs for the full Pipeline. 

1172 

1173 Prerequisite inputs must exist in the input collection(s) before the 

1174 pipeline is run, but do not constrain the graph - if a prerequisite is 

1175 missing for a Quantum, `PrerequisiteMissingError` is raised. 

1176 

1177 Prerequisite inputs are not resolved until the second stage of 

1178 QuantumGraph generation. 

1179 """ 

1180 

1181 intermediates: NamedValueSet[DatasetType] 

1182 """Dataset types that are output by one Task in the Pipeline and consumed 

1183 as inputs by one or more other Tasks in the Pipeline. 

1184 """ 

1185 

1186 outputs: NamedValueSet[DatasetType] 

1187 """Dataset types that are output by a Task in the Pipeline and not consumed 

1188 by any other Task in the Pipeline. 

1189 """ 

1190 

1191 byTask: Mapping[str, TaskDatasetTypes] 

1192 """Per-Task dataset types, keyed by label in the `Pipeline`. 

1193 

1194 This is guaranteed to be zip-iterable with the `Pipeline` itself (assuming 

1195 neither has been modified since the dataset types were extracted, of 

1196 course). 

1197 """ 

1198 

1199 @classmethod 

1200 def fromPipeline( 

1201 cls, 

1202 pipeline: Pipeline | Iterable[TaskDef], 

1203 *, 

1204 registry: Registry, 

1205 include_configs: bool = True, 

1206 include_packages: bool = True, 

1207 ) -> PipelineDatasetTypes: 

1208 """Extract and classify the dataset types from all tasks in a 

1209 `Pipeline`. 

1210 

1211 Parameters 

1212 ---------- 

1213 pipeline: `Pipeline` or `~collections.abc.Iterable` [ `TaskDef` ] 

1214 A collection of tasks that can be run together. 

1215 registry: `Registry` 

1216 Registry used to construct normalized 

1217 `~lsst.daf.butler.DatasetType` objects and retrieve those that are 

1218 incomplete. 

1219 include_configs : `bool`, optional 

1220 If `True` (default) include config dataset types as 

1221 ``initOutputs``. 

1222 include_packages : `bool`, optional 

1223 If `True` (default) include the dataset type for software package 

1224 versions in ``initOutputs``. 

1225 

1226 Returns 

1227 ------- 

1228 types: `PipelineDatasetTypes` 

1229 The dataset types used by this `Pipeline`. 

1230 

1231 Raises 

1232 ------ 

1233 ValueError 

1234 Raised if Tasks are inconsistent about which datasets are marked 

1235 prerequisite. This indicates that the Tasks cannot be run as part 

1236 of the same `Pipeline`. 
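
Examples
--------
A sketch, assuming ``butler`` is an `lsst.daf.butler.Butler` and
``pipeline`` is a `Pipeline`:

>>> types = PipelineDatasetTypes.fromPipeline(pipeline, registry=butler.registry)
>>> overall_inputs = types.inputs.names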

1237 """ 

1238 allInputs = NamedValueSet[DatasetType]() 

1239 allOutputs = NamedValueSet[DatasetType]() 

1240 allInitInputs = NamedValueSet[DatasetType]() 

1241 allInitOutputs = NamedValueSet[DatasetType]() 

1242 prerequisites = NamedValueSet[DatasetType]() 

1243 queryConstraints = NamedValueSet[DatasetType]() 

1244 byTask = dict() 

1245 if include_packages: 

1246 allInitOutputs.add( 

1247 DatasetType( 

1248 cls.packagesDatasetName, 

1249 registry.dimensions.empty, 

1250 storageClass="Packages", 

1251 ) 

1252 ) 

1253 # create a list of TaskDefs in case the input is a generator 

1254 pipeline = list(pipeline) 

1255 

1256 # collect all the output dataset types 

1257 typeStorageclassMap: dict[str, str] = {} 

1258 for taskDef in pipeline: 

1259 for outConnection in iterConnections(taskDef.connections, "outputs"): 

1260 typeStorageclassMap[outConnection.name] = outConnection.storageClass 

1261 

1262 for taskDef in pipeline: 

1263 thisTask = TaskDatasetTypes.fromTaskDef( 

1264 taskDef, 

1265 registry=registry, 

1266 include_configs=include_configs, 

1267 storage_class_mapping=typeStorageclassMap, 

1268 ) 

1269 allInitInputs.update(thisTask.initInputs) 

1270 allInitOutputs.update(thisTask.initOutputs) 

1271 allInputs.update(thisTask.inputs) 

1272 # Inputs are query constraints if any task considers them a query 

1273 # constraint. 

1274 queryConstraints.update(thisTask.queryConstraints) 

1275 prerequisites.update(thisTask.prerequisites) 

1276 allOutputs.update(thisTask.outputs) 

1277 byTask[taskDef.label] = thisTask 

1278 if not prerequisites.isdisjoint(allInputs): 

1279 raise ValueError( 

1280 "{} marked as both prerequisites and regular inputs".format( 

1281 {dt.name for dt in allInputs & prerequisites} 

1282 ) 

1283 ) 

1284 if not prerequisites.isdisjoint(allOutputs): 

1285 raise ValueError( 

1286 "{} marked as both prerequisites and outputs".format( 

1287 {dt.name for dt in allOutputs & prerequisites} 

1288 ) 

1289 ) 

1290 # Make sure that components which are marked as inputs get treated as 

1291 # intermediates if there is an output which produces the composite 

1292 # containing the component 

1293 intermediateComponents = NamedValueSet[DatasetType]() 

1294 intermediateComposites = NamedValueSet[DatasetType]() 

1295 outputNameMapping = {dsType.name: dsType for dsType in allOutputs} 

1296 for dsType in allInputs: 

1297 # get the name of a possible component 

1298 name, component = dsType.nameAndComponent() 

1299 # if there is a component name, that means this is a component 

1300 # DatasetType, if there is an output which produces the parent of 

1301 # this component, treat this input as an intermediate 

1302 if component is not None: 

1303 # This needs to be in this if block, because someone might have 

1304 # a composite that is a pure input from existing data 

1305 if name in outputNameMapping: 

1306 intermediateComponents.add(dsType) 

1307 intermediateComposites.add(outputNameMapping[name]) 

1308 

1309 def checkConsistency(a: NamedValueSet, b: NamedValueSet) -> None: 

1310 common = a.names & b.names 

1311 for name in common: 

1312 # Any compatibility is allowed. This function does not know 

1313 # if a dataset type is to be used for input or output. 

1314 if not (a[name].is_compatible_with(b[name]) or b[name].is_compatible_with(a[name])): 

1315 raise ValueError(f"Conflicting definitions for dataset type: {a[name]} != {b[name]}.") 

1316 

1317 checkConsistency(allInitInputs, allInitOutputs) 

1318 checkConsistency(allInputs, allOutputs) 

1319 checkConsistency(allInputs, intermediateComposites) 

1320 checkConsistency(allOutputs, intermediateComposites) 

1321 

1322 def frozen(s: Set[DatasetType]) -> NamedValueSet[DatasetType]: 

1323 assert isinstance(s, NamedValueSet) 

1324 s.freeze() 

1325 return s 

1326 

1327 inputs = frozen(allInputs - allOutputs - intermediateComponents) 

1328 

1329 return cls( 

1330 initInputs=frozen(allInitInputs - allInitOutputs), 

1331 initIntermediates=frozen(allInitInputs & allInitOutputs), 

1332 initOutputs=frozen(allInitOutputs - allInitInputs), 

1333 inputs=inputs, 

1334 queryConstraints=frozen(queryConstraints & inputs), 

1335 # If there are storage class differences in inputs and outputs 

1336 # the intermediates have to choose priority. Here choose that 

1337 inputs to tasks must match the requested storage class by 

1338 # applying the inputs over the top of the outputs. 

1339 intermediates=frozen(allOutputs & allInputs | intermediateComponents), 

1340 outputs=frozen(allOutputs - allInputs - intermediateComposites), 

1341 prerequisites=frozen(prerequisites), 

1342 byTask=MappingProxyType(byTask), # MappingProxyType -> frozen view of dict for immutability 

1343 ) 

1344 

1345 @classmethod 

1346 def initOutputNames( 

1347 cls, 

1348 pipeline: Pipeline | Iterable[TaskDef], 

1349 *, 

1350 include_configs: bool = True, 

1351 include_packages: bool = True, 

1352 ) -> Iterator[str]: 

1353 """Return the names of dataset types ot task initOutputs, Configs, 

1354 and package versions for a pipeline. 

1355 

1356 Parameters 

1357 ---------- 

1358 pipeline: `Pipeline` or `~collections.abc.Iterable` [ `TaskDef` ] 

1359 A `Pipeline` instance or collection of `TaskDef` instances. 

1360 include_configs : `bool`, optional 

1361 If `True` (default) include config dataset types. 

1362 include_packages : `bool`, optional 

1363 If `True` (default) include the dataset type for package versions. 

1364 

1365 Yields 

1366 ------ 

1367 datasetTypeName : `str` 

1368 Name of the dataset type. 
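
Examples
--------
A sketch collecting every init-output dataset type name for a pipeline:

>>> names = set(PipelineDatasetTypes.initOutputNames(pipeline))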

1369 """ 

1370 if include_packages: 

1371 # Package versions dataset type 

1372 yield cls.packagesDatasetName 

1373 

1374 if isinstance(pipeline, Pipeline): 

1375 pipeline = pipeline.toExpandedPipeline() 

1376 

1377 for taskDef in pipeline: 

1378 # all task InitOutputs 

1379 for name in taskDef.connections.initOutputs: 

1380 attribute = getattr(taskDef.connections, name) 

1381 yield attribute.name 

1382 

1383 # config dataset name 

1384 if include_configs: 

1385 yield taskDef.configDatasetName