Coverage for python/lsst/pipe/base/pipeline.py: 21%

439 statements  


1# This file is part of pipe_base. 

2# 

3# Developed for the LSST Data Management System. 

4# This product includes software developed by the LSST Project 

5# (http://www.lsst.org). 

6# See the COPYRIGHT file at the top-level directory of this distribution 

7# for details of code ownership. 

8# 

9# This program is free software: you can redistribute it and/or modify 

10# it under the terms of the GNU General Public License as published by 

11# the Free Software Foundation, either version 3 of the License, or 

12# (at your option) any later version. 

13# 

14# This program is distributed in the hope that it will be useful, 

15# but WITHOUT ANY WARRANTY; without even the implied warranty of 

16# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the 

17# GNU General Public License for more details. 

18# 

19# You should have received a copy of the GNU General Public License 

20# along with this program. If not, see <http://www.gnu.org/licenses/>. 

21from __future__ import annotations 

22 

23"""Module defining Pipeline class and related methods. 

24""" 

25 

26__all__ = ["Pipeline", "TaskDef", "TaskDatasetTypes", "PipelineDatasetTypes", "LabelSpecifier"] 

27 

28import copy 

29import logging 

30import re 

31import urllib.parse 

32 

33# ------------------------------- 

34# Imports of standard modules -- 

35# ------------------------------- 

36from collections.abc import Callable, Generator, Iterable, Iterator, Mapping, Set 

37from dataclasses import dataclass 

38from types import MappingProxyType 

39from typing import TYPE_CHECKING, ClassVar, cast 

40 

41# ----------------------------- 

42# Imports for other modules -- 

43from lsst.daf.butler import ( 

44 DataCoordinate, 

45 DatasetType, 

46 DimensionUniverse, 

47 NamedValueSet, 

48 Registry, 

49 SkyPixDimension, 

50) 

51from lsst.resources import ResourcePath, ResourcePathExpression 

52from lsst.utils import doImportType 

53from lsst.utils.introspection import get_full_type_name 

54 

55from . import automatic_connection_constants as acc 

56from . import pipelineIR, pipeTools 

57from ._instrument import Instrument as PipeBaseInstrument 

58from ._task_metadata import TaskMetadata 

59from .config import PipelineTaskConfig 

60from .connections import iterConnections 

61from .connectionTypes import Input 

62from .pipelineTask import PipelineTask 

63from .task import _TASK_METADATA_TYPE 

64 

65if TYPE_CHECKING: # Imports needed only for type annotations; may be circular. 

66 from lsst.obs.base import Instrument 

67 from lsst.pex.config import Config 

68 

69# ---------------------------------- 

70# Local non-exported definitions -- 

71# ---------------------------------- 

72 

73_LOG = logging.getLogger(__name__) 

74 

75# ------------------------ 

76# Exported definitions -- 

77# ------------------------ 

78 

79 

80@dataclass 

81class LabelSpecifier: 

82 """A structure to specify a subset of labels to load 

83 

84 This structure may contain a set of labels to be used in subsetting a 

85 pipeline, or a beginning and end point. Beginning or end may be empty, 

86 in which case the range will be a half open interval. Unlike python 

87 iteration bounds, end bounds are *INCLUDED*. Note that range based 

88 selection is not well defined for pipelines that are not linear in nature, 

89 and correct behavior is not guaranteed, or may vary from run to run. 
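
Examples
--------
A minimal sketch of the two mutually exclusive ways to build a
specifier; the task labels used here are purely illustrative:

>>> # Select an explicit set of labels.
>>> spec = LabelSpecifier(labels={"isr", "characterizeImage"})
>>> # Or select an inclusive range of labels (end bound included).
>>> spec = LabelSpecifier(begin="isr", end="calibrate")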

90 """ 

91 

92 labels: set[str] | None = None 

93 begin: str | None = None 

94 end: str | None = None 

95 

96 def __post_init__(self) -> None: 

97 if self.labels is not None and (self.begin or self.end): 

98 raise ValueError( 

99 "This struct can only be initialized with a labels set or a begin (and/or) end specifier" 

100 ) 

101 

102 

103class TaskDef: 

104 """TaskDef is a collection of information about task needed by Pipeline. 

105 

106 The information includes task name, configuration object and optional 

107 task class. This class is just a collection of attributes and it exposes 

108 all of them so that attributes could potentially be modified in place 

109 (e.g. if configuration needs extra overrides). 

110 

111 Attributes 

112 ---------- 

113 taskName : `str`, optional 

114 The fully-qualified `PipelineTask` class name. If not provided, 

115 ``taskClass`` must be. 

116 config : `lsst.pipe.base.config.PipelineTaskConfig`, optional 

117 Instance of the configuration class corresponding to this task class, 

118 usually with all overrides applied. This config will be frozen. If 

119 not provided, ``taskClass`` must be provided and 

120 ``taskClass.ConfigClass()`` will be used. 

121 taskClass : `type`, optional 

122 `PipelineTask` class object; if provided and ``taskName`` is as well, 

123 the caller guarantees that they are consistent. If not provided, 

124 ``taskName`` is used to import the type. 

125 label : `str`, optional 

126 Task label, usually a short string unique in a pipeline. If not 

127 provided, ``taskClass`` must be, and ``taskClass._DefaultName`` will 

128 be used. 
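
Examples
--------
A minimal sketch; ``ExampleTask`` stands in for any concrete
`PipelineTask` subclass (the config and label then default from the
class itself):

>>> task_def = TaskDef(taskClass=ExampleTask)
>>> task_def = TaskDef(taskClass=ExampleTask, label="example")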

129 """ 

130 

131 def __init__( 

132 self, 

133 taskName: str | None = None, 

134 config: PipelineTaskConfig | None = None, 

135 taskClass: type[PipelineTask] | None = None, 

136 label: str | None = None, 

137 ): 

138 if taskName is None: 

139 if taskClass is None: 

140 raise ValueError("At least one of `taskName` and `taskClass` must be provided.") 

141 taskName = get_full_type_name(taskClass) 

142 elif taskClass is None: 

143 taskClass = doImportType(taskName) 

144 if config is None: 

145 if taskClass is None: 

146 raise ValueError("`taskClass` must be provided if `config` is not.") 

147 config = taskClass.ConfigClass() 

148 if label is None: 

149 if taskClass is None: 

150 raise ValueError("`taskClass` must be provided if `label` is not.") 

151 label = taskClass._DefaultName 

152 self.taskName = taskName 

153 try: 

154 config.validate() 

155 except Exception: 

156 _LOG.error("Configuration validation failed for task %s (%s)", label, taskName) 

157 raise 

158 config.freeze() 

159 self.config = config 

160 self.taskClass = taskClass 

161 self.label = label 

162 self.connections = config.connections.ConnectionsClass(config=config) 

163 

164 @property 

165 def configDatasetName(self) -> str: 

166 """Name of a dataset type for configuration of this task (`str`)""" 

167 return acc.CONFIG_INIT_OUTPUT_TEMPLATE.format(label=self.label) 

168 

169 @property 

170 def metadataDatasetName(self) -> str: 

171 """Name of a dataset type for metadata of this task (`str`)""" 

172 return self.makeMetadataDatasetName(self.label) 

173 

174 @classmethod 

175 def makeMetadataDatasetName(cls, label: str) -> str: 

176 """Construct the name of the dataset type for metadata for a task. 

177 

178 Parameters 

179 ---------- 

180 label : `str` 

181 Label for the task within its pipeline. 

182 

183 Returns 

184 ------- 

185 name : `str` 

186 Name of the task's metadata dataset type. 

187 """ 

188 return acc.METADATA_OUTPUT_TEMPLATE.format(label=label) 

189 

190 @property 

191 def logOutputDatasetName(self) -> str | None: 

192 """Name of a dataset type for log output from this task, `None` if 

193 logs are not to be saved (`str`) 

194 """ 

195 if cast(PipelineTaskConfig, self.config).saveLogOutput: 

196 return acc.LOG_OUTPUT_TEMPLATE.format(label=self.label) 

197 else: 

198 return None 

199 

200 def __str__(self) -> str: 

201 rep = "TaskDef(" + self.taskName 

202 if self.label: 

203 rep += ", label=" + self.label 

204 rep += ")" 

205 return rep 

206 

207 def __eq__(self, other: object) -> bool: 

208 if not isinstance(other, TaskDef): 

209 return False 

210 # This does not consider equality of configs when determining equality 

211 # as config equality is a difficult thing to define. Should be updated 

212 # after DM-27847 

213 return self.taskClass == other.taskClass and self.label == other.label 

214 

215 def __hash__(self) -> int: 

216 return hash((self.taskClass, self.label)) 

217 

218 @classmethod 

219 def _unreduce(cls, taskName: str, config: PipelineTaskConfig, label: str) -> TaskDef: 

220 """Unpickle pickle. Custom callable for unpickling. 

221 

222 All arguments are forwarded directly to the constructor; this 

223 trampoline is only needed because ``__reduce__`` callables can't be 

224 called with keyword arguments. 

225 """ 

226 return cls(taskName=taskName, config=config, label=label) 

227 

228 def __reduce__(self) -> tuple[Callable[[str, PipelineTaskConfig, str], TaskDef], tuple[str, Config, str]]: 

229 return (self._unreduce, (self.taskName, self.config, self.label)) 

230 

231 

232class Pipeline: 

233 """A `Pipeline` is a representation of a series of tasks to run, and the 

234 configuration for those tasks. 

235 

236 Parameters 

237 ---------- 

238 description : `str` 

239 A description of what this pipeline does. 
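
Examples
--------
A brief sketch of building a pipeline in code rather than loading it
from YAML; the task class path and config field are hypothetical:

>>> p = Pipeline("A short demonstration pipeline")
>>> p.addTask("mypackage.ExampleTask", label="example")
>>> p.addConfigOverride("example", "someField", 42)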

240 """ 

241 

242 def __init__(self, description: str): 

243 pipeline_dict = {"description": description, "tasks": {}} 

244 self._pipelineIR = pipelineIR.PipelineIR(pipeline_dict) 

245 

246 @classmethod 

247 def fromFile(cls, filename: str) -> Pipeline: 

248 """Load a pipeline defined in a pipeline yaml file. 

249 

250 Parameters 

251 ---------- 

252 filename: `str` 

253 A path that points to a pipeline defined in yaml format. This 

254 filename may also supply additional labels to be used in 

255 subsetting the loaded Pipeline. These labels are separated from 

256 the path by a ``#``, and may be specified as a comma separated 

257 list, or a range denoted as beginning..end. Beginning or end may 

258 be empty, in which case the range will be a half open interval. 

259 Unlike python iteration bounds, end bounds are *INCLUDED*. Note 

260 that range based selection is not well defined for pipelines that 

261 are not linear in nature, and correct behavior is not guaranteed, 

262 or may vary from run to run. 

263 

264 Returns 

265 ------- 

266 pipeline: `Pipeline` 

267 The pipeline loaded from specified location with appropriate (if 

268 any) subsetting. 

269 

270 Notes 

271 ----- 

272 This method attempts to prune any contracts that contain labels which 

273 are not in the declared subset of labels. This pruning is done using 

274 string-based matching due to the nature of contracts and may prune more 

275 than it should. 

276 """ 

277 return cls.from_uri(filename) 

278 

279 @classmethod 

280 def from_uri(cls, uri: ResourcePathExpression) -> Pipeline: 

281 """Load a pipeline defined in a pipeline yaml file at a location 

282 specified by a URI. 

283 

284 Parameters 

285 ---------- 

286 uri : convertible to `~lsst.resources.ResourcePath` 

287 If a string is supplied this should be a URI path that points to a 

288 pipeline defined in yaml format, either as a direct path to the 

289 yaml file, or as a directory containing a ``pipeline.yaml`` file 

290 (the form used by `write_to_uri` with ``expand=True``). This URI may 

291 also supply additional labels to be used in subsetting the loaded 

292 `Pipeline`. These labels are separated from the path by a ``#``, 

293 and may be specified as a comma separated list, or a range denoted 

294 as beginning..end. Beginning or end may be empty, in which case the 

295 range will be a half open interval. Unlike python iteration bounds, 

296 end bounds are *INCLUDED*. Note that range based selection is not 

297 well defined for pipelines that are not linear in nature, and 

298 correct behavior is not guaranteed, or may vary from run to run. 

299 The same specifiers can be used with a 

300 `~lsst.resources.ResourcePath` object, by being the sole contents 

301 of the ``fragment`` attribute. 

302 

303 Returns 

304 ------- 

305 pipeline : `Pipeline` 

306 The pipeline loaded from specified location with appropriate (if 

307 any) subsetting. 

308 

309 Notes 

310 ----- 

311 This method attempts to prune any contracts that contain labels which 

312 are not in the declared subset of labels. This pruning is done using 

313 string-based matching due to the nature of contracts and may prune more 

314 than it should. 
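
Examples
--------
A sketch of the URI fragment syntax described above (the file path and
task labels are hypothetical):

>>> # Load the full pipeline.
>>> p = Pipeline.from_uri("pipelines/example.yaml")
>>> # Load only two labeled tasks.
>>> p = Pipeline.from_uri("pipelines/example.yaml#isr,calibrate")
>>> # Load an inclusive range of tasks.
>>> p = Pipeline.from_uri("pipelines/example.yaml#isr..calibrate")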

315 """ 

316 # Split up the uri and any labels that were supplied 

317 uri, label_specifier = cls._parse_file_specifier(uri) 

318 pipeline: Pipeline = cls.fromIR(pipelineIR.PipelineIR.from_uri(uri)) 

319 

320 # If there are labels supplied, only keep those 

321 if label_specifier is not None: 

322 pipeline = pipeline.subsetFromLabels(label_specifier) 

323 return pipeline 

324 

325 def subsetFromLabels(self, labelSpecifier: LabelSpecifier) -> Pipeline: 

326 """Subset a pipeline to contain only labels specified in labelSpecifier 

327 

328 Parameters 

329 ---------- 

330 labelSpecifier : `labelSpecifier` 

331 Object containing labels that describes how to subset a pipeline. 

332 

333 Returns 

334 ------- 

335 pipeline : `Pipeline` 

336 A new pipeline object that is a subset of the old pipeline 

337 

338 Raises 

339 ------ 

340 ValueError 

341 Raised if there is an issue with specified labels 

342 

343 Notes 

344 ----- 

345 This method attempts to prune any contracts that contain labels which 

346 are not in the declared subset of labels. This pruning is done using 

347 string-based matching due to the nature of contracts and may prune more 

348 than it should. 
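
Examples
--------
A sketch using hypothetical task labels:

>>> subset = pipeline.subsetFromLabels(LabelSpecifier(labels={"isr"}))
>>> subset = pipeline.subsetFromLabels(LabelSpecifier(begin="isr", end="calibrate"))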

349 """ 

350 # Labels supplied as a set 

351 if labelSpecifier.labels: 

352 labelSet = labelSpecifier.labels 

353 # Labels supplied as a range, first create a list of all the labels 

354 # in the pipeline sorted according to task dependency. Then only 

355 # keep labels that lie between the supplied bounds 

356 else: 

357 # Create a copy of the pipeline to use when assessing the label 

358 # ordering. Use a dict for fast searching while preserving order. 

359 # Remove contracts so they do not fail in the expansion step. This 

360 # is needed because a user may only configure the tasks they intend 

361 # to run, which may cause some contracts to fail if they will later 

362 # be dropped 

363 pipeline = copy.deepcopy(self) 

364 pipeline._pipelineIR.contracts = [] 

365 labels = {taskdef.label: True for taskdef in pipeline.toExpandedPipeline()} 

366 

367 # Verify the bounds are in the labels 

368 if labelSpecifier.begin is not None: 

369 if labelSpecifier.begin not in labels: 

370 raise ValueError( 

371 f"Beginning of range subset, {labelSpecifier.begin}, not found in pipeline definition" 

372 ) 

373 if labelSpecifier.end is not None: 

374 if labelSpecifier.end not in labels: 

375 raise ValueError( 

376 f"End of range subset, {labelSpecifier.end}, not found in pipeline definition" 

377 ) 

378 

379 labelSet = set() 

380 for label in labels: 

381 if labelSpecifier.begin is not None: 

382 if label != labelSpecifier.begin: 

383 continue 

384 else: 

385 labelSpecifier.begin = None 

386 labelSet.add(label) 

387 if labelSpecifier.end is not None and label == labelSpecifier.end: 

388 break 

389 return Pipeline.fromIR(self._pipelineIR.subset_from_labels(labelSet)) 

390 

391 @staticmethod 

392 def _parse_file_specifier(uri: ResourcePathExpression) -> tuple[ResourcePath, LabelSpecifier | None]: 

393 """Split appart a uri and any possible label subsets""" 

394 if isinstance(uri, str): 

395 # This is to support legacy pipelines during transition 

396 uri, num_replace = re.subn("[:](?!\\/\\/)", "#", uri) 

397 if num_replace: 

398 raise ValueError( 

399 f"The pipeline file {uri} seems to use the legacy :" 

400 " to separate labels, please use # instead." 

401 ) 

402 if uri.count("#") > 1: 

403 raise ValueError("Only one set of labels is allowed when specifying a pipeline to load") 

404 # Everything else can be converted directly to ResourcePath. 

405 uri = ResourcePath(uri) 

406 label_subset = uri.fragment or None 

407 

408 specifier: LabelSpecifier | None 

409 if label_subset is not None: 

410 label_subset = urllib.parse.unquote(label_subset) 

411 args: dict[str, set[str] | str | None] 

412 # labels supplied as a list 

413 if "," in label_subset: 

414 if ".." in label_subset: 

415 raise ValueError( 

416 "Can only specify a list of labels or a rangewhen loading a Pipline not both" 

417 ) 

418 args = {"labels": set(label_subset.split(","))} 

419 # labels supplied as a range 

420 elif ".." in label_subset: 

421 # Try to de-structure the labelSubset, this will fail if more 

422 # than one range is specified 

423 begin, end, *rest = label_subset.split("..") 

424 if rest: 

425 raise ValueError("Only one range can be specified when loading a pipeline") 

426 args = {"begin": begin if begin else None, "end": end if end else None} 

427 # Assume anything else is a single label 

428 else: 

429 args = {"labels": {label_subset}} 

430 

431 # MyPy doesn't like how cavalier kwarg construction is with types. 

432 specifier = LabelSpecifier(**args) # type: ignore 

433 else: 

434 specifier = None 

435 

436 return uri, specifier 

437 

438 @classmethod 

439 def fromString(cls, pipeline_string: str) -> Pipeline: 

440 """Create a pipeline from string formatted as a pipeline document. 

441 

442 Parameters 

443 ---------- 

444 pipeline_string : `str` 

445 A string that is formatted like a pipeline document. 

446 

447 Returns 

448 ------- 

449 pipeline: `Pipeline` 

450 """ 

451 pipeline = cls.fromIR(pipelineIR.PipelineIR.from_string(pipeline_string)) 

452 return pipeline 

453 

454 @classmethod 

455 def fromIR(cls, deserialized_pipeline: pipelineIR.PipelineIR) -> Pipeline: 

456 """Create a pipeline from an already created `PipelineIR` object. 

457 

458 Parameters 

459 ---------- 

460 deserialized_pipeline: `PipelineIR` 

461 An already created pipeline intermediate representation object 

462 

463 Returns 

464 ------- 

465 pipeline: `Pipeline` 

466 """ 

467 pipeline = cls.__new__(cls) 

468 pipeline._pipelineIR = deserialized_pipeline 

469 return pipeline 

470 

471 @classmethod 

472 def fromPipeline(cls, pipeline: Pipeline) -> Pipeline: 

473 """Create a new pipeline by copying an already existing `Pipeline`. 

474 

475 Parameters 

476 ---------- 

477 pipeline: `Pipeline` 

478 An existing `Pipeline` object to copy. 

479 

480 Returns 

481 ------- 

482 pipeline: `Pipeline` 

483 """ 

484 return cls.fromIR(copy.deepcopy(pipeline._pipelineIR)) 

485 

486 def __str__(self) -> str: 

487 return str(self._pipelineIR) 

488 

489 def mergePipeline(self, pipeline: Pipeline) -> None: 

490 """Merge another in-memory `Pipeline` object into this one. 

491 

492 This merges another pipeline into this object, as if it were declared 

493 in the import block of the yaml definition of this pipeline. This 

494 modifies this pipeline in place. 

495 

496 Parameters 

497 ---------- 

498 pipeline : `Pipeline` 

499 The `Pipeline` object that is to be merged into this object. 

500 """ 

501 self._pipelineIR.merge_pipelines((pipeline._pipelineIR,)) 

502 

503 def addLabelToSubset(self, subset: str, label: str) -> None: 

504 """Add a task label from the specified subset. 

505 

506 Parameters 

507 ---------- 

508 subset : `str` 

509 The labeled subset to modify 

510 label : `str` 

511 The task label to add to the specified subset. 

512 

513 Raises 

514 ------ 

515 ValueError 

516 Raised if the specified subset does not exist within the pipeline. 

517 Raised if the specified label does not exist within the pipeline. 

518 """ 

519 if label not in self._pipelineIR.tasks: 

520 raise ValueError(f"Label {label} does not appear within the pipeline") 

521 if subset not in self._pipelineIR.labeled_subsets: 

522 raise ValueError(f"Subset {subset} does not appear within the pipeline") 

523 self._pipelineIR.labeled_subsets[subset].subset.add(label) 

524 

525 def removeLabelFromSubset(self, subset: str, label: str) -> None: 

526 """Remove a task label from the specified subset. 

527 

528 Parameters 

529 ---------- 

530 subset : `str` 

531 The labeled subset to modify 

532 label : `str` 

533 The task label to remove from the specified subset. 

534 

535 Raises 

536 ------ 

537 ValueError 

538 Raised if the specified subset does not exist in the pipeline. 

539 Raised if the specified label does not exist within the specified 

540 subset. 

541 """ 

542 if subset not in self._pipelineIR.labeled_subsets: 

543 raise ValueError(f"Subset {subset} does not appear within the pipeline") 

544 if label not in self._pipelineIR.labeled_subsets[subset].subset: 

545 raise ValueError(f"Label {label} does not appear within the pipeline") 

546 self._pipelineIR.labeled_subsets[subset].subset.remove(label) 

547 

548 def findSubsetsWithLabel(self, label: str) -> set[str]: 

549 """Find any subsets which may contain the specified label. 

550 

551 This function returns the names of subsets which contain the specified 

552 label. It may return an empty set if there are no subsets, or no subsets 

553 containing the specified label. 

554 

555 Parameters 

556 ---------- 

557 label : `str` 

558 The task label to use in membership check 

559 

560 Returns 

561 ------- 

562 subsets : `set` of `str` 

563 Returns a set (possibly empty) of subset names which contain the 

564 specified label. 

565 

566 Raises 

567 ------ 

568 ValueError 

569 Raised if the specified label does not exist within this pipeline. 

570 """ 

571 results = set() 

572 if label not in self._pipelineIR.tasks: 

573 raise ValueError(f"Label {label} does not appear within the pipeline") 

574 for subset in self._pipelineIR.labeled_subsets.values(): 

575 if label in subset.subset: 

576 results.add(subset.label) 

577 return results 

578 

579 def addInstrument(self, instrument: Instrument | str) -> None: 

580 """Add an instrument to the pipeline, or replace an instrument that is 

581 already defined. 

582 

583 Parameters 

584 ---------- 

585 instrument : `~lsst.obs.base.Instrument` or `str` 

586 Either an instance of a `lsst.obs.base.Instrument` subclass, or 

587 a string corresponding to a fully qualified 

588 `lsst.obs.base.Instrument` subclass name. 

589 """ 

590 if isinstance(instrument, str): 

591 pass 

592 else: 

593 # TODO: assume that this is a subclass of Instrument, no type 

594 # checking 

595 instrument = get_full_type_name(instrument) 

596 self._pipelineIR.instrument = instrument 

597 

598 def getInstrument(self) -> str | None: 

599 """Get the instrument from the pipeline. 

600 

601 Returns 

602 ------- 

603 instrument : `str`, or None 

604 The fully qualified name of a `lsst.obs.base.Instrument` subclass, 

605 or `None` if the pipeline does not have an instrument. 

606 """ 

607 return self._pipelineIR.instrument 

608 

609 def get_data_id(self, universe: DimensionUniverse) -> DataCoordinate: 

610 """Return a data ID with all dimension constraints embedded in the 

611 pipeline. 

612 

613 Parameters 

614 ---------- 

615 universe : `lsst.daf.butler.DimensionUniverse` 

616 Object that defines all dimensions. 

617 

618 Returns 

619 ------- 

620 data_id : `lsst.daf.butler.DataCoordinate` 

621 Data ID with all dimension constraints embedded in the 

622 pipeline. 

623 """ 

624 instrument_class_name = self._pipelineIR.instrument 

625 if instrument_class_name is not None: 

626 instrument_class = doImportType(instrument_class_name) 

627 if instrument_class is not None: 

628 return DataCoordinate.standardize(instrument=instrument_class.getName(), universe=universe) 

629 return DataCoordinate.makeEmpty(universe) 

630 

631 def addTask(self, task: type[PipelineTask] | str, label: str) -> None: 

632 """Add a new task to the pipeline, or replace a task that is already 

633 associated with the supplied label. 

634 

635 Parameters 

636 ---------- 

637 task: `PipelineTask` or `str` 

638 Either a derived class object of a `PipelineTask` or a string 

639 corresponding to a fully qualified `PipelineTask` name. 

640 label: `str` 

641 A label that is used to identify the `PipelineTask` being added 

642 """ 

643 if isinstance(task, str): 

644 taskName = task 

645 elif issubclass(task, PipelineTask): 

646 taskName = get_full_type_name(task) 

647 else: 

648 raise ValueError( 

649 "task must be either a child class of PipelineTask or a string containing" 

650 " a fully qualified name to one" 

651 ) 

652 if not label: 

653 # In some cases (e.g. a command line-generated pipeline) tasks can 

654 # be defined without a label, which is not acceptable; use the task's 

655 # _DefaultName in that case. 

656 if isinstance(task, str): 

657 task_class = doImportType(task) 

658 label = task_class._DefaultName 

659 self._pipelineIR.tasks[label] = pipelineIR.TaskIR(label, taskName) 

660 

661 def removeTask(self, label: str) -> None: 

662 """Remove a task from the pipeline. 

663 

664 Parameters 

665 ---------- 

666 label : `str` 

667 The label used to identify the task that is to be removed 

668 

669 Raises 

670 ------ 

671 KeyError 

672 Raised if no task with that label exists in the pipeline. 

673 

674 """ 

675 self._pipelineIR.tasks.pop(label) 

676 

677 def addConfigOverride(self, label: str, key: str, value: object) -> None: 

678 """Apply single config override. 

679 

680 Parameters 

681 ---------- 

682 label : `str` 

683 Label of the task. 

684 key: `str` 

685 Fully-qualified field name. 

686 value : object 

687 Value to be given to a field. 

688 """ 

689 self._addConfigImpl(label, pipelineIR.ConfigIR(rest={key: value})) 

690 

691 def addConfigFile(self, label: str, filename: str) -> None: 

692 """Add overrides from a specified file. 

693 

694 Parameters 

695 ---------- 

696 label : `str` 

697 The label used to identify the task associated with the config to 

698 modify. 

699 filename : `str` 

700 Path to the override file. 

701 """ 

702 self._addConfigImpl(label, pipelineIR.ConfigIR(file=[filename])) 

703 

704 def addConfigPython(self, label: str, pythonString: str) -> None: 

705 """Add Overrides by running a snippet of python code against a config. 

706 

707 Parameters 

708 ---------- 

709 label : `str` 

710 The label used to identify the task associated with the config to 

711 modify. 

712 pythonString : `str` 

713 A string of valid Python code to be executed. It is executed 

714 with ``config`` as the only locally accessible value. 
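
Examples
--------
A sketch with a hypothetical task label and config field; the snippet
is executed with ``config`` bound to that task's configuration:

>>> pipeline.addConfigPython("example", "config.someField = 2 * 21")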

715 """ 

716 self._addConfigImpl(label, pipelineIR.ConfigIR(python=pythonString)) 

717 

718 def _addConfigImpl(self, label: str, newConfig: pipelineIR.ConfigIR) -> None: 

719 if label == "parameters": 

720 self._pipelineIR.parameters.mapping.update(newConfig.rest) 

721 if newConfig.file: 

722 raise ValueError("Setting parameters section with config file is not supported") 

723 if newConfig.python: 

724 raise ValueError("Setting parameters section using python block in unsupported") 

725 return 

726 if label not in self._pipelineIR.tasks: 

727 raise LookupError(f"There are no tasks labeled '{label}' in the pipeline") 

728 self._pipelineIR.tasks[label].add_or_update_config(newConfig) 

729 

730 def write_to_uri(self, uri: ResourcePathExpression) -> None: 

731 """Write the pipeline to a file or directory. 

732 

733 Parameters 

734 ---------- 

735 uri : convertible to `~lsst.resources.ResourcePath` 

736 URI to write to; may have any scheme with 

737 `~lsst.resources.ResourcePath` write support or no scheme for a 

738 local file/directory. Should have a ``.yaml`` extension. 

739 """ 

740 self._pipelineIR.write_to_uri(uri) 

741 

742 def toExpandedPipeline(self) -> Generator[TaskDef, None, None]: 

743 r"""Return a generator of `TaskDef`\s which can be used to create 

744 quantum graphs. 

745 

746 Returns 

747 ------- 

748 generator : generator of `TaskDef` 

749 The generator returned will be the sorted iterator of tasks which 

750 are to be used in constructing a quantum graph. 

751 

752 Raises 

753 ------ 

754 NotImplementedError 

755 Raised if a dataId is supplied in a config block. This is in place 

756 for future use. 
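
Examples
--------
A sketch of iterating over the expanded pipeline:

>>> for task_def in pipeline.toExpandedPipeline():
...     print(task_def.label, task_def.taskName)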

757 """ 

758 taskDefs = [] 

759 for label in self._pipelineIR.tasks: 

760 taskDefs.append(self._buildTaskDef(label)) 

761 

762 # Let's evaluate the contracts 

763 if self._pipelineIR.contracts is not None: 

764 label_to_config = {x.label: x.config for x in taskDefs} 

765 for contract in self._pipelineIR.contracts: 

766 # execute this in its own line so it can raise a good error 

767 # message if there were problems with the eval 

768 success = eval(contract.contract, None, label_to_config) 

769 if not success: 

770 extra_info = f": {contract.msg}" if contract.msg is not None else "" 

771 raise pipelineIR.ContractError( 

772 f"Contract(s) '{contract.contract}' were not satisfied{extra_info}" 

773 ) 

774 

775 taskDefs = sorted(taskDefs, key=lambda x: x.label) 

776 yield from pipeTools.orderPipeline(taskDefs) 

777 

778 def _buildTaskDef(self, label: str) -> TaskDef: 

779 if (taskIR := self._pipelineIR.tasks.get(label)) is None: 

780 raise NameError(f"Label {label} does not appear in this pipeline") 

781 taskClass: type[PipelineTask] = doImportType(taskIR.klass) 

782 taskName = get_full_type_name(taskClass) 

783 config = taskClass.ConfigClass() 

784 instrument: PipeBaseInstrument | None = None 

785 if (instrumentName := self._pipelineIR.instrument) is not None: 

786 instrument_cls: type = doImportType(instrumentName) 

787 instrument = instrument_cls() 

788 config.applyConfigOverrides( 

789 instrument, 

790 getattr(taskClass, "_DefaultName", ""), 

791 taskIR.config, 

792 self._pipelineIR.parameters, 

793 label, 

794 ) 

795 return TaskDef(taskName=taskName, config=config, taskClass=taskClass, label=label) 

796 

797 def __iter__(self) -> Generator[TaskDef, None, None]: 

798 return self.toExpandedPipeline() 

799 

800 def __getitem__(self, item: str) -> TaskDef: 

801 return self._buildTaskDef(item) 

802 

803 def __len__(self) -> int: 

804 return len(self._pipelineIR.tasks) 

805 

806 def __eq__(self, other: object) -> bool: 

807 if not isinstance(other, Pipeline): 

808 return False 

809 elif self._pipelineIR == other._pipelineIR: 

810 # Shortcut: if the IR is the same, the expanded pipeline must be 

811 # the same as well. But the converse is not true. 

812 return True 

813 else: 

814 self_expanded = {td.label: (td.taskClass,) for td in self} 

815 other_expanded = {td.label: (td.taskClass,) for td in other} 

816 if self_expanded != other_expanded: 

817 return False 

818 # After DM-27847, we should compare configuration here, or better, 

819 # delegate to TaskDef.__eq__ after making that compare configurations. 

820 raise NotImplementedError( 

821 "Pipelines cannot be compared because config instances cannot be compared; see DM-27847." 

822 ) 

823 

824 

825@dataclass(frozen=True) 

826class TaskDatasetTypes: 

827 """An immutable struct that extracts and classifies the dataset types used 

828 by a `PipelineTask` 

829 """ 

830 

831 initInputs: NamedValueSet[DatasetType] 

832 """Dataset types that are needed as inputs in order to construct this Task. 

833 

834 Task-level `initInputs` may be classified as either 

835 `~PipelineDatasetTypes.initInputs` or 

836 `~PipelineDatasetTypes.initIntermediates` at the Pipeline level. 

837 """ 

838 

839 initOutputs: NamedValueSet[DatasetType] 

840 """Dataset types that may be written after constructing this Task. 

841 

842 Task-level `initOutputs` may be classified as either 

843 `~PipelineDatasetTypes.initOutputs` or 

844 `~PipelineDatasetTypes.initIntermediates` at the Pipeline level. 

845 """ 

846 

847 inputs: NamedValueSet[DatasetType] 

848 """Dataset types that are regular inputs to this Task. 

849 

850 If an input dataset needed for a Quantum cannot be found in the input 

851 collection(s) or produced by another Task in the Pipeline, that Quantum 

852 (and all dependent Quanta) will not be produced. 

853 

854 Task-level `inputs` may be classified as either 

855 `~PipelineDatasetTypes.inputs` or `~PipelineDatasetTypes.intermediates` 

856 at the Pipeline level. 

857 """ 

858 

859 queryConstraints: NamedValueSet[DatasetType] 

860 """Regular inputs that should not be used as constraints on the initial 

861 QuantumGraph generation data ID query, according to their tasks 

862 (`NamedValueSet`). 

863 """ 

864 

865 prerequisites: NamedValueSet[DatasetType] 

866 """Dataset types that are prerequisite inputs to this Task. 

867 

868 Prerequisite inputs must exist in the input collection(s) before the 

869 pipeline is run, but do not constrain the graph - if a prerequisite is 

870 missing for a Quantum, `PrerequisiteMissingError` is raised. 

871 

872 Prerequisite inputs are not resolved until the second stage of 

873 QuantumGraph generation. 

874 """ 

875 

876 outputs: NamedValueSet[DatasetType] 

877 """Dataset types that are produced by this Task. 

878 

879 Task-level `outputs` may be classified as either 

880 `~PipelineDatasetTypes.outputs` or `~PipelineDatasetTypes.intermediates` 

881 at the Pipeline level. 

882 """ 

883 

884 @classmethod 

885 def fromTaskDef( 

886 cls, 

887 taskDef: TaskDef, 

888 *, 

889 registry: Registry, 

890 include_configs: bool = True, 

891 storage_class_mapping: Mapping[str, str] | None = None, 

892 ) -> TaskDatasetTypes: 

893 """Extract and classify the dataset types from a single `PipelineTask`. 

894 

895 Parameters 

896 ---------- 

897 taskDef: `TaskDef` 

898 An instance of a `TaskDef` class for a particular `PipelineTask`. 

899 registry: `Registry` 

900 Registry used to construct normalized 

901 `~lsst.daf.butler.DatasetType` objects and retrieve those that are 

902 incomplete. 

903 include_configs : `bool`, optional 

904 If `True` (default) include config dataset types as 

905 ``initOutputs``. 

906 storage_class_mapping : `~collections.abc.Mapping` of `str` to \ 

907 `StorageClass`, optional 

908 If a taskdef contains a component dataset type that is unknown 

909 to the registry, its parent `~lsst.daf.butler.StorageClass` will 

910 be looked up in this mapping if it is supplied. If the mapping does 

911 not contain the composite dataset type, or the mapping is not 

912 supplied, an exception will be raised. 

913 

914 Returns 

915 ------- 

916 types: `TaskDatasetTypes` 

917 The dataset types used by this task. 

918 

919 Raises 

920 ------ 

921 ValueError 

922 Raised if dataset type connection definition differs from 

923 registry definition. 

924 LookupError 

925 Raised if component parent StorageClass could not be determined 

926 and storage_class_mapping does not contain the composite type, or 

927 is set to None. 
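
Examples
--------
A sketch, assuming ``task_def`` is an existing `TaskDef` and ``butler``
is an existing `~lsst.daf.butler.Butler`:

>>> types = TaskDatasetTypes.fromTaskDef(task_def, registry=butler.registry)
>>> print(types.inputs.names)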

928 """ 

929 

930 def makeDatasetTypesSet( 

931 connectionType: str, 

932 is_input: bool, 

933 freeze: bool = True, 

934 ) -> NamedValueSet[DatasetType]: 

935 """Construct a set of true `~lsst.daf.butler.DatasetType` objects. 

936 

937 Parameters 

938 ---------- 

939 connectionType : `str` 

940 Name of the connection type to produce a set for, corresponds 

941 to an attribute of type `list` on the connection class instance 

942 is_input : `bool` 

943 These are input dataset types, else they are output dataset 

944 types. 

945 freeze : `bool`, optional 

946 If `True`, call `NamedValueSet.freeze` on the object returned. 

947 

948 Returns 

949 ------- 

950 datasetTypes : `NamedValueSet` 

951 A set of all datasetTypes which correspond to the input 

952 connection type specified in the connection class of this 

953 `PipelineTask` 

954 

955 Raises 

956 ------ 

957 ValueError 

958 Raised if dataset type connection definition differs from 

959 registry definition. 

960 LookupError 

961 Raised if component parent StorageClass could not be determined 

962 and storage_class_mapping does not contain the composite type, 

963 or is set to None. 

964 

965 Notes 

966 ----- 

967 This function is a closure over the variables ``registry``, 

968 ``taskDef``, and ``storage_class_mapping``. 

969 """ 

970 datasetTypes = NamedValueSet[DatasetType]() 

971 for c in iterConnections(taskDef.connections, connectionType): 

972 dimensions = set(getattr(c, "dimensions", set())) 

973 if "skypix" in dimensions: 

974 try: 

975 datasetType = registry.getDatasetType(c.name) 

976 except LookupError as err: 

977 raise LookupError( 

978 f"DatasetType '{c.name}' referenced by " 

979 f"{type(taskDef.connections).__name__} uses 'skypix' as a dimension " 

980 "placeholder, but does not already exist in the registry. " 

981 "Note that reference catalog names are now used as the dataset " 

982 "type name instead of 'ref_cat'." 

983 ) from err 

984 rest1 = set(registry.dimensions.extract(dimensions - set(["skypix"])).names) 

985 rest2 = set( 

986 dim.name for dim in datasetType.dimensions if not isinstance(dim, SkyPixDimension) 

987 ) 

988 if rest1 != rest2: 

989 raise ValueError( 

990 f"Non-skypix dimensions for dataset type {c.name} declared in " 

991 f"connections ({rest1}) are inconsistent with those in " 

992 f"registry's version of this dataset ({rest2})." 

993 ) 

994 else: 

995 # Component dataset types are not explicitly in the 

996 # registry. This complicates consistency checks with 

997 # registry and requires we work out the composite storage 

998 # class. 

999 registryDatasetType = None 

1000 try: 

1001 registryDatasetType = registry.getDatasetType(c.name) 

1002 except KeyError: 

1003 compositeName, componentName = DatasetType.splitDatasetTypeName(c.name) 

1004 if componentName: 

1005 if storage_class_mapping is None or compositeName not in storage_class_mapping: 

1006 raise LookupError( 

1007 "Component parent class cannot be determined, and " 

1008 "composite name was not in storage class mapping, or no " 

1009 "storage_class_mapping was supplied" 

1010 ) 

1011 else: 

1012 parentStorageClass = storage_class_mapping[compositeName] 

1013 else: 

1014 parentStorageClass = None 

1015 datasetType = c.makeDatasetType( 

1016 registry.dimensions, parentStorageClass=parentStorageClass 

1017 ) 

1018 registryDatasetType = datasetType 

1019 else: 

1020 datasetType = c.makeDatasetType( 

1021 registry.dimensions, parentStorageClass=registryDatasetType.parentStorageClass 

1022 ) 

1023 

1024 if registryDatasetType and datasetType != registryDatasetType: 

1025 # The dataset types differ but first check to see if 

1026 # they are compatible before raising. 

1027 if is_input: 

1028 # This DatasetType must be compatible on get. 

1029 is_compatible = datasetType.is_compatible_with(registryDatasetType) 

1030 else: 

1031 # Has to be able to be converted to the expected type 

1032 # on put. 

1033 is_compatible = registryDatasetType.is_compatible_with(datasetType) 

1034 if is_compatible: 

1035 # For inputs we want the pipeline to use the 

1036 # pipeline definition, for outputs it should use 

1037 # the registry definition. 

1038 if not is_input: 

1039 datasetType = registryDatasetType 

1040 _LOG.debug( 

1041 "Dataset types differ (task %s != registry %s) but are compatible" 

1042 " for %s in %s.", 

1043 datasetType, 

1044 registryDatasetType, 

1045 "input" if is_input else "output", 

1046 taskDef.label, 

1047 ) 

1048 else: 

1049 try: 

1050 # Explicitly check for storage class just to 

1051 # make more specific message. 

1052 _ = datasetType.storageClass 

1053 except KeyError: 

1054 raise ValueError( 

1055 "Storage class does not exist for supplied dataset type " 

1056 f"{datasetType} for {taskDef.label}." 

1057 ) from None 

1058 raise ValueError( 

1059 f"Supplied dataset type ({datasetType}) inconsistent with " 

1060 f"registry definition ({registryDatasetType}) " 

1061 f"for {taskDef.label}." 

1062 ) 

1063 datasetTypes.add(datasetType) 

1064 if freeze: 

1065 datasetTypes.freeze() 

1066 return datasetTypes 

1067 

1068 # optionally add initOutput dataset for config 

1069 initOutputs = makeDatasetTypesSet("initOutputs", is_input=False, freeze=False) 

1070 if include_configs: 

1071 initOutputs.add( 

1072 DatasetType( 

1073 taskDef.configDatasetName, 

1074 registry.dimensions.empty, 

1075 storageClass="Config", 

1076 ) 

1077 ) 

1078 initOutputs.freeze() 

1079 

1080 # optionally add output dataset for metadata 

1081 outputs = makeDatasetTypesSet("outputs", is_input=False, freeze=False) 

1082 

1083 # Metadata is supposed to be of the TaskMetadata type; its dimensions 

1084 # correspond to a task quantum. 

1085 dimensions = registry.dimensions.extract(taskDef.connections.dimensions) 

1086 

1087 # Allow the storage class definition to be read from the existing 

1088 # dataset type definition if present. 

1089 try: 

1090 current = registry.getDatasetType(taskDef.metadataDatasetName) 

1091 except KeyError: 

1092 # No previous definition so use the default. 

1093 storageClass = "TaskMetadata" if _TASK_METADATA_TYPE is TaskMetadata else "PropertySet" 

1094 else: 

1095 storageClass = current.storageClass.name 

1096 outputs.update({DatasetType(taskDef.metadataDatasetName, dimensions, storageClass)}) 

1097 

1098 if taskDef.logOutputDatasetName is not None: 

1099 # Log output dimensions correspond to a task quantum. 

1100 dimensions = registry.dimensions.extract(taskDef.connections.dimensions) 

1101 outputs.update({DatasetType(taskDef.logOutputDatasetName, dimensions, "ButlerLogRecords")}) 

1102 

1103 outputs.freeze() 

1104 

1105 inputs = makeDatasetTypesSet("inputs", is_input=True) 

1106 queryConstraints = NamedValueSet( 

1107 inputs[c.name] 

1108 for c in cast(Iterable[Input], iterConnections(taskDef.connections, "inputs")) 

1109 if not c.deferGraphConstraint 

1110 ) 

1111 

1112 return cls( 

1113 initInputs=makeDatasetTypesSet("initInputs", is_input=True), 

1114 initOutputs=initOutputs, 

1115 inputs=inputs, 

1116 queryConstraints=queryConstraints, 

1117 prerequisites=makeDatasetTypesSet("prerequisiteInputs", is_input=True), 

1118 outputs=outputs, 

1119 ) 

1120 

1121 

1122@dataclass(frozen=True) 

1123class PipelineDatasetTypes: 

1124 """An immutable struct that classifies the dataset types used in a 

1125 `Pipeline`. 

1126 """ 

1127 

1128 packagesDatasetName: ClassVar[str] = "packages" 

1129 """Name of a dataset type used to save package versions. 

1130 """ 

1131 

1132 initInputs: NamedValueSet[DatasetType] 

1133 """Dataset types that are needed as inputs in order to construct the Tasks 

1134 in this Pipeline. 

1135 

1136 This does not include dataset types that are produced when constructing 

1137 other Tasks in the Pipeline (these are classified as `initIntermediates`). 

1138 """ 

1139 

1140 initOutputs: NamedValueSet[DatasetType] 

1141 """Dataset types that may be written after constructing the Tasks in this 

1142 Pipeline. 

1143 

1144 This does not include dataset types that are also used as inputs when 

1145 constructing other Tasks in the Pipeline (these are classified as 

1146 `initIntermediates`). 

1147 """ 

1148 

1149 initIntermediates: NamedValueSet[DatasetType] 

1150 """Dataset types that are both used when constructing one or more Tasks 

1151 in the Pipeline and produced as a side-effect of constructing another 

1152 Task in the Pipeline. 

1153 """ 

1154 

1155 inputs: NamedValueSet[DatasetType] 

1156 """Dataset types that are regular inputs for the full pipeline. 

1157 

1158 If an input dataset needed for a Quantum cannot be found in the input 

1159 collection(s), that Quantum (and all dependent Quanta) will not be 

1160 produced. 

1161 """ 

1162 

1163 queryConstraints: NamedValueSet[DatasetType] 

1164 """Regular inputs that should be used as constraints on the initial 

1165 QuantumGraph generation data ID query, according to their tasks 

1166 (`NamedValueSet`). 

1167 """ 

1168 

1169 prerequisites: NamedValueSet[DatasetType] 

1170 """Dataset types that are prerequisite inputs for the full Pipeline. 

1171 

1172 Prerequisite inputs must exist in the input collection(s) before the 

1173 pipeline is run, but do not constrain the graph - if a prerequisite is 

1174 missing for a Quantum, `PrerequisiteMissingError` is raised. 

1175 

1176 Prerequisite inputs are not resolved until the second stage of 

1177 QuantumGraph generation. 

1178 """ 

1179 

1180 intermediates: NamedValueSet[DatasetType] 

1181 """Dataset types that are output by one Task in the Pipeline and consumed 

1182 as inputs by one or more other Tasks in the Pipeline. 

1183 """ 

1184 

1185 outputs: NamedValueSet[DatasetType] 

1186 """Dataset types that are output by a Task in the Pipeline and not consumed 

1187 by any other Task in the Pipeline. 

1188 """ 

1189 

1190 byTask: Mapping[str, TaskDatasetTypes] 

1191 """Per-Task dataset types, keyed by label in the `Pipeline`. 

1192 

1193 This is guaranteed to be zip-iterable with the `Pipeline` itself (assuming 

1194 neither has been modified since the dataset types were extracted, of 

1195 course). 

1196 """ 

1197 

1198 @classmethod 

1199 def fromPipeline( 

1200 cls, 

1201 pipeline: Pipeline | Iterable[TaskDef], 

1202 *, 

1203 registry: Registry, 

1204 include_configs: bool = True, 

1205 include_packages: bool = True, 

1206 ) -> PipelineDatasetTypes: 

1207 """Extract and classify the dataset types from all tasks in a 

1208 `Pipeline`. 

1209 

1210 Parameters 

1211 ---------- 

1212 pipeline: `Pipeline` or `~collections.abc.Iterable` [ `TaskDef` ] 

1213 A collection of tasks that can be run together. 

1214 registry: `Registry` 

1215 Registry used to construct normalized 

1216 `~lsst.daf.butler.DatasetType` objects and retrieve those that are 

1217 incomplete. 

1218 include_configs : `bool`, optional 

1219 If `True` (default) include config dataset types as 

1220 ``initOutputs``. 

1221 include_packages : `bool`, optional 

1222 If `True` (default) include the dataset type for software package 

1223 versions in ``initOutputs``. 

1224 

1225 Returns 

1226 ------- 

1227 types: `PipelineDatasetTypes` 

1228 The dataset types used by this `Pipeline`. 

1229 

1230 Raises 

1231 ------ 

1232 ValueError 

1233 Raised if Tasks are inconsistent about which datasets are marked 

1234 prerequisite. This indicates that the Tasks cannot be run as part 

1235 of the same `Pipeline`. 
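
Examples
--------
A sketch, assuming ``pipeline`` is an existing `Pipeline`, ``butler``
is an existing `~lsst.daf.butler.Butler`, and ``example`` is a
hypothetical task label:

>>> dataset_types = PipelineDatasetTypes.fromPipeline(pipeline, registry=butler.registry)
>>> print(dataset_types.inputs.names)
>>> print(dataset_types.byTask["example"].outputs.names)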

1236 """ 

1237 allInputs = NamedValueSet[DatasetType]() 

1238 allOutputs = NamedValueSet[DatasetType]() 

1239 allInitInputs = NamedValueSet[DatasetType]() 

1240 allInitOutputs = NamedValueSet[DatasetType]() 

1241 prerequisites = NamedValueSet[DatasetType]() 

1242 queryConstraints = NamedValueSet[DatasetType]() 

1243 byTask = dict() 

1244 if include_packages: 

1245 allInitOutputs.add( 

1246 DatasetType( 

1247 cls.packagesDatasetName, 

1248 registry.dimensions.empty, 

1249 storageClass="Packages", 

1250 ) 

1251 ) 

1252 # create a list of TaskDefs in case the input is a generator 

1253 pipeline = list(pipeline) 

1254 

1255 # collect all the output dataset types 

1256 typeStorageclassMap: dict[str, str] = {} 

1257 for taskDef in pipeline: 

1258 for outConnection in iterConnections(taskDef.connections, "outputs"): 

1259 typeStorageclassMap[outConnection.name] = outConnection.storageClass 

1260 

1261 for taskDef in pipeline: 

1262 thisTask = TaskDatasetTypes.fromTaskDef( 

1263 taskDef, 

1264 registry=registry, 

1265 include_configs=include_configs, 

1266 storage_class_mapping=typeStorageclassMap, 

1267 ) 

1268 allInitInputs.update(thisTask.initInputs) 

1269 allInitOutputs.update(thisTask.initOutputs) 

1270 allInputs.update(thisTask.inputs) 

1271 # Inputs are query constraints if any task considers them a query 

1272 # constraint. 

1273 queryConstraints.update(thisTask.queryConstraints) 

1274 prerequisites.update(thisTask.prerequisites) 

1275 allOutputs.update(thisTask.outputs) 

1276 byTask[taskDef.label] = thisTask 

1277 if not prerequisites.isdisjoint(allInputs): 

1278 raise ValueError( 

1279 "{} marked as both prerequisites and regular inputs".format( 

1280 {dt.name for dt in allInputs & prerequisites} 

1281 ) 

1282 ) 

1283 if not prerequisites.isdisjoint(allOutputs): 

1284 raise ValueError( 

1285 "{} marked as both prerequisites and outputs".format( 

1286 {dt.name for dt in allOutputs & prerequisites} 

1287 ) 

1288 ) 

1289 # Make sure that components which are marked as inputs get treated as 

1290 # intermediates if there is an output which produces the composite 

1291 # containing the component 

1292 intermediateComponents = NamedValueSet[DatasetType]() 

1293 intermediateComposites = NamedValueSet[DatasetType]() 

1294 outputNameMapping = {dsType.name: dsType for dsType in allOutputs} 

1295 for dsType in allInputs: 

1296 # get the name of a possible component 

1297 name, component = dsType.nameAndComponent() 

1298 # if there is a component name, that means this is a component 

1299 # DatasetType, if there is an output which produces the parent of 

1300 # this component, treat this input as an intermediate 

1301 if component is not None: 

1302 # This needs to be in this if block, because someone might have 

1303 # a composite that is a pure input from existing data 

1304 if name in outputNameMapping: 

1305 intermediateComponents.add(dsType) 

1306 intermediateComposites.add(outputNameMapping[name]) 

1307 

1308 def checkConsistency(a: NamedValueSet, b: NamedValueSet) -> None: 

1309 common = a.names & b.names 

1310 for name in common: 

1311 # Any compatibility is allowed. This function does not know 

1312 # if a dataset type is to be used for input or output. 

1313 if not (a[name].is_compatible_with(b[name]) or b[name].is_compatible_with(a[name])): 

1314 raise ValueError(f"Conflicting definitions for dataset type: {a[name]} != {b[name]}.") 

1315 

1316 checkConsistency(allInitInputs, allInitOutputs) 

1317 checkConsistency(allInputs, allOutputs) 

1318 checkConsistency(allInputs, intermediateComposites) 

1319 checkConsistency(allOutputs, intermediateComposites) 

1320 

1321 def frozen(s: Set[DatasetType]) -> NamedValueSet[DatasetType]: 

1322 assert isinstance(s, NamedValueSet) 

1323 s.freeze() 

1324 return s 

1325 

1326 inputs = frozen(allInputs - allOutputs - intermediateComponents) 

1327 

1328 return cls( 

1329 initInputs=frozen(allInitInputs - allInitOutputs), 

1330 initIntermediates=frozen(allInitInputs & allInitOutputs), 

1331 initOutputs=frozen(allInitOutputs - allInitInputs), 

1332 inputs=inputs, 

1333 queryConstraints=frozen(queryConstraints & inputs), 

1334 # If there are storage class differences in inputs and outputs 

1335 # the intermediates have to choose priority. Here choose that 

1336 # inputs to tasks must match the requested storage class by 

1337 # applying the inputs over the top of the outputs. 

1338 intermediates=frozen(allOutputs & allInputs | intermediateComponents), 

1339 outputs=frozen(allOutputs - allInputs - intermediateComposites), 

1340 prerequisites=frozen(prerequisites), 

1341 byTask=MappingProxyType(byTask), # MappingProxyType -> frozen view of dict for immutability 

1342 ) 

1343 

1344 @classmethod 

1345 def initOutputNames( 

1346 cls, 

1347 pipeline: Pipeline | Iterable[TaskDef], 

1348 *, 

1349 include_configs: bool = True, 

1350 include_packages: bool = True, 

1351 ) -> Iterator[str]: 

1352 """Return the names of dataset types ot task initOutputs, Configs, 

1353 and package versions for a pipeline. 

1354 

1355 Parameters 

1356 ---------- 

1357 pipeline: `Pipeline` or `~collections.abc.Iterable` [ `TaskDef` ] 

1358 A `Pipeline` instance or collection of `TaskDef` instances. 

1359 include_configs : `bool`, optional 

1360 If `True` (default) include config dataset types. 

1361 include_packages : `bool`, optional 

1362 If `True` (default) include the dataset type for package versions. 

1363 

1364 Yields 

1365 ------ 

1366 datasetTypeName : `str` 

1367 Name of the dataset type. 
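
Examples
--------
A sketch, assuming ``pipeline`` is an existing `Pipeline`:

>>> names = list(PipelineDatasetTypes.initOutputNames(pipeline))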

1368 """ 

1369 if include_packages: 

1370 # Package versions dataset type 

1371 yield cls.packagesDatasetName 

1372 

1373 if isinstance(pipeline, Pipeline): 

1374 pipeline = pipeline.toExpandedPipeline() 

1375 

1376 for taskDef in pipeline: 

1377 # all task InitOutputs 

1378 for name in taskDef.connections.initOutputs: 

1379 attribute = getattr(taskDef.connections, name) 

1380 yield attribute.name 

1381 

1382 # config dataset name 

1383 if include_configs: 

1384 yield taskDef.configDatasetName