Coverage for python/lsst/pipe/base/pipeline.py: 23%

454 statements  

coverage.py v7.3.2, created at 2023-11-30 12:09 +0000

1# This file is part of pipe_base. 

2# 

3# Developed for the LSST Data Management System. 

4# This product includes software developed by the LSST Project 

5# (http://www.lsst.org). 

6# See the COPYRIGHT file at the top-level directory of this distribution 

7# for details of code ownership. 

8# 

9# This software is dual licensed under the GNU General Public License and also 

10# under a 3-clause BSD license. Recipients may choose which of these licenses 

11# to use; please see the files gpl-3.0.txt and/or bsd_license.txt, 

12# respectively. If you choose the GPL option then the following text applies 

13# (but note that there is still no warranty even if you opt for BSD instead): 

14# 

15# This program is free software: you can redistribute it and/or modify 

16# it under the terms of the GNU General Public License as published by 

17# the Free Software Foundation, either version 3 of the License, or 

18# (at your option) any later version. 

19# 

20# This program is distributed in the hope that it will be useful, 

21# but WITHOUT ANY WARRANTY; without even the implied warranty of 

22# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the 

23# GNU General Public License for more details. 

24# 

25# You should have received a copy of the GNU General Public License 

26# along with this program. If not, see <http://www.gnu.org/licenses/>. 

27 

28"""Module defining Pipeline class and related methods. 

29""" 

30 

31from __future__ import annotations 

32 

33__all__ = ["Pipeline", "TaskDef", "TaskDatasetTypes", "PipelineDatasetTypes", "LabelSpecifier"] 

34 

35import copy 

36import logging 

37import re 

38import urllib.parse 

39 

40# ------------------------------- 

41# Imports of standard modules -- 

42# ------------------------------- 

43from collections.abc import Callable, Generator, Iterable, Iterator, Mapping, Set 

44from dataclasses import dataclass 

45from types import MappingProxyType 

46from typing import TYPE_CHECKING, ClassVar, cast 

47 

48# ----------------------------- 

49# Imports for other modules -- 

50from lsst.daf.butler import DataCoordinate, DatasetType, DimensionUniverse, NamedValueSet, Registry 

51from lsst.resources import ResourcePath, ResourcePathExpression 

52from lsst.utils import doImportType 

53from lsst.utils.introspection import get_full_type_name 

54 

55from . import automatic_connection_constants as acc 

56from . import pipeline_graph, pipelineIR 

57from ._instrument import Instrument as PipeBaseInstrument 

58from .config import PipelineTaskConfig 

59from .connections import PipelineTaskConnections, iterConnections 

60from .connectionTypes import Input 

61from .pipelineTask import PipelineTask 

62 

63if TYPE_CHECKING: # Imports needed only for type annotations; may be circular. 

64 from lsst.obs.base import Instrument 

65 from lsst.pex.config import Config 

66 

67# ---------------------------------- 

68# Local non-exported definitions -- 

69# ---------------------------------- 

70 

71_LOG = logging.getLogger(__name__) 

72 

73# ------------------------ 

74# Exported definitions -- 

75# ------------------------ 

76 

77 

78@dataclass 

79class LabelSpecifier: 

80 """A structure to specify a subset of labels to load 

81 

82 This structure may contain a set of labels to be used in subsetting a 

83 pipeline, or a beginning and end point. Beginning or end may be empty, 

84 in which case the range will be a half open interval. Unlike python 

85 iteration bounds, end bounds are *INCLUDED*. Note that range based 

86 selection is not well defined for pipelines that are not linear in nature, 

87 and correct behavior is not guaranteed, or may vary from run to run. 
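
Examples
--------
A minimal sketch of the two mutually exclusive forms, using the
hypothetical task labels ``isr`` and ``calibrate``:

>>> by_set = LabelSpecifier(labels={"isr", "calibrate"})  # explicit label set
>>> by_range = LabelSpecifier(begin="isr", end="calibrate")  # inclusive range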

88 """ 

89 

90 labels: set[str] | None = None 

91 begin: str | None = None 

92 end: str | None = None 

93 

94 def __post_init__(self) -> None: 

95 if self.labels is not None and (self.begin or self.end): 

96 raise ValueError( 

97 "This struct can only be initialized with a labels set or a begin (and/or) end specifier" 

98 ) 

99 

100 

101class TaskDef: 

102 """TaskDef is a collection of information about task needed by Pipeline. 

103 

104 The information includes task name, configuration object and optional 

105 task class. This class is just a collection of attributes and it exposes 

106 all of them so that attributes could potentially be modified in place 

107 (e.g. if configuration needs extra overrides). 

108 

109 Attributes 

110 ---------- 

111 taskName : `str`, optional 

112 The fully-qualified `PipelineTask` class name. If not provided, 

113 ``taskClass`` must be. 

114 config : `lsst.pipe.base.config.PipelineTaskConfig`, optional 

115 Instance of the configuration class corresponding to this task class, 

116 usually with all overrides applied. This config will be frozen. If 

117 not provided, ``taskClass`` must be provided and 

118 ``taskClass.ConfigClass()`` will be used. 

119 taskClass : `type`, optional 

120 `PipelineTask` class object; if provided and ``taskName`` is as well, 

121 the caller guarantees that they are consistent. If not provided, 

122 ``taskName`` is used to import the type. 

123 label : `str`, optional 

124 Task label, usually a short string unique in a pipeline. If not 

125 provided, ``taskClass`` must be, and ``taskClass._DefaultName`` will 

126 be used. 

127 connections : `PipelineTaskConnections`, optional 

128 Object that describes the dataset types used by the task. If not 

129 provided, one will be constructed from the given configuration. If 

130 provided, it is assumed that ``config`` has already been validated 

131 and frozen. 
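
Examples
--------
A minimal sketch, assuming ``MyTask`` is a `PipelineTask` subclass
defined elsewhere (a hypothetical name):

>>> task_def = TaskDef(taskClass=MyTask, label="myTask")
>>> task_def.metadataDatasetName  # derived from the label template
'myTask_metadata'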

132 """ 

133 

134 def __init__( 

135 self, 

136 taskName: str | None = None, 

137 config: PipelineTaskConfig | None = None, 

138 taskClass: type[PipelineTask] | None = None, 

139 label: str | None = None, 

140 connections: PipelineTaskConnections | None = None, 

141 ): 

142 if taskName is None: 

143 if taskClass is None: 

144 raise ValueError("At least one of `taskName` and `taskClass` must be provided.") 

145 taskName = get_full_type_name(taskClass) 

146 elif taskClass is None: 

147 taskClass = doImportType(taskName) 

148 if config is None: 

149 if taskClass is None: 

150 raise ValueError("`taskClass` must be provided if `config` is not.") 

151 config = taskClass.ConfigClass() 

152 if label is None: 

153 if taskClass is None: 

154 raise ValueError("`taskClass` must be provided if `label` is not.") 

155 label = taskClass._DefaultName 

156 self.taskName = taskName 

157 if connections is None: 

158 # If we don't have connections yet, assume the config hasn't been 

159 # validated yet. 

160 try: 

161 config.validate() 

162 except Exception: 

163 _LOG.error("Configuration validation failed for task %s (%s)", label, taskName) 

164 raise 

165 config.freeze() 

166 connections = config.connections.ConnectionsClass(config=config) 

167 self.config = config 

168 self.taskClass = taskClass 

169 self.label = label 

170 self.connections = connections 

171 

172 @property 

173 def configDatasetName(self) -> str: 

174 """Name of a dataset type for configuration of this task (`str`)""" 

175 return acc.CONFIG_INIT_OUTPUT_TEMPLATE.format(label=self.label) 

176 

177 @property 

178 def metadataDatasetName(self) -> str: 

179 """Name of a dataset type for metadata of this task (`str`)""" 

180 return self.makeMetadataDatasetName(self.label) 

181 

182 @classmethod 

183 def makeMetadataDatasetName(cls, label: str) -> str: 

184 """Construct the name of the dataset type for metadata for a task. 

185 

186 Parameters 

187 ---------- 

188 label : `str` 

189 Label for the task within its pipeline. 

190 

191 Returns 

192 ------- 

193 name : `str` 

194 Name of the task's metadata dataset type. 

195 """ 

196 return acc.METADATA_OUTPUT_TEMPLATE.format(label=label) 

197 

198 @property 

199 def logOutputDatasetName(self) -> str | None: 

200 """Name of a dataset type for log output from this task, `None` if 

201 logs are not to be saved (`str`) 

202 """ 

203 if self.config.saveLogOutput: 

204 return acc.LOG_OUTPUT_TEMPLATE.format(label=self.label) 

205 else: 

206 return None 

207 

208 def __str__(self) -> str: 

209 rep = "TaskDef(" + self.taskName 

210 if self.label: 

211 rep += ", label=" + self.label 

212 rep += ")" 

213 return rep 

214 

215 def __eq__(self, other: object) -> bool: 

216 if not isinstance(other, TaskDef): 

217 return False 

218 # This does not consider equality of configs when determining equality 

219 # as config equality is a difficult thing to define. Should be updated 

220 # after DM-27847 

221 return self.taskClass == other.taskClass and self.label == other.label 

222 

223 def __hash__(self) -> int: 

224 return hash((self.taskClass, self.label)) 

225 

226 @classmethod 

227 def _unreduce(cls, taskName: str, config: PipelineTaskConfig, label: str) -> TaskDef: 

228 """Unpickle pickle. Custom callable for unpickling. 

229 

230 All arguments are forwarded directly to the constructor; this 

231 trampoline is only needed because ``__reduce__`` callables can't be 

232 called with keyword arguments. 

233 """ 

234 return cls(taskName=taskName, config=config, label=label) 

235 

236 def __reduce__(self) -> tuple[Callable[[str, PipelineTaskConfig, str], TaskDef], tuple[str, Config, str]]: 

237 return (self._unreduce, (self.taskName, self.config, self.label)) 

238 

239 

240class Pipeline: 

241 """A `Pipeline` is a representation of a series of tasks to run, and the 

242 configuration for those tasks. 

243 

244 Parameters 

245 ---------- 

246 description : `str` 

247 A description of what this pipeline does.
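
Examples
--------
A minimal sketch of building a pipeline programmatically; the task
class path ``mypkg.tasks.MyTask`` and the config field ``doWrite``
are hypothetical:

>>> pipeline = Pipeline("A one-task demonstration pipeline")
>>> pipeline.addTask("mypkg.tasks.MyTask", "myTask")
>>> pipeline.addConfigOverride("myTask", "doWrite", False)
>>> len(pipeline)
1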

248 """ 

249 

250 def __init__(self, description: str): 

251 pipeline_dict = {"description": description, "tasks": {}} 

252 self._pipelineIR = pipelineIR.PipelineIR(pipeline_dict) 

253 

254 @classmethod 

255 def fromFile(cls, filename: str) -> Pipeline: 

256 """Load a pipeline defined in a pipeline yaml file. 

257 

258 Parameters 

259 ---------- 

260 filename: `str` 

261 A path that points to a pipeline defined in yaml format. This 

262 filename may also supply additional labels to be used in 

263 subsetting the loaded Pipeline. These labels are separated from 

264 the path by a ``#``, and may be specified as a comma separated 

265 list, or a range denoted as beginning..end. Beginning or end may 

266 be empty, in which case the range will be a half open interval. 

267 Unlike python iteration bounds, end bounds are *INCLUDED*. Note 

268 that range based selection is not well defined for pipelines that 

269 are not linear in nature, and correct behavior is not guaranteed, 

270 or may vary from run to run. 

271 

272 Returns 

273 ------- 

274 pipeline: `Pipeline` 

275 The pipeline loaded from specified location with appropriate (if 

276 any) subsetting. 

277 

278 Notes 

279 ----- 

280 This method attempts to prune any contracts that contain labels which 

281 are not in the declared subset of labels. This pruning is done using a 

282 string based matching due to the nature of contracts and may prune more 

283 than it should. 

284 """ 

285 return cls.from_uri(filename) 

286 

287 @classmethod 

288 def from_uri(cls, uri: ResourcePathExpression) -> Pipeline: 

289 """Load a pipeline defined in a pipeline yaml file at a location 

290 specified by a URI. 

291 

292 Parameters 

293 ---------- 

294 uri : convertible to `~lsst.resources.ResourcePath` 

295 If a string is supplied this should be a URI path that points to a 

296 pipeline defined in yaml format, either as a direct path to the 

297 yaml file, or as a directory containing a ``pipeline.yaml`` file 

298 (the form used by `write_to_uri` with ``expand=True``). This URI may 

299 also supply additional labels to be used in subsetting the loaded 

300 `Pipeline`. These labels are separated from the path by a ``#``, 

301 and may be specified as a comma separated list, or a range denoted 

302 as beginning..end. Beginning or end may be empty, in which case the 

303 range will be a half open interval. Unlike python iteration bounds, 

304 end bounds are *INCLUDED*. Note that range based selection is not 

305 well defined for pipelines that are not linear in nature, and 

306 correct behavior is not guaranteed, or may vary from run to run. 

307 The same specifiers can be used with a 

308 `~lsst.resources.ResourcePath` object, by being the sole contents 

309 in the fragments attribute. 

310 

311 Returns 

312 ------- 

313 pipeline : `Pipeline` 

314 The pipeline loaded from specified location with appropriate (if 

315 any) subsetting. 

316 

317 Notes 

318 ----- 

319 This method attempts to prune any contracts that contain labels which 

320 are not in the declared subset of labels. This pruning is done using a 

321 string based matching due to the nature of contracts and may prune more 

322 than it should. 
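
Examples
--------
A minimal sketch, assuming a local file ``my_pipeline.yaml`` whose
tasks include the hypothetical labels ``isr`` and ``calibrate``:

>>> from lsst.pipe.base import Pipeline
>>> pipeline = Pipeline.from_uri("my_pipeline.yaml")
>>> subset = Pipeline.from_uri("my_pipeline.yaml#isr..calibrate")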

323 """ 

324 # Split up the uri and any labels that were supplied 

325 uri, label_specifier = cls._parse_file_specifier(uri) 

326 pipeline: Pipeline = cls.fromIR(pipelineIR.PipelineIR.from_uri(uri)) 

327 

328 # If there are labels supplied, only keep those 

329 if label_specifier is not None: 

330 pipeline = pipeline.subsetFromLabels(label_specifier) 

331 return pipeline 

332 

333 def subsetFromLabels(self, labelSpecifier: LabelSpecifier) -> Pipeline: 

334 """Subset a pipeline to contain only labels specified in labelSpecifier 

335 

336 Parameters 

337 ---------- 

338 labelSpecifier : `LabelSpecifier` 

339 Object containing labels that describes how to subset a pipeline. 

340 

341 Returns 

342 ------- 

343 pipeline : `Pipeline` 

344 A new pipeline object that is a subset of the old pipeline 

345 

346 Raises 

347 ------ 

348 ValueError 

349 Raised if there is an issue with specified labels 

350 

351 Notes 

352 ----- 

353 This method attempts to prune any contracts that contain labels which 

354 are not in the declared subset of labels. This pruning is done using a 

355 string based matching due to the nature of contracts and may prune more 

356 than it should. 
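
Examples
--------
For example, keeping only the tasks from ``isr`` through ``calibrate``
(hypothetical labels) in an already-loaded pipeline:

>>> subset = pipeline.subsetFromLabels(LabelSpecifier(begin="isr", end="calibrate"))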

357 """ 

358 # Labels supplied as a set 

359 if labelSpecifier.labels: 

360 labelSet = labelSpecifier.labels 

361 # Labels supplied as a range, first create a list of all the labels 

362 # in the pipeline sorted according to task dependency. Then only 

363 # keep labels that lie between the supplied bounds 

364 else: 

365 # Create a copy of the pipeline to use when assessing the label 

366 # ordering. Use a dict for fast searching while preserving order. 

367 # Remove contracts so they do not fail in the expansion step. This 

368 # is needed because a user may only configure the tasks they intend 

369 # to run, which may cause some contracts to fail if they will later 

370 # be dropped 

371 pipeline = copy.deepcopy(self) 

372 pipeline._pipelineIR.contracts = [] 

373 labels = {taskdef.label: True for taskdef in pipeline.toExpandedPipeline()} 

374 

375 # Verify the bounds are in the labels 

376 if labelSpecifier.begin is not None: 

377 if labelSpecifier.begin not in labels: 

378 raise ValueError( 

379 f"Beginning of range subset, {labelSpecifier.begin}, not found in pipeline definition" 

380 ) 

381 if labelSpecifier.end is not None: 

382 if labelSpecifier.end not in labels: 

383 raise ValueError( 

384 f"End of range subset, {labelSpecifier.end}, not found in pipeline definition" 

385 ) 

386 

387 labelSet = set() 

388 for label in labels: 

389 if labelSpecifier.begin is not None: 

390 if label != labelSpecifier.begin: 

391 continue 

392 else: 

393 labelSpecifier.begin = None 

394 labelSet.add(label) 

395 if labelSpecifier.end is not None and label == labelSpecifier.end: 

396 break 

397 return Pipeline.fromIR(self._pipelineIR.subset_from_labels(labelSet)) 

398 

399 @staticmethod 

400 def _parse_file_specifier(uri: ResourcePathExpression) -> tuple[ResourcePath, LabelSpecifier | None]: 

401 """Split appart a uri and any possible label subsets""" 

402 if isinstance(uri, str): 

403 # This is to support legacy pipelines during transition 

404 uri, num_replace = re.subn("[:](?!\\/\\/)", "#", uri) 

405 if num_replace: 

406 raise ValueError( 

407 f"The pipeline file {uri} seems to use the legacy :" 

408 " to separate labels, please use # instead." 

409 ) 

410 if uri.count("#") > 1: 

411 raise ValueError("Only one set of labels is allowed when specifying a pipeline to load") 

412 # Everything else can be converted directly to ResourcePath. 

413 uri = ResourcePath(uri) 

414 label_subset = uri.fragment or None 

415 

416 specifier: LabelSpecifier | None 

417 if label_subset is not None: 

418 label_subset = urllib.parse.unquote(label_subset) 

419 args: dict[str, set[str] | str | None] 

420 # labels supplied as a list 

421 if "," in label_subset: 

422 if ".." in label_subset: 

423 raise ValueError( 

424 "Can only specify a list of labels or a rangewhen loading a Pipline not both" 

425 ) 

426 args = {"labels": set(label_subset.split(","))} 

427 # labels supplied as a range 

428 elif ".." in label_subset: 

429 # Try to de-structure the labelSubset, this will fail if more 

430 # than one range is specified 

431 begin, end, *rest = label_subset.split("..") 

432 if rest: 

433 raise ValueError("Only one range can be specified when loading a pipeline") 

434 args = {"begin": begin if begin else None, "end": end if end else None} 

435 # Assume anything else is a single label 

436 else: 

437 args = {"labels": {label_subset}} 

438 

439 # MyPy doesn't like how cavalier kwarg construction is with types. 

440 specifier = LabelSpecifier(**args) # type: ignore 

441 else: 

442 specifier = None 

443 

444 return uri, specifier 

445 

446 @classmethod 

447 def fromString(cls, pipeline_string: str) -> Pipeline: 

448 """Create a pipeline from string formatted as a pipeline document. 

449 

450 Parameters 

451 ---------- 

452 pipeline_string : `str` 

453 A string formatted like a pipeline document.

454 

455 Returns 

456 ------- 

457 pipeline: `Pipeline` 
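
Examples
--------
A minimal sketch; the task class path ``mypkg.tasks.MyTask`` is
hypothetical, and the document is given in YAML flow style:

>>> pipeline = Pipeline.fromString(
...     "{description: A one-task pipeline, tasks: {myTask: {class: mypkg.tasks.MyTask}}}"
... )
>>> len(pipeline)
1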

458 """ 

459 pipeline = cls.fromIR(pipelineIR.PipelineIR.from_string(pipeline_string)) 

460 return pipeline 

461 

462 @classmethod 

463 def fromIR(cls, deserialized_pipeline: pipelineIR.PipelineIR) -> Pipeline: 

464 """Create a pipeline from an already created `PipelineIR` object. 

465 

466 Parameters 

467 ---------- 

468 deserialized_pipeline: `PipelineIR` 

469 An already created pipeline intermediate representation object 

470 

471 Returns 

472 ------- 

473 pipeline: `Pipeline` 

474 """ 

475 pipeline = cls.__new__(cls) 

476 pipeline._pipelineIR = deserialized_pipeline 

477 return pipeline 

478 

479 @classmethod 

480 def fromPipeline(cls, pipeline: Pipeline) -> Pipeline: 

481 """Create a new pipeline by copying an already existing `Pipeline`. 

482 

483 Parameters 

484 ---------- 

485 pipeline : `Pipeline` 

486 The existing `Pipeline` object to copy. 

487 

488 Returns 

489 ------- 

490 pipeline: `Pipeline` 

491 """ 

492 return cls.fromIR(copy.deepcopy(pipeline._pipelineIR)) 

493 

494 def __str__(self) -> str: 

495 return str(self._pipelineIR) 

496 

497 def mergePipeline(self, pipeline: Pipeline) -> None: 

498 """Merge another in-memory `Pipeline` object into this one. 

499 

500 This merges another pipeline into this object, as if it were declared 

501 in the import block of the yaml definition of this pipeline. This 

502 modifies this pipeline in place. 

503 

504 Parameters 

505 ---------- 

506 pipeline : `Pipeline` 

507 The `Pipeline` object that is to be merged into this object. 

508 """ 

509 self._pipelineIR.merge_pipelines((pipeline._pipelineIR,)) 

510 

511 def addLabelToSubset(self, subset: str, label: str) -> None: 

512 """Add a task label from the specified subset. 

513 

514 Parameters 

515 ---------- 

516 subset : `str` 

517 The labeled subset to modify 

518 label : `str` 

519 The task label to add to the specified subset. 

520 

521 Raises 

522 ------ 

523 ValueError 

524 Raised if the specified subset does not exist within the pipeline. 

525 Raised if the specified label does not exist within the pipeline. 

526 """ 

527 if label not in self._pipelineIR.tasks: 

528 raise ValueError(f"Label {label} does not appear within the pipeline") 

529 if subset not in self._pipelineIR.labeled_subsets: 

530 raise ValueError(f"Subset {subset} does not appear within the pipeline") 

531 self._pipelineIR.labeled_subsets[subset].subset.add(label) 

532 

533 def removeLabelFromSubset(self, subset: str, label: str) -> None: 

534 """Remove a task label from the specified subset. 

535 

536 Parameters 

537 ---------- 

538 subset : `str` 

539 The labeled subset to modify 

540 label : `str` 

541 The task label to remove from the specified subset. 

542 

543 Raises 

544 ------ 

545 ValueError 

546 Raised if the specified subset does not exist in the pipeline. 

547 Raised if the specified label does not exist within the specified 

548 subset. 

549 """ 

550 if subset not in self._pipelineIR.labeled_subsets: 

551 raise ValueError(f"Subset {subset} does not appear within the pipeline") 

552 if label not in self._pipelineIR.labeled_subsets[subset].subset: 

553 raise ValueError(f"Label {label} does not appear within the pipeline") 

554 self._pipelineIR.labeled_subsets[subset].subset.remove(label) 

555 

556 def findSubsetsWithLabel(self, label: str) -> set[str]: 

557 """Find any subsets which may contain the specified label. 

558 

559 This function returns the names of subsets which contain the specified 

560 label. It may return an empty set if there are no subsets, or no subsets 

561 containing the specified label. 

562 

563 Parameters 

564 ---------- 

565 label : `str` 

566 The task label to use in membership check 

567 

568 Returns 

569 ------- 

570 subsets : `set` of `str` 

571 Returns a set (possibly empty) of subset names which contain the 

572 specified label. 

573 

574 Raises 

575 ------ 

576 ValueError 

577 Raised if the specified label does not exist within this pipeline. 

578 """ 

579 results = set() 

580 if label not in self._pipelineIR.tasks: 

581 raise ValueError(f"Label {label} does not appear within the pipeline") 

582 for subset in self._pipelineIR.labeled_subsets.values(): 

583 if label in subset.subset: 

584 results.add(subset.label) 

585 return results 

586 

587 def addInstrument(self, instrument: Instrument | str) -> None: 

588 """Add an instrument to the pipeline, or replace an instrument that is 

589 already defined. 

590 

591 Parameters 

592 ---------- 

593 instrument : `~lsst.obs.base.Instrument` or `str` 

594 Either an instance of an `lsst.obs.base.Instrument` subclass or 

595 a string corresponding to a fully qualified 

596 `lsst.obs.base.Instrument` subclass name. 

597 """ 

598 if isinstance(instrument, str): 

599 pass 

600 else: 

601 # TODO: assume that this is a subclass of Instrument, no type 

602 # checking 

603 instrument = get_full_type_name(instrument) 

604 self._pipelineIR.instrument = instrument 

605 

606 def getInstrument(self) -> str | None: 

607 """Get the instrument from the pipeline. 

608 

609 Returns 

610 ------- 

611 instrument : `str` or `None` 

612 The fully qualified name of an `lsst.obs.base.Instrument` subclass, 

613 or `None` if the pipeline does not have an instrument. 

614 """ 

615 return self._pipelineIR.instrument 

616 

617 def get_data_id(self, universe: DimensionUniverse) -> DataCoordinate: 

618 """Return a data ID with all dimension constraints embedded in the 

619 pipeline. 

620 

621 Parameters 

622 ---------- 

623 universe : `lsst.daf.butler.DimensionUniverse` 

624 Object that defines all dimensions. 

625 

626 Returns 

627 ------- 

628 data_id : `lsst.daf.butler.DataCoordinate` 

629 Data ID with all dimension constraints embedded in the 

630 pipeline. 
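
Examples
--------
A minimal sketch using the default dimension universe; ``pipeline``
is assumed to be an already-constructed `Pipeline`:

>>> from lsst.daf.butler import DimensionUniverse
>>> data_id = pipeline.get_data_id(DimensionUniverse())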

631 """ 

632 instrument_class_name = self._pipelineIR.instrument 

633 if instrument_class_name is not None: 

634 instrument_class = cast(PipeBaseInstrument, doImportType(instrument_class_name)) 

635 if instrument_class is not None: 

636 return DataCoordinate.standardize(instrument=instrument_class.getName(), universe=universe) 

637 return DataCoordinate.make_empty(universe) 

638 

639 def addTask(self, task: type[PipelineTask] | str, label: str) -> None: 

640 """Add a new task to the pipeline, or replace a task that is already 

641 associated with the supplied label. 

642 

643 Parameters 

644 ---------- 

645 task : `type` [`PipelineTask`] or `str` 

646 Either a subclass of `PipelineTask` or a string containing the 

647 fully qualified name of such a class. 

648 label : `str` 

649 A label that is used to identify the `PipelineTask` being added. 

650 """ 

651 if isinstance(task, str): 

652 taskName = task 

653 elif issubclass(task, PipelineTask): 

654 taskName = get_full_type_name(task) 

655 else: 

656 raise ValueError( 

657 "task must be either a child class of PipelineTask or a string containing" 

658 " a fully qualified name to one" 

659 ) 

660 if not label: 

661 # in some cases (with command line-generated pipeline) tasks can 

662 # be defined without label which is not acceptable, use task 

663 # _DefaultName in that case 

664 if isinstance(task, str): 

665 task_class = cast(PipelineTask, doImportType(task)) 

666 label = task_class._DefaultName 

667 self._pipelineIR.tasks[label] = pipelineIR.TaskIR(label, taskName) 

668 

669 def removeTask(self, label: str) -> None: 

670 """Remove a task from the pipeline. 

671 

672 Parameters 

673 ---------- 

674 label : `str` 

675 The label used to identify the task that is to be removed 

676 

677 Raises 

678 ------ 

679 KeyError 

680 If no task with that label exists in the pipeline 

681 

682 """ 

683 self._pipelineIR.tasks.pop(label) 

684 

685 def addConfigOverride(self, label: str, key: str, value: object) -> None: 

686 """Apply single config override. 

687 

688 Parameters 

689 ---------- 

690 label : `str` 

691 Label of the task. 

692 key: `str` 

693 Fully-qualified field name. 

694 value : object 

695 Value to be given to a field. 

696 """ 

697 self._addConfigImpl(label, pipelineIR.ConfigIR(rest={key: value})) 

698 

699 def addConfigFile(self, label: str, filename: str) -> None: 

700 """Add overrides from a specified file. 

701 

702 Parameters 

703 ---------- 

704 label : `str` 

705 The label used to identify the task associated with config to 

706 modify. 

707 filename : `str` 

708 Path to the override file. 

709 """ 

710 self._addConfigImpl(label, pipelineIR.ConfigIR(file=[filename])) 

711 

712 def addConfigPython(self, label: str, pythonString: str) -> None: 

713 """Add Overrides by running a snippet of python code against a config. 

714 

715 Parameters 

716 ---------- 

717 label : `str` 

718 The label used to identify the task associated with config to 

719 modify. 

720 pythonString : `str` 

721 A string of valid Python code to be executed. This is done 

722 with ``config`` as the only accessible local value. 

723 """ 

724 self._addConfigImpl(label, pipelineIR.ConfigIR(python=pythonString)) 

725 

726 def _addConfigImpl(self, label: str, newConfig: pipelineIR.ConfigIR) -> None: 

727 if label == "parameters": 

728 self._pipelineIR.parameters.mapping.update(newConfig.rest) 

729 if newConfig.file: 

730 raise ValueError("Setting parameters section with config file is not supported") 

731 if newConfig.python: 

732 raise ValueError("Setting parameters section using python block in unsupported") 

733 return 

734 if label not in self._pipelineIR.tasks: 

735 raise LookupError(f"There are no tasks labeled '{label}' in the pipeline") 

736 self._pipelineIR.tasks[label].add_or_update_config(newConfig) 

737 

738 def write_to_uri(self, uri: ResourcePathExpression) -> None: 

739 """Write the pipeline to a file or directory. 

740 

741 Parameters 

742 ---------- 

743 uri : convertible to `~lsst.resources.ResourcePath` 

744 URI to write to; may have any scheme with 

745 `~lsst.resources.ResourcePath` write support or no scheme for a 

746 local file/directory. Should have a ``.yaml`` extension. 

747 """ 

748 self._pipelineIR.write_to_uri(uri) 

749 

750 def to_graph(self, registry: Registry | None = None) -> pipeline_graph.PipelineGraph: 

751 """Construct a pipeline graph from this pipeline. 

752 

753 Constructing a graph applies all configuration overrides, freezes all 

754 configuration, checks all contracts, and checks for dataset type 

755 consistency between tasks (as much as possible without access to a data 

756 repository). It cannot be reversed. 

757 

758 Parameters 

759 ---------- 

760 registry : `lsst.daf.butler.Registry`, optional 

761 Data repository client. If provided, the graph's dataset types 

762 and dimensions will be resolved (see `PipelineGraph.resolve`). 

763 

764 Returns 

765 ------- 

766 graph : `pipeline_graph.PipelineGraph` 

767 Representation of the pipeline as a graph. 
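
Examples
--------
A minimal sketch, assuming ``pipeline`` is an already-constructed
`Pipeline` and ``butler`` is an `lsst.daf.butler.Butler` (both are
hypothetical here):

>>> graph = pipeline.to_graph(registry=butler.registry)
>>> labels = list(graph.tasks)  # task labels, in sorted order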

768 """ 

769 instrument_class_name = self._pipelineIR.instrument 

770 data_id = {} 

771 if instrument_class_name is not None: 

772 instrument_class: type[Instrument] = doImportType(instrument_class_name) 

773 if instrument_class is not None: 

774 data_id["instrument"] = instrument_class.getName() 

775 graph = pipeline_graph.PipelineGraph(data_id=data_id) 

776 graph.description = self._pipelineIR.description 

777 for label in self._pipelineIR.tasks: 

778 self._add_task_to_graph(label, graph) 

779 if self._pipelineIR.contracts is not None: 

780 label_to_config = {x.label: x.config for x in graph.tasks.values()} 

781 for contract in self._pipelineIR.contracts: 

782 # Execute this on its own line so it can raise a good error 

783 # message if there were problems with the eval. 

784 success = eval(contract.contract, None, label_to_config) 

785 if not success: 

786 extra_info = f": {contract.msg}" if contract.msg is not None else "" 

787 raise pipelineIR.ContractError( 

788 f"Contract(s) '{contract.contract}' were not satisfied{extra_info}" 

789 ) 

790 for label, subset in self._pipelineIR.labeled_subsets.items(): 

791 graph.add_task_subset( 

792 label, subset.subset, subset.description if subset.description is not None else "" 

793 ) 

794 graph.sort() 

795 if registry is not None: 

796 graph.resolve(registry) 

797 return graph 

798 

799 def toExpandedPipeline(self) -> Generator[TaskDef, None, None]: 

800 r"""Return a generator of `TaskDef`\s which can be used to create 

801 quantum graphs. 

802 

803 Returns 

804 ------- 

805 generator : generator of `TaskDef` 

806 The generator returned will be the sorted iterator of tasks which 

807 are to be used in constructing a quantum graph. 

808 

809 Raises 

810 ------ 

811 NotImplementedError 

812 Raised if a dataId is supplied in a config block. This is in place 

813 for future use. 
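
Examples
--------
A minimal sketch, assuming ``pipeline`` is an already-constructed
`Pipeline` whose task classes are importable:

>>> labels = [task_def.label for task_def in pipeline.toExpandedPipeline()]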

814 """ 

815 yield from self.to_graph()._iter_task_defs() 

816 

817 def _add_task_to_graph(self, label: str, graph: pipeline_graph.PipelineGraph) -> None: 

818 """Add a single task from this pipeline to a pipeline graph that is 

819 under construction. 

820 

821 Parameters 

822 ---------- 

823 label : `str` 

824 Label for the task to be added. 

825 graph : `pipeline_graph.PipelineGraph` 

826 Graph to add the task to. 

827 """ 

828 if (taskIR := self._pipelineIR.tasks.get(label)) is None: 

829 raise NameError(f"Label {label} does not appear in this pipeline") 

830 taskClass: type[PipelineTask] = doImportType(taskIR.klass) 

831 config = taskClass.ConfigClass() 

832 instrument: PipeBaseInstrument | None = None 

833 if (instrumentName := self._pipelineIR.instrument) is not None: 

834 instrument_cls: type = doImportType(instrumentName) 

835 instrument = instrument_cls() 

836 config.applyConfigOverrides( 

837 instrument, 

838 getattr(taskClass, "_DefaultName", ""), 

839 taskIR.config, 

840 self._pipelineIR.parameters, 

841 label, 

842 ) 

843 graph.add_task(label, taskClass, config) 

844 

845 def __iter__(self) -> Generator[TaskDef, None, None]: 

846 return self.toExpandedPipeline() 

847 

848 def __getitem__(self, item: str) -> TaskDef: 

849 # Making a whole graph and then making a TaskDef from that is pretty 

850 # backwards, but I'm hoping to deprecate this method shortly in favor 

851 # of making the graph explicitly and working with its node objects. 

852 graph = pipeline_graph.PipelineGraph() 

853 self._add_task_to_graph(item, graph) 

854 (result,) = graph._iter_task_defs() 

855 return result 

856 

857 def __len__(self) -> int: 

858 return len(self._pipelineIR.tasks) 

859 

860 def __eq__(self, other: object) -> bool: 

861 if not isinstance(other, Pipeline): 

862 return False 

863 elif self._pipelineIR == other._pipelineIR: 

864 # Shortcut: if the IR is the same, the expanded pipeline must be 

865 # the same as well. But the converse is not true. 

866 return True 

867 else: 

868 self_expanded = {td.label: (td.taskClass,) for td in self} 

869 other_expanded = {td.label: (td.taskClass,) for td in other} 

870 if self_expanded != other_expanded: 

871 return False 

872 # After DM-27847, we should compare configuration here, or better, 

873 # delegated to TaskDef.__eq__ after making that compare configurations. 

874 raise NotImplementedError( 

875 "Pipelines cannot be compared because config instances cannot be compared; see DM-27847." 

876 ) 

877 

878 

879@dataclass(frozen=True) 

880class TaskDatasetTypes: 

881 """An immutable struct that extracts and classifies the dataset types used 

882 by a `PipelineTask`. 

883 """ 

884 

885 initInputs: NamedValueSet[DatasetType] 

886 """Dataset types that are needed as inputs in order to construct this Task. 

887 

888 Task-level `initInputs` may be classified as either 

889 `~PipelineDatasetTypes.initInputs` or 

890 `~PipelineDatasetTypes.initIntermediates` at the Pipeline level. 

891 """ 

892 

893 initOutputs: NamedValueSet[DatasetType] 

894 """Dataset types that may be written after constructing this Task. 

895 

896 Task-level `initOutputs` may be classified as either 

897 `~PipelineDatasetTypes.initOutputs` or 

898 `~PipelineDatasetTypes.initIntermediates` at the Pipeline level. 

899 """ 

900 

901 inputs: NamedValueSet[DatasetType] 

902 """Dataset types that are regular inputs to this Task. 

903 

904 If an input dataset needed for a Quantum cannot be found in the input 

905 collection(s) or produced by another Task in the Pipeline, that Quantum 

906 (and all dependent Quanta) will not be produced. 

907 

908 Task-level `inputs` may be classified as either 

909 `~PipelineDatasetTypes.inputs` or `~PipelineDatasetTypes.intermediates` 

910 at the Pipeline level. 

911 """ 

912 

913 queryConstraints: NamedValueSet[DatasetType] 

914 """Regular inputs that should not be used as constraints on the initial 

915 QuantumGraph generation data ID query, according to their tasks 

916 (`NamedValueSet`). 

917 """ 

918 

919 prerequisites: NamedValueSet[DatasetType] 

920 """Dataset types that are prerequisite inputs to this Task. 

921 

922 Prerequisite inputs must exist in the input collection(s) before the 

923 pipeline is run, but do not constrain the graph - if a prerequisite is 

924 missing for a Quantum, `PrerequisiteMissingError` is raised. 

925 

926 Prerequisite inputs are not resolved until the second stage of 

927 QuantumGraph generation. 

928 """ 

929 

930 outputs: NamedValueSet[DatasetType] 

931 """Dataset types that are produced by this Task. 

932 

933 Task-level `outputs` may be classified as either 

934 `~PipelineDatasetTypes.outputs` or `~PipelineDatasetTypes.intermediates` 

935 at the Pipeline level. 

936 """ 

937 

938 @classmethod 

939 def fromTaskDef( 

940 cls, 

941 taskDef: TaskDef, 

942 *, 

943 registry: Registry, 

944 include_configs: bool = True, 

945 storage_class_mapping: Mapping[str, str] | None = None, 

946 ) -> TaskDatasetTypes: 

947 """Extract and classify the dataset types from a single `PipelineTask`. 

948 

949 Parameters 

950 ---------- 

951 taskDef: `TaskDef` 

952 An instance of a `TaskDef` class for a particular `PipelineTask`. 

953 registry: `Registry` 

954 Registry used to construct normalized 

955 `~lsst.daf.butler.DatasetType` objects and retrieve those that are 

956 incomplete. 

957 include_configs : `bool`, optional 

958 If `True` (default) include config dataset types as 

959 ``initOutputs``. 

960 storage_class_mapping : `~collections.abc.Mapping` of `str` to \ 

961 `~lsst.daf.butler.StorageClass`, optional 

962 If a taskdef contains a component dataset type that is unknown 

963 to the registry, its parent `~lsst.daf.butler.StorageClass` will 

964 be looked up in this mapping if it is supplied. If the mapping does 

965 not contain the composite dataset type, or the mapping is not 

966 supplied, an exception will be raised. 

967 

968 Returns 

969 ------- 

970 types: `TaskDatasetTypes` 

971 The dataset types used by this task. 

972 

973 Raises 

974 ------ 

975 ValueError 

976 Raised if dataset type connection definition differs from 

977 registry definition. 

978 LookupError 

979 Raised if component parent StorageClass could not be determined 

980 and storage_class_mapping does not contain the composite type, or 

981 is set to None. 
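
Examples
--------
A minimal sketch, assuming an existing ``task_def`` and a butler
repository at the hypothetical path ``/path/to/repo``:

>>> from lsst.daf.butler import Butler
>>> registry = Butler("/path/to/repo").registry
>>> types = TaskDatasetTypes.fromTaskDef(task_def, registry=registry)
>>> output_names = {dataset_type.name for dataset_type in types.outputs}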

982 """ 

983 

984 def makeDatasetTypesSet( 

985 connectionType: str, 

986 is_input: bool, 

987 freeze: bool = True, 

988 ) -> NamedValueSet[DatasetType]: 

989 """Construct a set of true `~lsst.daf.butler.DatasetType` objects. 

990 

991 Parameters 

992 ---------- 

993 connectionType : `str` 

994 Name of the connection type to produce a set for; corresponds 

995 to an attribute of type `list` on the connection class instance. 

996 is_input : `bool` 

997 If `True`, these are input dataset types; otherwise they are 

998 output dataset types. 

999 freeze : `bool`, optional 

1000 If `True`, call `NamedValueSet.freeze` on the object returned. 

1001 

1002 Returns 

1003 ------- 

1004 datasetTypes : `NamedValueSet` 

1005 A set of all dataset types which correspond to the 

1006 connection type specified in the connection class of this 

1007 `PipelineTask`. 

1008 

1009 Raises 

1010 ------ 

1011 ValueError 

1012 Raised if dataset type connection definition differs from 

1013 registry definition. 

1014 LookupError 

1015 Raised if component parent StorageClass could not be determined 

1016 and storage_class_mapping does not contain the composite type, 

1017 or is set to None. 

1018 

1019 Notes 

1020 ----- 

1021 This function is a closure over the variables ``registry``, 

1022 ``taskDef``, and ``storage_class_mapping``. 

1023 """ 

1024 datasetTypes = NamedValueSet[DatasetType]() 

1025 for c in iterConnections(taskDef.connections, connectionType): 

1026 dimensions = set(getattr(c, "dimensions", set())) 

1027 if "skypix" in dimensions: 

1028 try: 

1029 datasetType = registry.getDatasetType(c.name) 

1030 except LookupError as err: 

1031 raise LookupError( 

1032 f"DatasetType '{c.name}' referenced by " 

1033 f"{type(taskDef.connections).__name__} uses 'skypix' as a dimension " 

1034 "placeholder, but does not already exist in the registry. " 

1035 "Note that reference catalog names are now used as the dataset " 

1036 "type name instead of 'ref_cat'." 

1037 ) from err 

1038 rest1 = set(registry.dimensions.conform(dimensions - {"skypix"}).names) 

1039 rest2 = datasetType.dimensions.names - datasetType.dimensions.skypix.names 

1040 if rest1 != rest2: 

1041 raise ValueError( 

1042 f"Non-skypix dimensions for dataset type {c.name} declared in " 

1043 f"connections ({rest1}) are inconsistent with those in " 

1044 f"registry's version of this dataset ({rest2})." 

1045 ) 

1046 else: 

1047 # Component dataset types are not explicitly in the 

1048 # registry. This complicates consistency checks with 

1049 # registry and requires we work out the composite storage 

1050 # class. 

1051 registryDatasetType = None 

1052 try: 

1053 registryDatasetType = registry.getDatasetType(c.name) 

1054 except KeyError: 

1055 compositeName, componentName = DatasetType.splitDatasetTypeName(c.name) 

1056 if componentName: 

1057 if storage_class_mapping is None or compositeName not in storage_class_mapping: 

1058 raise LookupError( 

1059 "Component parent class cannot be determined, and " 

1060 "composite name was not in storage class mapping, or no " 

1061 "storage_class_mapping was supplied" 

1062 ) from None 

1063 else: 

1064 parentStorageClass = storage_class_mapping[compositeName] 

1065 else: 

1066 parentStorageClass = None 

1067 datasetType = c.makeDatasetType( 

1068 registry.dimensions, parentStorageClass=parentStorageClass 

1069 ) 

1070 registryDatasetType = datasetType 

1071 else: 

1072 datasetType = c.makeDatasetType( 

1073 registry.dimensions, parentStorageClass=registryDatasetType.parentStorageClass 

1074 ) 

1075 

1076 if registryDatasetType and datasetType != registryDatasetType: 

1077 # The dataset types differ but first check to see if 

1078 # they are compatible before raising. 

1079 if is_input: 

1080 # This DatasetType must be compatible on get. 

1081 is_compatible = datasetType.is_compatible_with(registryDatasetType) 

1082 else: 

1083 # Has to be able to be converted to the expected 

1084 # type on put. 

1085 is_compatible = registryDatasetType.is_compatible_with(datasetType) 

1086 if is_compatible: 

1087 # For inputs we want the pipeline to use the 

1088 # pipeline definition, for outputs it should use 

1089 # the registry definition. 

1090 if not is_input: 

1091 datasetType = registryDatasetType 

1092 _LOG.debug( 

1093 "Dataset types differ (task %s != registry %s) but are compatible" 

1094 " for %s in %s.", 

1095 datasetType, 

1096 registryDatasetType, 

1097 "input" if is_input else "output", 

1098 taskDef.label, 

1099 ) 

1100 else: 

1101 try: 

1102 # Explicitly check for storage class just to 

1103 # make more specific message. 

1104 _ = datasetType.storageClass 

1105 except KeyError: 

1106 raise ValueError( 

1107 "Storage class does not exist for supplied dataset type " 

1108 f"{datasetType} for {taskDef.label}." 

1109 ) from None 

1110 raise ValueError( 

1111 f"Supplied dataset type ({datasetType}) inconsistent with " 

1112 f"registry definition ({registryDatasetType}) " 

1113 f"for {taskDef.label}." 

1114 ) 

1115 datasetTypes.add(datasetType) 

1116 if freeze: 

1117 datasetTypes.freeze() 

1118 return datasetTypes 

1119 

1120 # optionally add initOutput dataset for config 

1121 initOutputs = makeDatasetTypesSet("initOutputs", is_input=False, freeze=False) 

1122 if include_configs: 

1123 initOutputs.add( 

1124 DatasetType( 

1125 taskDef.configDatasetName, 

1126 registry.dimensions.empty, 

1127 storageClass=acc.CONFIG_INIT_OUTPUT_STORAGE_CLASS, 

1128 ) 

1129 ) 

1130 initOutputs.freeze() 

1131 

1132 # optionally add output dataset for metadata 

1133 outputs = makeDatasetTypesSet("outputs", is_input=False, freeze=False) 

1134 

1135 # Metadata is supposed to be of the TaskMetadata type; its dimensions 

1136 # correspond to a task quantum. 

1137 dimensions = registry.dimensions.conform(taskDef.connections.dimensions) 

1138 

1139 # Allow the storage class definition to be read from the existing 

1140 # dataset type definition if present. 

1141 try: 

1142 current = registry.getDatasetType(taskDef.metadataDatasetName) 

1143 except KeyError: 

1144 # No previous definition so use the default. 

1145 storageClass = acc.METADATA_OUTPUT_STORAGE_CLASS 

1146 else: 

1147 storageClass = current.storageClass.name 

1148 outputs.update({DatasetType(taskDef.metadataDatasetName, dimensions, storageClass)}) 

1149 

1150 if taskDef.logOutputDatasetName is not None: 

1151 # Log output dimensions correspond to a task quantum. 

1152 dimensions = registry.dimensions.conform(taskDef.connections.dimensions) 

1153 outputs.update( 

1154 { 

1155 DatasetType( 

1156 taskDef.logOutputDatasetName, 

1157 dimensions, 

1158 acc.LOG_OUTPUT_STORAGE_CLASS, 

1159 ) 

1160 } 

1161 ) 

1162 

1163 outputs.freeze() 

1164 

1165 inputs = makeDatasetTypesSet("inputs", is_input=True) 

1166 queryConstraints = NamedValueSet( 

1167 inputs[c.name] 

1168 for c in cast(Iterable[Input], iterConnections(taskDef.connections, "inputs")) 

1169 if not c.deferGraphConstraint 

1170 ) 

1171 

1172 return cls( 

1173 initInputs=makeDatasetTypesSet("initInputs", is_input=True), 

1174 initOutputs=initOutputs, 

1175 inputs=inputs, 

1176 queryConstraints=queryConstraints, 

1177 prerequisites=makeDatasetTypesSet("prerequisiteInputs", is_input=True), 

1178 outputs=outputs, 

1179 ) 

1180 

1181 

1182@dataclass(frozen=True) 

1183class PipelineDatasetTypes: 

1184 """An immutable struct that classifies the dataset types used in a 

1185 `Pipeline`. 

1186 """ 

1187 

1188 packagesDatasetName: ClassVar[str] = acc.PACKAGES_INIT_OUTPUT_NAME 

1189 """Name of a dataset type used to save package versions. 

1190 """ 

1191 

1192 initInputs: NamedValueSet[DatasetType] 

1193 """Dataset types that are needed as inputs in order to construct the Tasks 

1194 in this Pipeline. 

1195 

1196 This does not include dataset types that are produced when constructing 

1197 other Tasks in the Pipeline (these are classified as `initIntermediates`). 

1198 """ 

1199 

1200 initOutputs: NamedValueSet[DatasetType] 

1201 """Dataset types that may be written after constructing the Tasks in this 

1202 Pipeline. 

1203 

1204 This does not include dataset types that are also used as inputs when 

1205 constructing other Tasks in the Pipeline (these are classified as 

1206 `initIntermediates`). 

1207 """ 

1208 

1209 initIntermediates: NamedValueSet[DatasetType] 

1210 """Dataset types that are both used when constructing one or more Tasks 

1211 in the Pipeline and produced as a side-effect of constructing another 

1212 Task in the Pipeline. 

1213 """ 

1214 

1215 inputs: NamedValueSet[DatasetType] 

1216 """Dataset types that are regular inputs for the full pipeline. 

1217 

1218 If an input dataset needed for a Quantum cannot be found in the input 

1219 collection(s), that Quantum (and all dependent Quanta) will not be 

1220 produced. 

1221 """ 

1222 

1223 queryConstraints: NamedValueSet[DatasetType] 

1224 """Regular inputs that should be used as constraints on the initial 

1225 QuantumGraph generation data ID query, according to their tasks 

1226 (`NamedValueSet`). 

1227 """ 

1228 

1229 prerequisites: NamedValueSet[DatasetType] 

1230 """Dataset types that are prerequisite inputs for the full Pipeline. 

1231 

1232 Prerequisite inputs must exist in the input collection(s) before the 

1233 pipeline is run, but do not constrain the graph - if a prerequisite is 

1234 missing for a Quantum, `PrerequisiteMissingError` is raised. 

1235 

1236 Prerequisite inputs are not resolved until the second stage of 

1237 QuantumGraph generation. 

1238 """ 

1239 

1240 intermediates: NamedValueSet[DatasetType] 

1241 """Dataset types that are output by one Task in the Pipeline and consumed 

1242 as inputs by one or more other Tasks in the Pipeline. 

1243 """ 

1244 

1245 outputs: NamedValueSet[DatasetType] 

1246 """Dataset types that are output by a Task in the Pipeline and not consumed 

1247 by any other Task in the Pipeline. 

1248 """ 

1249 

1250 byTask: Mapping[str, TaskDatasetTypes] 

1251 """Per-Task dataset types, keyed by label in the `Pipeline`. 

1252 

1253 This is guaranteed to be zip-iterable with the `Pipeline` itself (assuming 

1254 neither has been modified since the dataset types were extracted, of 

1255 course). 

1256 """ 

1257 

1258 @classmethod 

1259 def fromPipeline( 

1260 cls, 

1261 pipeline: Pipeline | Iterable[TaskDef], 

1262 *, 

1263 registry: Registry, 

1264 include_configs: bool = True, 

1265 include_packages: bool = True, 

1266 ) -> PipelineDatasetTypes: 

1267 """Extract and classify the dataset types from all tasks in a 

1268 `Pipeline`. 

1269 

1270 Parameters 

1271 ---------- 

1272 pipeline: `Pipeline` or `~collections.abc.Iterable` [ `TaskDef` ] 

1273 A collection of tasks that can be run together. 

1274 registry: `Registry` 

1275 Registry used to construct normalized 

1276 `~lsst.daf.butler.DatasetType` objects and retrieve those that are 

1277 incomplete. 

1278 include_configs : `bool`, optional 

1279 If `True` (default) include config dataset types as 

1280 ``initOutputs``. 

1281 include_packages : `bool`, optional 

1282 If `True` (default) include the dataset type for software package 

1283 versions in ``initOutputs``. 

1284 

1285 Returns 

1286 ------- 

1287 types: `PipelineDatasetTypes` 

1288 The dataset types used by this `Pipeline`. 

1289 

1290 Raises 

1291 ------ 

1292 ValueError 

1293 Raised if Tasks are inconsistent about which datasets are marked 

1294 prerequisite. This indicates that the Tasks cannot be run as part 

1295 of the same `Pipeline`. 
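
Examples
--------
A minimal sketch, assuming ``pipeline`` is a `Pipeline` and ``butler``
is an `lsst.daf.butler.Butler` (both hypothetical here):

>>> dataset_types = PipelineDatasetTypes.fromPipeline(pipeline, registry=butler.registry)
>>> overall_input_names = dataset_types.inputs.names  # names of pure inputs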

1296 """ 

1297 allInputs = NamedValueSet[DatasetType]() 

1298 allOutputs = NamedValueSet[DatasetType]() 

1299 allInitInputs = NamedValueSet[DatasetType]() 

1300 allInitOutputs = NamedValueSet[DatasetType]() 

1301 prerequisites = NamedValueSet[DatasetType]() 

1302 queryConstraints = NamedValueSet[DatasetType]() 

1303 byTask = dict() 

1304 if include_packages: 

1305 allInitOutputs.add( 

1306 DatasetType( 

1307 cls.packagesDatasetName, 

1308 registry.dimensions.empty, 

1309 storageClass=acc.PACKAGES_INIT_OUTPUT_STORAGE_CLASS, 

1310 ) 

1311 ) 

1312 # create a list of TaskDefs in case the input is a generator 

1313 pipeline = list(pipeline) 

1314 

1315 # collect all the output dataset types 

1316 typeStorageclassMap: dict[str, str] = {} 

1317 for taskDef in pipeline: 

1318 for outConnection in iterConnections(taskDef.connections, "outputs"): 

1319 typeStorageclassMap[outConnection.name] = outConnection.storageClass 

1320 

1321 for taskDef in pipeline: 

1322 thisTask = TaskDatasetTypes.fromTaskDef( 

1323 taskDef, 

1324 registry=registry, 

1325 include_configs=include_configs, 

1326 storage_class_mapping=typeStorageclassMap, 

1327 ) 

1328 allInitInputs.update(thisTask.initInputs) 

1329 allInitOutputs.update(thisTask.initOutputs) 

1330 allInputs.update(thisTask.inputs) 

1331 # Inputs are query constraints if any task considers them a query 

1332 # constraint. 

1333 queryConstraints.update(thisTask.queryConstraints) 

1334 prerequisites.update(thisTask.prerequisites) 

1335 allOutputs.update(thisTask.outputs) 

1336 byTask[taskDef.label] = thisTask 

1337 if not prerequisites.isdisjoint(allInputs): 

1338 raise ValueError( 

1339 "{} marked as both prerequisites and regular inputs".format( 

1340 {dt.name for dt in allInputs & prerequisites} 

1341 ) 

1342 ) 

1343 if not prerequisites.isdisjoint(allOutputs): 

1344 raise ValueError( 

1345 "{} marked as both prerequisites and outputs".format( 

1346 {dt.name for dt in allOutputs & prerequisites} 

1347 ) 

1348 ) 

1349 # Make sure that components which are marked as inputs get treated as 

1350 # intermediates if there is an output which produces the composite 

1351 # containing the component 

1352 intermediateComponents = NamedValueSet[DatasetType]() 

1353 intermediateComposites = NamedValueSet[DatasetType]() 

1354 outputNameMapping = {dsType.name: dsType for dsType in allOutputs} 

1355 for dsType in allInputs: 

1356 # get the name of a possible component 

1357 name, component = dsType.nameAndComponent() 

1358 # if there is a component name, that means this is a component 

1359 # DatasetType, if there is an output which produces the parent of 

1360 # this component, treat this input as an intermediate 

1361 if component is not None: 

1362 # This needs to be in this if block, because someone might have 

1363 # a composite that is a pure input from existing data 

1364 if name in outputNameMapping: 

1365 intermediateComponents.add(dsType) 

1366 intermediateComposites.add(outputNameMapping[name]) 

1367 

1368 def checkConsistency(a: NamedValueSet, b: NamedValueSet) -> None: 

1369 common = a.names & b.names 

1370 for name in common: 

1371 # Any compatibility is allowed. This function does not know 

1372 # if a dataset type is to be used for input or output. 

1373 if not (a[name].is_compatible_with(b[name]) or b[name].is_compatible_with(a[name])): 

1374 raise ValueError(f"Conflicting definitions for dataset type: {a[name]} != {b[name]}.") 

1375 

1376 checkConsistency(allInitInputs, allInitOutputs) 

1377 checkConsistency(allInputs, allOutputs) 

1378 checkConsistency(allInputs, intermediateComposites) 

1379 checkConsistency(allOutputs, intermediateComposites) 

1380 

1381 def frozen(s: Set[DatasetType]) -> NamedValueSet[DatasetType]: 

1382 assert isinstance(s, NamedValueSet) 

1383 s.freeze() 

1384 return s 

1385 

1386 inputs = frozen(allInputs - allOutputs - intermediateComponents) 

1387 

1388 return cls( 

1389 initInputs=frozen(allInitInputs - allInitOutputs), 

1390 initIntermediates=frozen(allInitInputs & allInitOutputs), 

1391 initOutputs=frozen(allInitOutputs - allInitInputs), 

1392 inputs=inputs, 

1393 queryConstraints=frozen(queryConstraints & inputs), 

1394 # If there are storage class differences between inputs and outputs, 

1395 # a priority must be chosen for the intermediates. Here choose that 

1396 # inputs to tasks must match the requested storage class by 

1397 # applying the inputs over the top of the outputs. 

1398 intermediates=frozen(allOutputs & allInputs | intermediateComponents), 

1399 outputs=frozen(allOutputs - allInputs - intermediateComposites), 

1400 prerequisites=frozen(prerequisites), 

1401 byTask=MappingProxyType(byTask), # MappingProxyType -> frozen view of dict for immutability 

1402 ) 

1403 

1404 @classmethod 

1405 def initOutputNames( 

1406 cls, 

1407 pipeline: Pipeline | Iterable[TaskDef], 

1408 *, 

1409 include_configs: bool = True, 

1410 include_packages: bool = True, 

1411 ) -> Iterator[str]: 

1412 """Return the names of dataset types ot task initOutputs, Configs, 

1413 and package versions for a pipeline. 

1414 

1415 Parameters 

1416 ---------- 

1417 pipeline: `Pipeline` or `~collections.abc.Iterable` [ `TaskDef` ] 

1418 A `Pipeline` instance or collection of `TaskDef` instances. 

1419 include_configs : `bool`, optional 

1420 If `True` (default) include config dataset types. 

1421 include_packages : `bool`, optional 

1422 If `True` (default) include the dataset type for package versions. 

1423 

1424 Yields 

1425 ------ 

1426 datasetTypeName : `str` 

1427 Name of the dataset type. 

1428 """ 

1429 if include_packages: 

1430 # Package versions dataset type 

1431 yield cls.packagesDatasetName 

1432 

1433 if isinstance(pipeline, Pipeline): 

1434 pipeline = pipeline.toExpandedPipeline() 

1435 

1436 for taskDef in pipeline: 

1437 # all task InitOutputs 

1438 for name in taskDef.connections.initOutputs: 

1439 attribute = getattr(taskDef.connections, name) 

1440 yield attribute.name 

1441 

1442 # config dataset name 

1443 if include_configs: 

1444 yield taskDef.configDatasetName