Coverage for python/lsst/pipe/base/pipeline.py: 27%

465 statements  


1# This file is part of pipe_base. 

2# 

3# Developed for the LSST Data Management System. 

4# This product includes software developed by the LSST Project 

5# (http://www.lsst.org). 

6# See the COPYRIGHT file at the top-level directory of this distribution 

7# for details of code ownership. 

8# 

9# This software is dual licensed under the GNU General Public License and also 

10# under a 3-clause BSD license. Recipients may choose which of these licenses 

11# to use; please see the files gpl-3.0.txt and/or bsd_license.txt, 

12# respectively. If you choose the GPL option then the following text applies 

13# (but note that there is still no warranty even if you opt for BSD instead): 

14# 

15# This program is free software: you can redistribute it and/or modify 

16# it under the terms of the GNU General Public License as published by 

17# the Free Software Foundation, either version 3 of the License, or 

18# (at your option) any later version. 

19# 

20# This program is distributed in the hope that it will be useful, 

21# but WITHOUT ANY WARRANTY; without even the implied warranty of 

22# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the 

23# GNU General Public License for more details. 

24# 

25# You should have received a copy of the GNU General Public License 

26# along with this program. If not, see <http://www.gnu.org/licenses/>. 

27 

28"""Module defining Pipeline class and related methods. 

29""" 

30 

31from __future__ import annotations 

32 

33__all__ = ["Pipeline", "TaskDef", "TaskDatasetTypes", "PipelineDatasetTypes", "LabelSpecifier"] 

34 

35import copy 

36import logging 

37import re 

38import urllib.parse 

39 

40# ------------------------------- 

41# Imports of standard modules -- 

42# ------------------------------- 

43from collections.abc import Callable, Generator, Iterable, Iterator, Mapping, Set 

44from dataclasses import dataclass 

45from types import MappingProxyType 

46from typing import TYPE_CHECKING, ClassVar, cast 

47 

48# ----------------------------- 

49# Imports for other modules -- 

50# ----------------------------- 

51from deprecated.sphinx import deprecated 

52from lsst.daf.butler import DataCoordinate, DatasetType, DimensionUniverse, NamedValueSet, Registry 

53from lsst.resources import ResourcePath, ResourcePathExpression 

54from lsst.utils import doImportType 

55from lsst.utils.introspection import get_full_type_name 

56 

57from . import automatic_connection_constants as acc 

58from . import pipeline_graph, pipelineIR 

59from ._instrument import Instrument as PipeBaseInstrument 

60from .config import PipelineTaskConfig 

61from .connections import PipelineTaskConnections, iterConnections 

62from .connectionTypes import Input 

63from .pipelineTask import PipelineTask 

64 

65if TYPE_CHECKING: # Imports needed only for type annotations; may be circular. 

66 from lsst.obs.base import Instrument 

67 from lsst.pex.config import Config 

68 

69# ---------------------------------- 

70# Local non-exported definitions -- 

71# ---------------------------------- 

72 

73_LOG = logging.getLogger(__name__) 

74 

75# ------------------------ 

76# Exported definitions -- 

77# ------------------------ 

78 

79 

80@dataclass 

81class LabelSpecifier: 

82 """A structure to specify a subset of labels to load. 

83 

84 This structure may contain a set of labels to be used in subsetting a 

85 pipeline, or a beginning and end point. Beginning or end may be empty, in 

86 which case the range will be a half open interval. Unlike python iteration 

87 bounds, end bounds are *INCLUDED*. 

88 

89 There are multiple potential definitions of range-based slicing for graphs 

90 that are not a simple linear sequence. The definition used here is the 

91 intersection of the tasks downstream of ``begin`` and the tasks upstream of 

92 ``end``, i.e. tasks with no dependency relationship to a bounding task are 

93 not included. 

94 """ 

95 

96 labels: set[str] | None = None 

97 begin: str | None = None 

98 end: str | None = None 

99 

100 def __post_init__(self) -> None: 

101 if self.labels is not None and (self.begin or self.end): 

102 raise ValueError( 

103 "This struct can only be initialized with a labels set or a begin (and/or) end specifier" 

104 ) 

105 

106 
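# --- Editorial usage sketch (not part of the module source; the task labels
# are hypothetical). A LabelSpecifier holds either an explicit label set or an
# inclusive begin/end range, never both:
#
#     by_set = LabelSpecifier(labels={"isr", "characterizeImage", "calibrate"})
#     by_range = LabelSpecifier(begin="isr", end="calibrate")  # end is included
#     LabelSpecifier(labels={"isr"}, begin="isr")  # raises ValueError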

107class TaskDef: 

108 """TaskDef is a collection of information about a task needed by a Pipeline.

109 

110 The information includes task name, configuration object and optional 

111 task class. This class is just a collection of attributes and it exposes 

112 all of them so that attributes could potentially be modified in place 

113 (e.g. if configuration needs extra overrides). 

114 

115 Parameters 

116 ---------- 

117 taskName : `str`, optional 

118 The fully-qualified `PipelineTask` class name. If not provided, 

119 ``taskClass`` must be. 

120 config : `lsst.pipe.base.config.PipelineTaskConfig`, optional 

121 Instance of the configuration class corresponding to this task class, 

122 usually with all overrides applied. This config will be frozen. If 

123 not provided, ``taskClass`` must be provided and 

124 ``taskClass.ConfigClass()`` will be used. 

125 taskClass : `type`, optional 

126 `PipelineTask` class object; if provided and ``taskName`` is as well, 

127 the caller guarantees that they are consistent. If not provided, 

128 ``taskName`` is used to import the type. 

129 label : `str`, optional 

130 Task label, usually a short string unique in a pipeline. If not 

131 provided, ``taskClass`` must be, and ``taskClass._DefaultName`` will 

132 be used. 

133 connections : `PipelineTaskConnections`, optional 

134 Object that describes the dataset types used by the task. If not 

135 provided, one will be constructed from the given configuration. If 

136 provided, it is assumed that ``config`` has already been validated 

137 and frozen. 

138 """ 

139 

140 def __init__( 

141 self, 

142 taskName: str | None = None, 

143 config: PipelineTaskConfig | None = None, 

144 taskClass: type[PipelineTask] | None = None, 

145 label: str | None = None, 

146 connections: PipelineTaskConnections | None = None, 

147 ): 

148 if taskName is None: 

149 if taskClass is None: 

150 raise ValueError("At least one of `taskName` and `taskClass` must be provided.") 

151 taskName = get_full_type_name(taskClass) 

152 elif taskClass is None: 

153 taskClass = doImportType(taskName) 

154 if config is None: 

155 if taskClass is None: 

156 raise ValueError("`taskClass` must be provided if `config` is not.") 

157 config = taskClass.ConfigClass() 

158 if label is None: 

159 if taskClass is None: 

160 raise ValueError("`taskClass` must be provided if `label` is not.") 

161 label = taskClass._DefaultName 

162 self.taskName = taskName 

163 if connections is None: 

164 # If we don't have connections yet, assume the config hasn't been 

165 # validated yet. 

166 try: 

167 config.validate() 

168 except Exception: 

169 _LOG.error("Configuration validation failed for task %s (%s)", label, taskName) 

170 raise 

171 config.freeze() 

172 connections = config.connections.ConnectionsClass(config=config) 

173 self.config = config 

174 self.taskClass = taskClass 

175 self.label = label 

176 self.connections = connections 

177 

178 @property 

179 def configDatasetName(self) -> str: 

180 """Name of a dataset type for configuration of this task (`str`).""" 

181 return acc.CONFIG_INIT_OUTPUT_TEMPLATE.format(label=self.label) 

182 

183 @property 

184 def metadataDatasetName(self) -> str: 

185 """Name of a dataset type for metadata of this task (`str`).""" 

186 return self.makeMetadataDatasetName(self.label) 

187 

188 @classmethod 

189 def makeMetadataDatasetName(cls, label: str) -> str: 

190 """Construct the name of the dataset type for metadata for a task. 

191 

192 Parameters 

193 ---------- 

194 label : `str` 

195 Label for the task within its pipeline. 

196 

197 Returns 

198 ------- 

199 name : `str` 

200 Name of the task's metadata dataset type. 

201 """ 

202 return acc.METADATA_OUTPUT_TEMPLATE.format(label=label) 

203 

204 @property 

205 def logOutputDatasetName(self) -> str | None: 

206 """Name of a dataset type for log output from this task, `None` if 

207 logs are not to be saved (`str` or `None`).

208 """ 

209 if self.config.saveLogOutput: 

210 return acc.LOG_OUTPUT_TEMPLATE.format(label=self.label) 

211 else: 

212 return None 

213 

214 def __str__(self) -> str: 

215 rep = "TaskDef(" + self.taskName 

216 if self.label: 

217 rep += ", label=" + self.label 

218 rep += ")" 

219 return rep 

220 

221 def __eq__(self, other: object) -> bool: 

222 if not isinstance(other, TaskDef): 

223 return False 

224 # This does not consider equality of configs when determining equality 

225 # as config equality is a difficult thing to define. Should be updated 

226 # after DM-27847 

227 return self.taskClass == other.taskClass and self.label == other.label 

228 

229 def __hash__(self) -> int: 

230 return hash((self.taskClass, self.label)) 

231 

232 @classmethod 

233 def _unreduce(cls, taskName: str, config: PipelineTaskConfig, label: str) -> TaskDef: 

234 """Custom callable for unpickling.

235 

236 All arguments are forwarded directly to the constructor; this 

237 trampoline is only needed because ``__reduce__`` callables can't be 

238 called with keyword arguments. 

239 """ 

240 return cls(taskName=taskName, config=config, label=label) 

241 

242 def __reduce__(self) -> tuple[Callable[[str, PipelineTaskConfig, str], TaskDef], tuple[str, Config, str]]: 

243 return (self._unreduce, (self.taskName, self.config, self.label)) 

244 

245 
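# --- Editorial usage sketch (not part of the module source; the task class
# path is hypothetical). A TaskDef can be built from just a fully-qualified
# task name; the config, label, and connections are then derived from the
# imported class:
#
#     task_def = TaskDef(taskName="lsst.example.pkg.ExampleTask")
#     task_def.configDatasetName    # config init-output dataset type name
#     task_def.metadataDatasetName  # per-quantum metadata dataset type name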

246class Pipeline: 

247 """A `Pipeline` is a representation of a series of tasks to run, and the 

248 configuration for those tasks. 

249 

250 Parameters 

251 ---------- 

252 description : `str` 

253 A description of what this pipeline does.

254 """ 

255 

256 PipelineSubsetCtrl = pipelineIR.PipelineSubsetCtrl 

257 

258 def __init__(self, description: str): 

259 pipeline_dict = {"description": description, "tasks": {}} 

260 self._pipelineIR = pipelineIR.PipelineIR(pipeline_dict) 

261 

262 @classmethod 

263 def fromFile(cls, filename: str) -> Pipeline: 

264 """Load a pipeline defined in a pipeline yaml file. 

265 

266 Parameters 

267 ---------- 

268 filename : `str` 

269 A path that points to a pipeline defined in yaml format. This 

270 filename may also supply additional labels to be used in 

271 subsetting the loaded Pipeline. These labels are separated from 

272 the path by a ``#``, and may be specified as a comma separated 

273 list, or a range denoted as beginning..end. Beginning or end may 

274 be empty, in which case the range will be a half open interval. 

275 Unlike python iteration bounds, end bounds are *INCLUDED*. Note 

276 that range based selection is not well defined for pipelines that 

277 are not linear in nature, and correct behavior is not guaranteed, 

278 or may vary from run to run. 

279 

280 Returns 

281 ------- 

282 pipeline: `Pipeline` 

283 The pipeline loaded from specified location with appropriate (if 

284 any) subsetting. 

285 

286 Notes 

287 ----- 

288 This method attempts to prune any contracts that contain labels which 

289 are not in the declared subset of labels. This pruning is done using a 

290 string-based match due to the nature of contracts and may prune more

291 than it should. 

292 """ 

293 return cls.from_uri(filename) 

294 

295 @classmethod 

296 def from_uri(cls, uri: ResourcePathExpression) -> Pipeline: 

297 """Load a pipeline defined in a pipeline yaml file at a location 

298 specified by a URI. 

299 

300 Parameters 

301 ---------- 

302 uri : convertible to `~lsst.resources.ResourcePath` 

303 If a string is supplied this should be a URI path that points to a 

304 pipeline defined in yaml format, either as a direct path to the 

305 yaml file, or as a directory containing a ``pipeline.yaml`` file 

306 (the form used by `write_to_uri` with ``expand=True``). This URI may

307 also supply additional labels to be used in subsetting the loaded 

308 `Pipeline`. These labels are separated from the path by a ``#``, 

309 and may be specified as a comma separated list, or a range denoted 

310 as beginning..end. Beginning or end may be empty, in which case the 

311 range will be a half open interval. Unlike python iteration bounds, 

312 end bounds are *INCLUDED*. Note that range based selection is not 

313 well defined for pipelines that are not linear in nature, and 

314 correct behavior is not guaranteed, or may vary from run to run. 

315 The same specifiers can be used with a 

316 `~lsst.resources.ResourcePath` object, by being the sole contents 

317 in the fragments attribute. 

318 

319 Returns 

320 ------- 

321 pipeline : `Pipeline` 

322 The pipeline loaded from specified location with appropriate (if 

323 any) subsetting. 

324 

325 Notes 

326 ----- 

327 This method attempts to prune any contracts that contain labels which 

328 are not in the declared subset of labels. This pruning is done using a 

329 string-based match due to the nature of contracts and may prune more

330 than it should. 

331 """ 

332 # Split up the uri and any labels that were supplied 

333 uri, label_specifier = cls._parse_file_specifier(uri) 

334 pipeline: Pipeline = cls.fromIR(pipelineIR.PipelineIR.from_uri(uri)) 

335 

336 # If there are labels supplied, only keep those 

337 if label_specifier is not None: 

338 pipeline = pipeline.subsetFromLabels(label_specifier) 

339 return pipeline 

340 
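# --- Editorial usage sketch (not part of the module source; the file name and
# task labels are hypothetical). The URI fragment selects a subset of task
# labels, either as a comma-separated list or as an inclusive begin..end range:
#
#     full = Pipeline.from_uri("my_pipeline.yaml")
#     two_tasks = Pipeline.from_uri("my_pipeline.yaml#isr,calibrate")
#     up_to = Pipeline.from_uri("my_pipeline.yaml#..calibrate")  # end included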

341 def subsetFromLabels( 

342 self, 

343 labelSpecifier: LabelSpecifier, 

344 subsetCtrl: pipelineIR.PipelineSubsetCtrl = PipelineSubsetCtrl.DROP, 

345 ) -> Pipeline: 

346 """Subset a pipeline to contain only labels specified in 

347 ``labelSpecifier``. 

348 

349 Parameters 

350 ---------- 

351 labelSpecifier : `LabelSpecifier`

352 Object containing labels that describes how to subset a pipeline. 

353 subsetCtrl : `PipelineSubsetCtrl` 

354 Control object which decides how subsets with missing labels are 

355 handled. Setting to `PipelineSubsetCtrl.DROP` (the default) will 

356 cause any subsets that have labels which are not in the set of all 

357 task labels to be dropped. Setting to `PipelineSubsetCtrl.EDIT` 

358 will cause the subset to instead be edited to remove the 

359 nonexistent label. 

360 

361 Returns 

362 ------- 

363 pipeline : `Pipeline` 

364 A new pipeline object that is a subset of the old pipeline. 

365 

366 Raises 

367 ------ 

368 ValueError 

369 Raised if there is an issue with the specified labels.

370 

371 Notes 

372 ----- 

373 This method attempts to prune any contracts that contain labels which 

374 are not in the declared subset of labels. This pruning is done using a 

375 string-based match due to the nature of contracts and may prune more

376 than it should. 

377 """ 

378 # Labels supplied as a set 

379 if labelSpecifier.labels: 

380 labelSet = labelSpecifier.labels 

381 # Labels supplied as a range, first create a list of all the labels 

382 # in the pipeline sorted according to task dependency. Then only 

383 # keep labels that lie between the supplied bounds 

384 else: 

385 # Create a copy of the pipeline to use when assessing the label 

386 # ordering. Use a dict for fast searching while preserving order. 

387 # Remove contracts so they do not fail in the expansion step. This 

388 # is needed because a user may only configure the tasks they intend 

389 # to run, which may cause some contracts to fail if they will later 

390 # be dropped 

391 pipeline = copy.deepcopy(self) 

392 pipeline._pipelineIR.contracts = [] 

393 graph = pipeline.to_graph() 

394 

395 # Verify the bounds are in the labels 

396 if labelSpecifier.begin is not None and labelSpecifier.begin not in graph.tasks: 

397 raise ValueError( 

398 f"Beginning of range subset, {labelSpecifier.begin}, not found in pipeline definition" 

399 ) 

400 if labelSpecifier.end is not None and labelSpecifier.end not in graph.tasks: 

401 raise ValueError( 

402 f"End of range subset, {labelSpecifier.end}, not found in pipeline definition" 

403 ) 

404 

405 labelSet = set(graph.tasks.between(labelSpecifier.begin, labelSpecifier.end)) 

406 return Pipeline.fromIR(self._pipelineIR.subset_from_labels(labelSet)) 

407 
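# --- Editorial usage sketch (not part of the module source; task labels are
# hypothetical). Subsetting by range keeps only tasks that are downstream of
# ``begin`` and upstream of ``end``; labeled subsets that reference missing
# labels are dropped or edited according to ``subsetCtrl``:
#
#     spec = LabelSpecifier(begin="isr", end="calibrate")
#     shorter = pipeline.subsetFromLabels(spec, Pipeline.PipelineSubsetCtrl.EDIT)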

408 @staticmethod 

409 def _parse_file_specifier(uri: ResourcePathExpression) -> tuple[ResourcePath, LabelSpecifier | None]: 

410 """Split apart a URI and any possible label subsets."""

411 if isinstance(uri, str): 

412 # This is to support legacy pipelines during transition 

413 uri, num_replace = re.subn("[:](?!\\/\\/)", "#", uri) 

414 if num_replace: 

415 raise ValueError( 

416 f"The pipeline file {uri} seems to use the legacy :" 

417 " to separate labels, please use # instead." 

418 ) 

419 if uri.count("#") > 1: 

420 raise ValueError("Only one set of labels is allowed when specifying a pipeline to load") 

421 # Everything else can be converted directly to ResourcePath. 

422 uri = ResourcePath(uri) 

423 label_subset = uri.fragment or None 

424 

425 specifier: LabelSpecifier | None 

426 if label_subset is not None: 

427 label_subset = urllib.parse.unquote(label_subset) 

428 args: dict[str, set[str] | str | None] 

429 # labels supplied as a list 

430 if "," in label_subset: 

431 if ".." in label_subset: 

432 raise ValueError( 

433 "Can only specify a list of labels or a range when loading a Pipeline, not both"

434 ) 

435 args = {"labels": set(label_subset.split(","))} 

436 # labels supplied as a range 

437 elif ".." in label_subset: 

438 # Try to de-structure the labelSubset, this will fail if more 

439 # than one range is specified 

440 begin, end, *rest = label_subset.split("..") 

441 if rest: 

442 raise ValueError("Only one range can be specified when loading a pipeline") 

443 args = {"begin": begin if begin else None, "end": end if end else None} 

444 # Assume anything else is a single label 

445 else: 

446 args = {"labels": {label_subset}} 

447 

448 # MyPy doesn't like how cavalier kwarg construction is with types. 

449 specifier = LabelSpecifier(**args) # type: ignore 

450 else: 

451 specifier = None 

452 

453 return uri, specifier 

454 

455 @classmethod 

456 def fromString(cls, pipeline_string: str) -> Pipeline: 

457 """Create a pipeline from string formatted as a pipeline document. 

458 

459 Parameters 

460 ---------- 

461 pipeline_string : `str` 

462 A string formatted like a pipeline document.

463 

464 Returns 

465 ------- 

466 pipeline: `Pipeline` 

467 The new pipeline. 

468 """ 

469 pipeline = cls.fromIR(pipelineIR.PipelineIR.from_string(pipeline_string)) 

470 return pipeline 

471 
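# --- Editorial usage sketch (not part of the module source; the task class
# path is hypothetical, and the YAML assumes the short label-to-class form of
# the pipeline document format):
#
#     pipeline = Pipeline.fromString(
#         "description: A tiny example pipeline\n"
#         "tasks:\n"
#         "  exampleTask: lsst.example.pkg.ExampleTask\n"
#     )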

472 @classmethod 

473 def fromIR(cls, deserialized_pipeline: pipelineIR.PipelineIR) -> Pipeline: 

474 """Create a pipeline from an already created `PipelineIR` object. 

475 

476 Parameters 

477 ---------- 

478 deserialized_pipeline : `PipelineIR` 

479 An already created pipeline intermediate representation object. 

480 

481 Returns 

482 ------- 

483 pipeline: `Pipeline` 

484 The new pipeline. 

485 """ 

486 pipeline = cls.__new__(cls) 

487 pipeline._pipelineIR = deserialized_pipeline 

488 return pipeline 

489 

490 @classmethod 

491 def fromPipeline(cls, pipeline: Pipeline) -> Pipeline: 

492 """Create a new pipeline by copying an already existing `Pipeline`. 

493 

494 Parameters 

495 ---------- 

496 pipeline : `Pipeline` 

497 An already created `Pipeline` object to copy.

498 

499 Returns 

500 ------- 

501 pipeline: `Pipeline` 

502 The new pipeline. 

503 """ 

504 return cls.fromIR(copy.deepcopy(pipeline._pipelineIR)) 

505 

506 def __str__(self) -> str: 

507 return str(self._pipelineIR) 

508 

509 def mergePipeline(self, pipeline: Pipeline) -> None: 

510 """Merge another in-memory `Pipeline` object into this one. 

511 

512 This merges another pipeline into this object, as if it were declared 

513 in the import block of the yaml definition of this pipeline. This 

514 modifies this pipeline in place. 

515 

516 Parameters 

517 ---------- 

518 pipeline : `Pipeline` 

519 The `Pipeline` object that is to be merged into this object. 

520 """ 

521 self._pipelineIR.merge_pipelines((pipeline._pipelineIR,)) 

522 

523 def addLabelToSubset(self, subset: str, label: str) -> None: 

524 """Add a task label to the specified subset.

525 

526 Parameters 

527 ---------- 

528 subset : `str` 

529 The labeled subset to modify. 

530 label : `str` 

531 The task label to add to the specified subset. 

532 

533 Raises 

534 ------ 

535 ValueError 

536 Raised if the specified subset does not exist within the pipeline. 

537 Raised if the specified label does not exist within the pipeline. 

538 """ 

539 if label not in self._pipelineIR.tasks: 

540 raise ValueError(f"Label {label} does not appear within the pipeline") 

541 if subset not in self._pipelineIR.labeled_subsets: 

542 raise ValueError(f"Subset {subset} does not appear within the pipeline") 

543 self._pipelineIR.labeled_subsets[subset].subset.add(label) 

544 

545 def removeLabelFromSubset(self, subset: str, label: str) -> None: 

546 """Remove a task label from the specified subset. 

547 

548 Parameters 

549 ---------- 

550 subset : `str` 

551 The labeled subset to modify. 

552 label : `str` 

553 The task label to remove from the specified subset. 

554 

555 Raises 

556 ------ 

557 ValueError 

558 Raised if the specified subset does not exist in the pipeline. 

559 Raised if the specified label does not exist within the specified 

560 subset. 

561 """ 

562 if subset not in self._pipelineIR.labeled_subsets: 

563 raise ValueError(f"Subset {subset} does not appear within the pipeline") 

564 if label not in self._pipelineIR.labeled_subsets[subset].subset: 

565 raise ValueError(f"Label {label} does not appear within the subset {subset}")

566 self._pipelineIR.labeled_subsets[subset].subset.remove(label) 

567 

568 def findSubsetsWithLabel(self, label: str) -> set[str]: 

569 """Find any subsets which may contain the specified label. 

570 

571 This function returns the names of subsets which contain the specified

572 label. May return an empty set if there are no subsets, or no subsets 

573 containing the specified label. 

574 

575 Parameters 

576 ---------- 

577 label : `str` 

578 The task label to use in membership check. 

579 

580 Returns 

581 ------- 

582 subsets : `set` of `str` 

583 Returns a set (possibly empty) of subset names which contain the

584 specified label. 

585 

586 Raises 

587 ------ 

588 ValueError 

589 Raised if the specified label does not exist within this pipeline. 

590 """ 

591 results = set() 

592 if label not in self._pipelineIR.tasks: 

593 raise ValueError(f"Label {label} does not appear within the pipeline") 

594 for subset in self._pipelineIR.labeled_subsets.values(): 

595 if label in subset.subset: 

596 results.add(subset.label) 

597 return results 

598 

599 @property 

600 def task_labels(self) -> Set[str]: 

601 """Labels of all tasks in the pipeline.

602 

603 For simple pipelines with no imports, iteration over this set will 

604 match the order in which tasks are defined in the pipeline file. In 

605 all other cases the order is unspecified but deterministic. It is not 

606 dependency-ordered (use ``to_graph().tasks.keys()`` for that). 

607 """ 

608 return self._pipelineIR.tasks.keys() 

609 

610 @property 

611 def subsets(self) -> MappingProxyType[str, set]: 

612 """Returns a `MappingProxyType` where the keys are the labels of 

613 labeled subsets in the `Pipeline` and the values are the set of task 

614 labels contained within that subset. 

615 """ 

616 return MappingProxyType( 

617 {label: subsetIr.subset for label, subsetIr in self._pipelineIR.labeled_subsets.items()} 

618 ) 

619 

620 def addLabeledSubset(self, label: str, description: str, taskLabels: set[str]) -> None: 

621 """Add a new labeled subset to the `Pipeline`. 

622 

623 Parameters 

624 ---------- 

625 label : `str` 

626 The label to assign to the subset. 

627 description : `str` 

628 A description of what the subset is for. 

629 taskLabels : `set` [`str`] 

630 The set of task labels to be associated with the labeled subset. 

631 

632 Raises 

633 ------ 

634 ValueError 

635 Raised if label already exists in the `Pipeline`. 

636 Raised if a task label is not found within the `Pipeline`. 

637 """ 

638 if label in self._pipelineIR.labeled_subsets.keys(): 

639 raise ValueError(f"Subset label {label} is already found within the Pipeline") 

640 if extra := (taskLabels - self._pipelineIR.tasks.keys()): 

641 raise ValueError(f"Task labels {extra} were not found within the Pipeline") 

642 self._pipelineIR.labeled_subsets[label] = pipelineIR.LabeledSubset(label, taskLabels, description) 

643 

644 def removeLabeledSubset(self, label: str) -> None: 

645 """Remove a labeled subset from the `Pipeline`. 

646 

647 Parameters 

648 ---------- 

649 label : `str` 

650 The label of the subset to remove from the `Pipeline`. 

651 

652 Raises 

653 ------ 

654 ValueError 

655 Raised if the label is not found within the `Pipeline`. 

656 """ 

657 if label not in self._pipelineIR.labeled_subsets.keys(): 

658 raise ValueError(f"Subset label {label} was not found in the pipeline") 

659 self._pipelineIR.labeled_subsets.pop(label) 

660 

661 def addInstrument(self, instrument: Instrument | str) -> None: 

662 """Add an instrument to the pipeline, or replace an instrument that is 

663 already defined. 

664 

665 Parameters 

666 ---------- 

667 instrument : `~lsst.obs.base.Instrument` or `str`

668 Either an instance of a subclass of `lsst.obs.base.Instrument` or

669 a string corresponding to a fully qualified

670 `lsst.obs.base.Instrument` subclass name.

671 """ 

672 if isinstance(instrument, str): 

673 pass 

674 else: 

675 # TODO: assume that this is a subclass of Instrument, no type 

676 # checking 

677 instrument = get_full_type_name(instrument) 

678 self._pipelineIR.instrument = instrument 

679 

680 def getInstrument(self) -> str | None: 

681 """Get the instrument from the pipeline. 

682 

683 Returns 

684 ------- 

685 instrument : `str` or `None`

686 The fully qualified name of a `lsst.obs.base.Instrument` subclass,

687 or `None` if the pipeline does not have an instrument.

688 """ 

689 return self._pipelineIR.instrument 

690 

691 def get_data_id(self, universe: DimensionUniverse) -> DataCoordinate: 

692 """Return a data ID with all dimension constraints embedded in the 

693 pipeline. 

694 

695 Parameters 

696 ---------- 

697 universe : `lsst.daf.butler.DimensionUniverse` 

698 Object that defines all dimensions. 

699 

700 Returns 

701 ------- 

702 data_id : `lsst.daf.butler.DataCoordinate` 

703 Data ID with all dimension constraints embedded in the 

704 pipeline. 

705 """ 

706 instrument_class_name = self._pipelineIR.instrument 

707 if instrument_class_name is not None: 

708 instrument_class = cast(PipeBaseInstrument, doImportType(instrument_class_name)) 

709 if instrument_class is not None: 

710 return DataCoordinate.standardize(instrument=instrument_class.getName(), universe=universe) 

711 return DataCoordinate.make_empty(universe) 

712 

713 def addTask(self, task: type[PipelineTask] | str, label: str) -> None: 

714 """Add a new task to the pipeline, or replace a task that is already 

715 associated with the supplied label. 

716 

717 Parameters 

718 ---------- 

719 task : `PipelineTask` or `str` 

720 Either a derived class object of a `PipelineTask` or a string 

721 corresponding to a fully qualified `PipelineTask` name. 

722 label : `str` 

723 A label that is used to identify the `PipelineTask` being added. 

724 """ 

725 if isinstance(task, str): 

726 taskName = task 

727 elif issubclass(task, PipelineTask): 

728 taskName = get_full_type_name(task) 

729 else: 

730 raise ValueError( 

731 "task must be either a child class of PipelineTask or a string containing" 

732 " a fully qualified name to one" 

733 ) 

734 if not label: 

735 # In some cases (with command line-generated pipelines) tasks can

736 # be defined without a label, which is not acceptable; use the task's

737 # _DefaultName in that case.

738 if isinstance(task, str): 

739 task_class = cast(PipelineTask, doImportType(task)) 

740 label = task_class._DefaultName 

741 self._pipelineIR.tasks[label] = pipelineIR.TaskIR(label, taskName) 

742 

743 def removeTask(self, label: str) -> None: 

744 """Remove a task from the pipeline. 

745 

746 Parameters 

747 ---------- 

748 label : `str` 

749 The label used to identify the task that is to be removed. 

750 

751 Raises 

752 ------ 

753 KeyError 

754 If no task with that label exists in the pipeline. 

755 """ 

756 self._pipelineIR.tasks.pop(label) 

757 

758 def addConfigOverride(self, label: str, key: str, value: object) -> None: 

759 """Apply a single config override.

760 

761 Parameters 

762 ---------- 

763 label : `str` 

764 Label of the task. 

765 key : `str` 

766 Fully-qualified field name. 

767 value : object 

768 Value to be given to a field. 

769 """ 

770 self._addConfigImpl(label, pipelineIR.ConfigIR(rest={key: value})) 

771 

772 def addConfigFile(self, label: str, filename: str) -> None: 

773 """Add overrides from a specified file. 

774 

775 Parameters 

776 ---------- 

777 label : `str` 

778 The label used to identify the task associated with config to 

779 modify. 

780 filename : `str` 

781 Path to the override file. 

782 """ 

783 self._addConfigImpl(label, pipelineIR.ConfigIR(file=[filename])) 

784 

785 def addConfigPython(self, label: str, pythonString: str) -> None: 

786 """Add overrides by running a snippet of Python code against a config.

787 

788 Parameters 

789 ---------- 

790 label : `str` 

791 The label used to identify the task associated with config to

792 modify. 

793 pythonString : `str` 

794 A string which is valid python code to be executed. This is done 

795 with config as the only local accessible value. 

796 """ 

797 self._addConfigImpl(label, pipelineIR.ConfigIR(python=pythonString)) 

798 

799 def _addConfigImpl(self, label: str, newConfig: pipelineIR.ConfigIR) -> None: 

800 if label == "parameters": 

801 self._pipelineIR.parameters.mapping.update(newConfig.rest) 

802 if newConfig.file: 

803 raise ValueError("Setting parameters section with config file is not supported") 

804 if newConfig.python: 

805 raise ValueError("Setting parameters section using python block is unsupported")

806 return 

807 if label not in self._pipelineIR.tasks: 

808 raise LookupError(f"There are no tasks labeled '{label}' in the pipeline") 

809 self._pipelineIR.tasks[label].add_or_update_config(newConfig) 

810 
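# --- Editorial usage sketch (not part of the module source; class paths,
# labels, config fields, and file names are hypothetical). A pipeline can be
# assembled and configured entirely in memory, then written out:
#
#     pipeline = Pipeline("Example pipeline")
#     pipeline.addTask("lsst.example.pkg.ExampleTask", "exampleTask")
#     pipeline.addConfigOverride("exampleTask", "someField", 42)
#     pipeline.addConfigFile("exampleTask", "overrides.py")
#     pipeline.addLabeledSubset("step1", "First step", {"exampleTask"})
#     pipeline.write_to_uri("example_pipeline.yaml")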

811 def write_to_uri(self, uri: ResourcePathExpression) -> None: 

812 """Write the pipeline to a file or directory. 

813 

814 Parameters 

815 ---------- 

816 uri : convertible to `~lsst.resources.ResourcePath` 

817 URI to write to; may have any scheme with 

818 `~lsst.resources.ResourcePath` write support or no scheme for a 

819 local file/directory. Should have a ``.yaml`` extension. 

820 """ 

821 self._pipelineIR.write_to_uri(uri) 

822 

823 def to_graph(self, registry: Registry | None = None) -> pipeline_graph.PipelineGraph: 

824 """Construct a pipeline graph from this pipeline. 

825 

826 Constructing a graph applies all configuration overrides, freezes all 

827 configuration, checks all contracts, and checks for dataset type 

828 consistency between tasks (as much as possible without access to a data 

829 repository). It cannot be reversed. 

830 

831 Parameters 

832 ---------- 

833 registry : `lsst.daf.butler.Registry`, optional 

834 Data repository client. If provided, the graph's dataset types 

835 and dimensions will be resolved (see `PipelineGraph.resolve`). 

836 

837 Returns 

838 ------- 

839 graph : `pipeline_graph.PipelineGraph` 

840 Representation of the pipeline as a graph. 

841 """ 

842 instrument_class_name = self._pipelineIR.instrument 

843 data_id = {} 

844 if instrument_class_name is not None: 

845 instrument_class: type[Instrument] = doImportType(instrument_class_name) 

846 if instrument_class is not None: 

847 data_id["instrument"] = instrument_class.getName() 

848 graph = pipeline_graph.PipelineGraph(data_id=data_id) 

849 graph.description = self._pipelineIR.description 

850 for label in self._pipelineIR.tasks: 

851 self._add_task_to_graph(label, graph) 

852 if self._pipelineIR.contracts is not None: 

853 label_to_config = {x.label: x.config for x in graph.tasks.values()} 

854 for contract in self._pipelineIR.contracts: 

855 # execute this in its own line so it can raise a good error 

856 # message if there were problems with the eval

857 success = eval(contract.contract, None, label_to_config) 

858 if not success: 

859 extra_info = f": {contract.msg}" if contract.msg is not None else "" 

860 raise pipelineIR.ContractError( 

861 f"Contract(s) '{contract.contract}' were not satisfied{extra_info}" 

862 ) 

863 for label, subset in self._pipelineIR.labeled_subsets.items(): 

864 graph.add_task_subset( 

865 label, subset.subset, subset.description if subset.description is not None else "" 

866 ) 

867 graph.sort() 

868 if registry is not None: 

869 graph.resolve(registry) 

870 return graph 

871 
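# --- Editorial usage sketch (not part of the module source; ``butler`` is a
# hypothetical `lsst.daf.butler.Butler` instance). Building the graph applies
# and freezes configuration and checks contracts; passing a registry
# additionally resolves dataset types and dimensions:
#
#     graph = pipeline.to_graph()                       # unresolved graph
#     graph = pipeline.to_graph(registry=butler.registry)  # resolved graph
#     for task_node in graph.tasks.values():
#         print(task_node.label, type(task_node.config).__name__)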

872 # TODO: remove on DM-40443. 

873 @deprecated( 

874 reason="Deprecated in favor of to_graph; will be removed after v27.", 

875 version="v27.0", 

876 category=FutureWarning, 

877 ) 

878 def toExpandedPipeline(self) -> Generator[TaskDef, None, None]: 

879 r"""Return a generator of `TaskDef`\s which can be used to create 

880 quantum graphs. 

881 

882 Returns 

883 ------- 

884 generator : generator of `TaskDef` 

885 The generator returned will be the sorted iterator of tasks which 

886 are to be used in constructing a quantum graph. 

887 

888 Raises 

889 ------ 

890 NotImplementedError 

891 Raised if a dataId is supplied in a config block. This is in place for

892 future use. 

893 """ 

894 yield from self.to_graph()._iter_task_defs() 

895 

896 def _add_task_to_graph(self, label: str, graph: pipeline_graph.PipelineGraph) -> None: 

897 """Add a single task from this pipeline to a pipeline graph that is 

898 under construction. 

899 

900 Parameters 

901 ---------- 

902 label : `str` 

903 Label for the task to be added. 

904 graph : `pipeline_graph.PipelineGraph` 

905 Graph to add the task to. 

906 """ 

907 if (taskIR := self._pipelineIR.tasks.get(label)) is None: 

908 raise NameError(f"Label {label} does not appear in this pipeline") 

909 taskClass: type[PipelineTask] = doImportType(taskIR.klass) 

910 config = taskClass.ConfigClass() 

911 instrument: PipeBaseInstrument | None = None 

912 if (instrumentName := self._pipelineIR.instrument) is not None: 

913 instrument_cls: type = doImportType(instrumentName) 

914 instrument = instrument_cls() 

915 config.applyConfigOverrides( 

916 instrument, 

917 getattr(taskClass, "_DefaultName", ""), 

918 taskIR.config, 

919 self._pipelineIR.parameters, 

920 label, 

921 ) 

922 graph.add_task(label, taskClass, config) 

923 

924 # TODO: remove on DM-40443. 

925 @deprecated( 

926 reason="Deprecated in favor of to_graph; will be removed after v27.", 

927 version="v27.0", 

928 category=FutureWarning, 

929 ) 

930 def __iter__(self) -> Generator[TaskDef, None, None]: 

931 return self.toExpandedPipeline() 

932 

933 # TODO: remove on DM-40443. 

934 @deprecated( 

935 reason="Deprecated in favor of to_graph; will be removed after v27.", 

936 version="v27.0", 

937 category=FutureWarning, 

938 ) 

939 def __getitem__(self, item: str) -> TaskDef: 

940 # Making a whole graph and then making a TaskDef from that is pretty 

941 # backwards, but I'm hoping to deprecate this method shortly in favor 

942 # of making the graph explicitly and working with its node objects. 

943 graph = pipeline_graph.PipelineGraph() 

944 self._add_task_to_graph(item, graph) 

945 (result,) = graph._iter_task_defs() 

946 return result 

947 

948 def __len__(self) -> int: 

949 return len(self._pipelineIR.tasks) 

950 

951 def __eq__(self, other: object) -> bool: 

952 if not isinstance(other, Pipeline): 

953 return False 

954 elif self._pipelineIR == other._pipelineIR: 

955 # Shortcut: if the IR is the same, the expanded pipeline must be 

956 # the same as well. But the converse is not true. 

957 return True 

958 else: 

959 # Compare as much as we can (task classes and their edges). 

960 if self.to_graph().diff_tasks(other.to_graph()): 

961 return False 

962 # After DM-27847, we should compare configuration here. 

963 raise NotImplementedError( 

964 "Pipelines cannot be compared because config instances cannot be compared; see DM-27847." 

965 ) 

966 

967 

968# TODO: remove on DM-40443. 

969@deprecated( 

970 reason="TaskDatasetTypes has been replaced by PipelineGraph, and will be removed after v27.", 

971 version="v27.0", 

972 category=FutureWarning, 

973) 

974@dataclass(frozen=True) 

975class TaskDatasetTypes: 

976 """An immutable struct that extracts and classifies the dataset types used 

977 by a `PipelineTask`. 

978 """ 

979 

980 initInputs: NamedValueSet[DatasetType] 

981 """Dataset types that are needed as inputs in order to construct this Task. 

982 

983 Task-level `initInputs` may be classified as either 

984 `~PipelineDatasetTypes.initInputs` or 

985 `~PipelineDatasetTypes.initIntermediates` at the Pipeline level. 

986 """ 

987 

988 initOutputs: NamedValueSet[DatasetType] 

989 """Dataset types that may be written after constructing this Task. 

990 

991 Task-level `initOutputs` may be classified as either 

992 `~PipelineDatasetTypes.initOutputs` or 

993 `~PipelineDatasetTypes.initIntermediates` at the Pipeline level. 

994 """ 

995 

996 inputs: NamedValueSet[DatasetType] 

997 """Dataset types that are regular inputs to this Task. 

998 

999 If an input dataset needed for a Quantum cannot be found in the input 

1000 collection(s) or produced by another Task in the Pipeline, that Quantum 

1001 (and all dependent Quanta) will not be produced. 

1002 

1003 Task-level `inputs` may be classified as either 

1004 `~PipelineDatasetTypes.inputs` or `~PipelineDatasetTypes.intermediates` 

1005 at the Pipeline level. 

1006 """ 

1007 

1008 queryConstraints: NamedValueSet[DatasetType] 

1009 """Regular inputs that should be used as constraints on the initial

1010 QuantumGraph generation data ID query, according to their tasks 

1011 (`NamedValueSet`). 

1012 """ 

1013 

1014 prerequisites: NamedValueSet[DatasetType] 

1015 """Dataset types that are prerequisite inputs to this Task. 

1016 

1017 Prerequisite inputs must exist in the input collection(s) before the 

1018 pipeline is run, but do not constrain the graph - if a prerequisite is 

1019 missing for a Quantum, `PrerequisiteMissingError` is raised. 

1020 

1021 Prerequisite inputs are not resolved until the second stage of 

1022 QuantumGraph generation. 

1023 """ 

1024 

1025 outputs: NamedValueSet[DatasetType] 

1026 """Dataset types that are produced by this Task. 

1027 

1028 Task-level `outputs` may be classified as either 

1029 `~PipelineDatasetTypes.outputs` or `~PipelineDatasetTypes.intermediates` 

1030 at the Pipeline level. 

1031 """ 

1032 

1033 @classmethod 

1034 def fromTaskDef( 

1035 cls, 

1036 taskDef: TaskDef, 

1037 *, 

1038 registry: Registry, 

1039 include_configs: bool = True, 

1040 storage_class_mapping: Mapping[str, str] | None = None, 

1041 ) -> TaskDatasetTypes: 

1042 """Extract and classify the dataset types from a single `PipelineTask`. 

1043 

1044 Parameters 

1045 ---------- 

1046 taskDef : `TaskDef` 

1047 An instance of a `TaskDef` class for a particular `PipelineTask`. 

1048 registry : `Registry` 

1049 Registry used to construct normalized 

1050 `~lsst.daf.butler.DatasetType` objects and retrieve those that are 

1051 incomplete. 

1052 include_configs : `bool`, optional 

1053 If `True` (default) include config dataset types as 

1054 ``initOutputs``. 

1055 storage_class_mapping : `~collections.abc.Mapping` of `str` to \ 

1056 `~lsst.daf.butler.StorageClass`, optional 

1057 If a taskdef contains a component dataset type that is unknown 

1058 to the registry, its parent `~lsst.daf.butler.StorageClass` will 

1059 be looked up in this mapping if it is supplied. If the mapping does 

1060 not contain the composite dataset type, or the mapping is not 

1061 supplied, an exception will be raised.

1062 

1063 Returns 

1064 ------- 

1065 types: `TaskDatasetTypes` 

1066 The dataset types used by this task. 

1067 

1068 Raises 

1069 ------ 

1070 ValueError 

1071 Raised if dataset type connection definition differs from 

1072 registry definition. 

1073 LookupError 

1074 Raised if component parent StorageClass could not be determined 

1075 and storage_class_mapping does not contain the composite type, or 

1076 is set to None. 

1077 """ 

1078 

1079 def makeDatasetTypesSet( 

1080 connectionType: str, 

1081 is_input: bool, 

1082 freeze: bool = True, 

1083 ) -> NamedValueSet[DatasetType]: 

1084 """Construct a set of true `~lsst.daf.butler.DatasetType` objects. 

1085 

1086 Parameters 

1087 ---------- 

1088 connectionType : `str` 

1089 Name of the connection type to produce a set for, corresponds 

1090 to an attribute of type `list` on the connection class 

1091 instance. 

1092 is_input : `bool` 

1093 If `True`, these are input dataset types; otherwise they are

1094 output dataset types.

1095 freeze : `bool`, optional 

1096 If `True`, call `NamedValueSet.freeze` on the object returned. 

1097 

1098 Returns 

1099 ------- 

1100 datasetTypes : `NamedValueSet` 

1101 A set of all datasetTypes which correspond to the input 

1102 connection type specified in the connection class of this 

1103 `PipelineTask`. 

1104 

1105 Raises 

1106 ------ 

1107 ValueError 

1108 Raised if dataset type connection definition differs from 

1109 registry definition. 

1110 LookupError 

1111 Raised if component parent StorageClass could not be determined 

1112 and storage_class_mapping does not contain the composite type, 

1113 or is set to None. 

1114 

1115 Notes 

1116 ----- 

1117 This function is a closure over the variables ``registry``,

1118 ``taskDef``, and ``storage_class_mapping``. 

1119 """ 

1120 datasetTypes = NamedValueSet[DatasetType]() 

1121 for c in iterConnections(taskDef.connections, connectionType): 

1122 dimensions = set(getattr(c, "dimensions", set())) 

1123 if "skypix" in dimensions: 

1124 try: 

1125 datasetType = registry.getDatasetType(c.name) 

1126 except LookupError as err: 

1127 raise LookupError( 

1128 f"DatasetType '{c.name}' referenced by " 

1129 f"{type(taskDef.connections).__name__} uses 'skypix' as a dimension " 

1130 "placeholder, but does not already exist in the registry. " 

1131 "Note that reference catalog names are now used as the dataset " 

1132 "type name instead of 'ref_cat'." 

1133 ) from err 

1134 rest1 = set(registry.dimensions.conform(dimensions - {"skypix"}).names) 

1135 rest2 = datasetType.dimensions.names - datasetType.dimensions.skypix.names 

1136 if rest1 != rest2: 

1137 raise ValueError( 

1138 f"Non-skypix dimensions for dataset type {c.name} declared in " 

1139 f"connections ({rest1}) are inconsistent with those in " 

1140 f"registry's version of this dataset ({rest2})." 

1141 ) 

1142 else: 

1143 # Component dataset types are not explicitly in the 

1144 # registry. This complicates consistency checks with 

1145 # registry and requires we work out the composite storage 

1146 # class. 

1147 registryDatasetType = None 

1148 try: 

1149 registryDatasetType = registry.getDatasetType(c.name) 

1150 except KeyError: 

1151 compositeName, componentName = DatasetType.splitDatasetTypeName(c.name) 

1152 if componentName: 

1153 if storage_class_mapping is None or compositeName not in storage_class_mapping: 

1154 raise LookupError( 

1155 "Component parent class cannot be determined, and " 

1156 "composite name was not in storage class mapping, or no " 

1157 "storage_class_mapping was supplied" 

1158 ) from None 

1159 else: 

1160 parentStorageClass = storage_class_mapping[compositeName] 

1161 else: 

1162 parentStorageClass = None 

1163 datasetType = c.makeDatasetType( 

1164 registry.dimensions, parentStorageClass=parentStorageClass 

1165 ) 

1166 registryDatasetType = datasetType 

1167 else: 

1168 datasetType = c.makeDatasetType( 

1169 registry.dimensions, parentStorageClass=registryDatasetType.parentStorageClass 

1170 ) 

1171 

1172 if registryDatasetType and datasetType != registryDatasetType: 

1173 # The dataset types differ but first check to see if 

1174 # they are compatible before raising. 

1175 if is_input: 

1176 # This DatasetType must be compatible on get. 

1177 is_compatible = datasetType.is_compatible_with(registryDatasetType) 

1178 else: 

1179 # Has to be able to be converted to the expected type

1180 # on put. 

1181 is_compatible = registryDatasetType.is_compatible_with(datasetType) 

1182 if is_compatible: 

1183 # For inputs we want the pipeline to use the 

1184 # pipeline definition, for outputs it should use 

1185 # the registry definition. 

1186 if not is_input: 

1187 datasetType = registryDatasetType 

1188 _LOG.debug( 

1189 "Dataset types differ (task %s != registry %s) but are compatible" 

1190 " for %s in %s.", 

1191 datasetType, 

1192 registryDatasetType, 

1193 "input" if is_input else "output", 

1194 taskDef.label, 

1195 ) 

1196 else: 

1197 try: 

1198 # Explicitly check for storage class just to 

1199 # make more specific message. 

1200 _ = datasetType.storageClass 

1201 except KeyError: 

1202 raise ValueError( 

1203 "Storage class does not exist for supplied dataset type " 

1204 f"{datasetType} for {taskDef.label}." 

1205 ) from None 

1206 raise ValueError( 

1207 f"Supplied dataset type ({datasetType}) inconsistent with " 

1208 f"registry definition ({registryDatasetType}) " 

1209 f"for {taskDef.label}." 

1210 ) 

1211 datasetTypes.add(datasetType) 

1212 if freeze: 

1213 datasetTypes.freeze() 

1214 return datasetTypes 

1215 

1216 # optionally add initOutput dataset for config 

1217 initOutputs = makeDatasetTypesSet("initOutputs", is_input=False, freeze=False) 

1218 if include_configs: 

1219 initOutputs.add( 

1220 DatasetType( 

1221 taskDef.configDatasetName, 

1222 registry.dimensions.empty, 

1223 storageClass=acc.CONFIG_INIT_OUTPUT_STORAGE_CLASS, 

1224 ) 

1225 ) 

1226 initOutputs.freeze() 

1227 

1228 # optionally add output dataset for metadata 

1229 outputs = makeDatasetTypesSet("outputs", is_input=False, freeze=False) 

1230 

1231 # Metadata is supposed to be of the TaskMetadata type, its dimensions 

1232 # correspond to a task quantum. 

1233 dimensions = registry.dimensions.conform(taskDef.connections.dimensions) 

1234 

1235 # Allow the storage class definition to be read from the existing 

1236 # dataset type definition if present. 

1237 try: 

1238 current = registry.getDatasetType(taskDef.metadataDatasetName) 

1239 except KeyError: 

1240 # No previous definition so use the default. 

1241 storageClass = acc.METADATA_OUTPUT_STORAGE_CLASS 

1242 else: 

1243 storageClass = current.storageClass.name 

1244 outputs.update({DatasetType(taskDef.metadataDatasetName, dimensions, storageClass)}) 

1245 

1246 if taskDef.logOutputDatasetName is not None: 

1247 # Log output dimensions correspond to a task quantum. 

1248 dimensions = registry.dimensions.conform(taskDef.connections.dimensions) 

1249 outputs.update( 

1250 { 

1251 DatasetType( 

1252 taskDef.logOutputDatasetName, 

1253 dimensions, 

1254 acc.LOG_OUTPUT_STORAGE_CLASS, 

1255 ) 

1256 } 

1257 ) 

1258 

1259 outputs.freeze() 

1260 

1261 inputs = makeDatasetTypesSet("inputs", is_input=True) 

1262 queryConstraints = NamedValueSet( 

1263 inputs[c.name] 

1264 for c in cast(Iterable[Input], iterConnections(taskDef.connections, "inputs")) 

1265 if not c.deferGraphConstraint 

1266 ) 

1267 

1268 return cls( 

1269 initInputs=makeDatasetTypesSet("initInputs", is_input=True), 

1270 initOutputs=initOutputs, 

1271 inputs=inputs, 

1272 queryConstraints=queryConstraints, 

1273 prerequisites=makeDatasetTypesSet("prerequisiteInputs", is_input=True), 

1274 outputs=outputs, 

1275 ) 

1276 

1277 

1278# TODO: remove on DM-40443. 

1279@deprecated( 

1280 reason="PipelineDatasetTypes has been replaced by PipelineGraph, and will be removed after v27.", 

1281 version="v27.0", 

1282 category=FutureWarning, 

1283) 

1284@dataclass(frozen=True) 

1285class PipelineDatasetTypes: 

1286 """An immutable struct that classifies the dataset types used in a 

1287 `Pipeline`. 

1288 """ 

1289 

1290 packagesDatasetName: ClassVar[str] = acc.PACKAGES_INIT_OUTPUT_NAME 

1291 """Name of a dataset type used to save package versions. 

1292 """ 

1293 

1294 initInputs: NamedValueSet[DatasetType] 

1295 """Dataset types that are needed as inputs in order to construct the Tasks 

1296 in this Pipeline. 

1297 

1298 This does not include dataset types that are produced when constructing 

1299 other Tasks in the Pipeline (these are classified as `initIntermediates`). 

1300 """ 

1301 

1302 initOutputs: NamedValueSet[DatasetType] 

1303 """Dataset types that may be written after constructing the Tasks in this 

1304 Pipeline. 

1305 

1306 This does not include dataset types that are also used as inputs when 

1307 constructing other Tasks in the Pipeline (these are classified as 

1308 `initIntermediates`). 

1309 """ 

1310 

1311 initIntermediates: NamedValueSet[DatasetType] 

1312 """Dataset types that are both used when constructing one or more Tasks 

1313 in the Pipeline and produced as a side-effect of constructing another 

1314 Task in the Pipeline. 

1315 """ 

1316 

1317 inputs: NamedValueSet[DatasetType] 

1318 """Dataset types that are regular inputs for the full pipeline. 

1319 

1320 If an input dataset needed for a Quantum cannot be found in the input 

1321 collection(s), that Quantum (and all dependent Quanta) will not be 

1322 produced. 

1323 """ 

1324 

1325 queryConstraints: NamedValueSet[DatasetType] 

1326 """Regular inputs that should be used as constraints on the initial 

1327 QuantumGraph generation data ID query, according to their tasks 

1328 (`NamedValueSet`). 

1329 """ 

1330 

1331 prerequisites: NamedValueSet[DatasetType] 

1332 """Dataset types that are prerequisite inputs for the full Pipeline. 

1333 

1334 Prerequisite inputs must exist in the input collection(s) before the 

1335 pipeline is run, but do not constrain the graph - if a prerequisite is 

1336 missing for a Quantum, `PrerequisiteMissingError` is raised. 

1337 

1338 Prerequisite inputs are not resolved until the second stage of 

1339 QuantumGraph generation. 

1340 """ 

1341 

1342 intermediates: NamedValueSet[DatasetType] 

1343 """Dataset types that are output by one Task in the Pipeline and consumed 

1344 as inputs by one or more other Tasks in the Pipeline. 

1345 """ 

1346 

1347 outputs: NamedValueSet[DatasetType] 

1348 """Dataset types that are output by a Task in the Pipeline and not consumed 

1349 by any other Task in the Pipeline. 

1350 """ 

1351 

1352 byTask: Mapping[str, TaskDatasetTypes] 

1353 """Per-Task dataset types, keyed by label in the `Pipeline`. 

1354 

1355 This is guaranteed to be zip-iterable with the `Pipeline` itself (assuming 

1356 neither has been modified since the dataset types were extracted, of 

1357 course). 

1358 """ 

1359 

1360 @classmethod 

1361 def fromPipeline( 

1362 cls, 

1363 pipeline: Pipeline | Iterable[TaskDef], 

1364 *, 

1365 registry: Registry, 

1366 include_configs: bool = True, 

1367 include_packages: bool = True, 

1368 ) -> PipelineDatasetTypes: 

1369 """Extract and classify the dataset types from all tasks in a 

1370 `Pipeline`. 

1371 

1372 Parameters 

1373 ---------- 

1374 pipeline : `Pipeline` or `~collections.abc.Iterable` [ `TaskDef` ] 

1375 A collection of tasks that can be run together. 

1376 registry : `Registry` 

1377 Registry used to construct normalized 

1378 `~lsst.daf.butler.DatasetType` objects and retrieve those that are 

1379 incomplete. 

1380 include_configs : `bool`, optional 

1381 If `True` (default) include config dataset types as 

1382 ``initOutputs``. 

1383 include_packages : `bool`, optional 

1384 If `True` (default) include the dataset type for software package 

1385 versions in ``initOutputs``. 

1386 

1387 Returns 

1388 ------- 

1389 types: `PipelineDatasetTypes` 

1390 The dataset types used by this `Pipeline`. 

1391 

1392 Raises 

1393 ------ 

1394 ValueError 

1395 Raised if Tasks are inconsistent about which datasets are marked 

1396 prerequisite. This indicates that the Tasks cannot be run as part 

1397 of the same `Pipeline`. 

1398 """ 

1399 allInputs = NamedValueSet[DatasetType]() 

1400 allOutputs = NamedValueSet[DatasetType]() 

1401 allInitInputs = NamedValueSet[DatasetType]() 

1402 allInitOutputs = NamedValueSet[DatasetType]() 

1403 prerequisites = NamedValueSet[DatasetType]() 

1404 queryConstraints = NamedValueSet[DatasetType]() 

1405 byTask = dict() 

1406 if include_packages: 

1407 allInitOutputs.add( 

1408 DatasetType( 

1409 cls.packagesDatasetName, 

1410 registry.dimensions.empty, 

1411 storageClass=acc.PACKAGES_INIT_OUTPUT_STORAGE_CLASS, 

1412 ) 

1413 ) 

1414 # create a list of TaskDefs in case the input is a generator 

1415 pipeline = list(pipeline) 

1416 

1417 # collect all the output dataset types 

1418 typeStorageclassMap: dict[str, str] = {} 

1419 for taskDef in pipeline: 

1420 for outConnection in iterConnections(taskDef.connections, "outputs"): 

1421 typeStorageclassMap[outConnection.name] = outConnection.storageClass 

1422 

1423 for taskDef in pipeline: 

1424 thisTask = TaskDatasetTypes.fromTaskDef( 

1425 taskDef, 

1426 registry=registry, 

1427 include_configs=include_configs, 

1428 storage_class_mapping=typeStorageclassMap, 

1429 ) 

1430 allInitInputs.update(thisTask.initInputs) 

1431 allInitOutputs.update(thisTask.initOutputs) 

1432 allInputs.update(thisTask.inputs) 

1433 # Inputs are query constraints if any task considers them a query 

1434 # constraint. 

1435 queryConstraints.update(thisTask.queryConstraints) 

1436 prerequisites.update(thisTask.prerequisites) 

1437 allOutputs.update(thisTask.outputs) 

1438 byTask[taskDef.label] = thisTask 

1439 if not prerequisites.isdisjoint(allInputs): 

1440 raise ValueError( 

1441 "{} marked as both prerequisites and regular inputs".format( 

1442 {dt.name for dt in allInputs & prerequisites} 

1443 ) 

1444 ) 

1445 if not prerequisites.isdisjoint(allOutputs): 

1446 raise ValueError( 

1447 "{} marked as both prerequisites and outputs".format( 

1448 {dt.name for dt in allOutputs & prerequisites} 

1449 ) 

1450 ) 

1451 # Make sure that components which are marked as inputs get treated as 

1452 # intermediates if there is an output which produces the composite 

1453 # containing the component 

1454 intermediateComponents = NamedValueSet[DatasetType]() 

1455 intermediateComposites = NamedValueSet[DatasetType]() 

1456 outputNameMapping = {dsType.name: dsType for dsType in allOutputs} 

1457 for dsType in allInputs: 

1458 # get the name of a possible component 

1459 name, component = dsType.nameAndComponent() 

1460 # if there is a component name, that means this is a component 

1461 # DatasetType, if there is an output which produces the parent of 

1462 # this component, treat this input as an intermediate 

1463 if component is not None: 

1464 # This needs to be in this if block, because someone might have 

1465 # a composite that is a pure input from existing data 

1466 if name in outputNameMapping: 

1467 intermediateComponents.add(dsType) 

1468 intermediateComposites.add(outputNameMapping[name]) 

1469 

1470 def checkConsistency(a: NamedValueSet, b: NamedValueSet) -> None: 

1471 common = a.names & b.names 

1472 for name in common: 

1473 # Any compatibility is allowed. This function does not know 

1474 # if a dataset type is to be used for input or output. 

1475 if not (a[name].is_compatible_with(b[name]) or b[name].is_compatible_with(a[name])): 

1476 raise ValueError(f"Conflicting definitions for dataset type: {a[name]} != {b[name]}.") 

1477 

1478 checkConsistency(allInitInputs, allInitOutputs) 

1479 checkConsistency(allInputs, allOutputs) 

1480 checkConsistency(allInputs, intermediateComposites) 

1481 checkConsistency(allOutputs, intermediateComposites) 

1482 

1483 def frozen(s: Set[DatasetType]) -> NamedValueSet[DatasetType]: 

1484 assert isinstance(s, NamedValueSet) 

1485 s.freeze() 

1486 return s 

1487 

1488 inputs = frozen(allInputs - allOutputs - intermediateComponents) 

1489 

1490 return cls( 

1491 initInputs=frozen(allInitInputs - allInitOutputs), 

1492 initIntermediates=frozen(allInitInputs & allInitOutputs), 

1493 initOutputs=frozen(allInitOutputs - allInitInputs), 

1494 inputs=inputs, 

1495 queryConstraints=frozen(queryConstraints & inputs), 

1496 # If there are storage class differences in inputs and outputs 

1497 # the intermediates have to choose priority. Here choose that 

1498 # inputs to tasks must match the requested storage class by

1499 # applying the inputs over the top of the outputs. 

1500 intermediates=frozen(allOutputs & allInputs | intermediateComponents), 

1501 outputs=frozen(allOutputs - allInputs - intermediateComposites), 

1502 prerequisites=frozen(prerequisites), 

1503 byTask=MappingProxyType(byTask), # MappingProxyType -> frozen view of dict for immutability 

1504 ) 

1505 

1506 @classmethod 

1507 def initOutputNames( 

1508 cls, 

1509 pipeline: Pipeline | Iterable[TaskDef], 

1510 *, 

1511 include_configs: bool = True, 

1512 include_packages: bool = True, 

1513 ) -> Iterator[str]: 

1514 """Return the names of dataset types of task initOutputs, Configs,

1515 and package versions for a pipeline. 

1516 

1517 Parameters 

1518 ---------- 

1519 pipeline : `Pipeline` or `~collections.abc.Iterable` [ `TaskDef` ] 

1520 A `Pipeline` instance or collection of `TaskDef` instances. 

1521 include_configs : `bool`, optional 

1522 If `True` (default) include config dataset types. 

1523 include_packages : `bool`, optional 

1524 If `True` (default) include the dataset type for package versions. 

1525 

1526 Yields 

1527 ------ 

1528 datasetTypeName : `str` 

1529 Name of the dataset type. 

1530 """ 

1531 if include_packages: 

1532 # Package versions dataset type 

1533 yield cls.packagesDatasetName 

1534 

1535 if isinstance(pipeline, Pipeline): 

1536 pipeline = pipeline.toExpandedPipeline() 

1537 

1538 for taskDef in pipeline: 

1539 # all task InitOutputs 

1540 for name in taskDef.connections.initOutputs: 

1541 attribute = getattr(taskDef.connections, name) 

1542 yield attribute.name 

1543 

1544 # config dataset name 

1545 if include_configs: 

1546 yield taskDef.configDatasetName