Coverage for python/lsst/pipe/base/pipeline.py: 18%

396 statements  

1# This file is part of pipe_base. 

2# 

3# Developed for the LSST Data Management System. 

4# This product includes software developed by the LSST Project 

5# (http://www.lsst.org). 

6# See the COPYRIGHT file at the top-level directory of this distribution 

7# for details of code ownership. 

8# 

9# This program is free software: you can redistribute it and/or modify 

10# it under the terms of the GNU General Public License as published by 

11# the Free Software Foundation, either version 3 of the License, or 

12# (at your option) any later version. 

13# 

14# This program is distributed in the hope that it will be useful, 

15# but WITHOUT ANY WARRANTY; without even the implied warranty of 

16# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the 

17# GNU General Public License for more details. 

18# 

19# You should have received a copy of the GNU General Public License 

20# along with this program. If not, see <http://www.gnu.org/licenses/>. 

21from __future__ import annotations 

22 

23"""Module defining Pipeline class and related methods. 

24""" 

25 

26__all__ = ["Pipeline", "TaskDef", "TaskDatasetTypes", "PipelineDatasetTypes", "LabelSpecifier"] 

27 

28import copy 

29import logging 

30import os 

31import re 

32import urllib.parse 

33import warnings 

34 

35# ------------------------------- 

36# Imports of standard modules -- 

37# ------------------------------- 

38from dataclasses import dataclass 

39from types import MappingProxyType 

40from typing import ( 

41 TYPE_CHECKING, 

42 AbstractSet, 

43 ClassVar, 

44 Dict, 

45 Generator, 

46 Iterable, 

47 Iterator, 

48 Mapping, 

49 Optional, 

50 Set, 

51 Tuple, 

52 Type, 

53 Union, 

54) 

55 

56# ----------------------------- 

57# Imports for other modules -- 

58from lsst.daf.butler import DatasetType, NamedValueSet, Registry, SkyPixDimension 

59from lsst.resources import ResourcePath, ResourcePathExpression 

60from lsst.utils import doImportType 

61from lsst.utils.introspection import get_full_type_name 

62 

63from . import pipelineIR, pipeTools 

64from ._task_metadata import TaskMetadata 

65from .configOverrides import ConfigOverrides 

66from .connections import iterConnections 

67from .pipelineTask import PipelineTask 

68from .task import _TASK_METADATA_TYPE 

69 

70if TYPE_CHECKING: # Imports needed only for type annotations; may be circular. 

71 from lsst.obs.base import Instrument 

72 from lsst.pex.config import Config 

73 

74# ---------------------------------- 

75# Local non-exported definitions -- 

76# ---------------------------------- 

77 

78_LOG = logging.getLogger(__name__) 

79 

80# ------------------------ 

81# Exported definitions -- 

82# ------------------------ 

83 

84 

85@dataclass 

86class LabelSpecifier: 

87 """A structure to specify a subset of labels to load 

88 

89 This structure may contain a set of labels to be used in subsetting a 

90 pipeline, or a beginning and end point. Beginning or end may be empty, 

91 in which case the range will be a half open interval. Unlike python 

92 iteration bounds, end bounds are *INCLUDED*. Note that range based 

93 selection is not well defined for pipelines that are not linear in nature, 

94 and correct behavior is not guaranteed, or may vary from run to run. 

95 """ 

96 

97 labels: Optional[Set[str]] = None 

98 begin: Optional[str] = None 

99 end: Optional[str] = None 

100 

101 def __post_init__(self) -> None: 

102 if self.labels is not None and (self.begin or self.end): 

103 raise ValueError( 

104 "This struct can only be initialized with a labels set or a begin (and/or) end specifier" 

105 ) 

106 
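# Illustrative sketch (not part of the original module): the two mutually
# exclusive ways to build a LabelSpecifier. The label names used here are
# hypothetical.
#
#     LabelSpecifier(labels={"isr", "characterizeImage"})  # explicit label set
#     LabelSpecifier(begin="isr", end="calibrate")         # inclusive range
#     LabelSpecifier(begin="isr")                          # half-open: "isr" to the end
#     LabelSpecifier(labels={"isr"}, begin="calibrate")    # raises ValueError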

107 

108class TaskDef: 

109 """TaskDef is a collection of information about task needed by Pipeline. 

110 

111 The information includes task name, configuration object and optional 

112 task class. This class is just a collection of attributes and it exposes 

113 all of them so that attributes could potentially be modified in place 

114 (e.g. if configuration needs extra overrides). 

115 

116 Attributes 

117 ---------- 

118 taskName : `str`, optional 

119 `PipelineTask` class name, currently it is not specified whether this 

120 is a fully-qualified name or partial name (e.g. ``module.TaskClass``). 

121 Framework should be prepared to handle all cases. If not provided, 

122 ``taskClass`` must be, and ``taskClass.__name__`` is used. 

123 config : `lsst.pex.config.Config`, optional 

124 Instance of the configuration class corresponding to this task class, 

125 usually with all overrides applied. This config will be frozen. If 

126 not provided, ``taskClass`` must be provided and 

127 ``taskClass.ConfigClass()`` will be used. 

128 taskClass : `type`, optional 

129 `PipelineTask` class object, can be ``None``. If ``None`` then 

130 framework will have to locate and load class. 

131 label : `str`, optional 

132 Task label, usually a short string unique in a pipeline. If not 

133 provided, ``taskClass`` must be, and ``taskClass._DefaultName`` will 

134 be used. 

135 """ 

136 

137 def __init__( 

138 self, 

139 taskName: Optional[str] = None, 

140 config: Optional[Config] = None, 

141 taskClass: Optional[Type[PipelineTask]] = None, 

142 label: Optional[str] = None, 

143 ): 

144 if taskName is None: 

145 if taskClass is None: 

146 raise ValueError("At least one of `taskName` and `taskClass` must be provided.") 

147 taskName = taskClass.__name__ 

148 if config is None: 

149 if taskClass is None: 

150 raise ValueError("`taskClass` must be provided if `config` is not.") 

151 config = taskClass.ConfigClass() 

152 if label is None: 

153 if taskClass is None: 

154 raise ValueError("`taskClass` must be provided if `label` is not.") 

155 label = taskClass._DefaultName 

156 self.taskName = taskName 

157 try: 

158 config.validate() 

159 except Exception: 

160 _LOG.error("Configuration validation failed for task %s (%s)", label, taskName) 

161 raise 

162 config.freeze() 

163 self.config = config 

164 self.taskClass = taskClass 

165 self.label = label 

166 self.connections = config.connections.ConnectionsClass(config=config) 

167 

168 @property 

169 def configDatasetName(self) -> str: 

170 """Name of a dataset type for configuration of this task (`str`)""" 

171 return self.label + "_config" 

172 

173 @property 

174 def metadataDatasetName(self) -> Optional[str]: 

175 """Name of a dataset type for metadata of this task, `None` if 

176 metadata is not to be saved (`str`) 

177 """ 

178 if self.config.saveMetadata: 

179 return self.label + "_metadata" 

180 else: 

181 return None 

182 

183 @property 

184 def logOutputDatasetName(self) -> Optional[str]: 

185 """Name of a dataset type for log output from this task, `None` if 

186 logs are not to be saved (`str`) 

187 """ 

188 if self.config.saveLogOutput: 

189 return self.label + "_log" 

190 else: 

191 return None 

192 

193 def __str__(self) -> str: 

194 rep = "TaskDef(" + self.taskName 

195 if self.label: 

196 rep += ", label=" + self.label 

197 rep += ")" 

198 return rep 

199 

200 def __eq__(self, other: object) -> bool: 

201 if not isinstance(other, TaskDef): 

202 return False 

203 # This does not consider equality of configs when determining equality 

204 # as config equality is a difficult thing to define. Should be updated 

205 # after DM-27847 

206 return self.taskClass == other.taskClass and self.label == other.label 

207 

208 def __hash__(self) -> int: 

209 return hash((self.taskClass, self.label)) 

210 
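# Illustrative sketch (not part of the original module): dataset type names
# derived from a TaskDef's label, assuming a hypothetical task labeled "isr".
#
#     taskDef.configDatasetName     -> "isr_config"
#     taskDef.metadataDatasetName   -> "isr_metadata" (None if config.saveMetadata is False)
#     taskDef.logOutputDatasetName  -> "isr_log" (None if config.saveLogOutput is False)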

211 

212class Pipeline: 

213 """A `Pipeline` is a representation of a series of tasks to run, and the 

214 configuration for those tasks. 

215 

216 Parameters 

217 ---------- 

218 description : `str` 

219 A description of what this pipeline does. 

220 """ 

221 

222 def __init__(self, description: str): 

223 pipeline_dict = {"description": description, "tasks": {}} 

224 self._pipelineIR = pipelineIR.PipelineIR(pipeline_dict) 

225 

226 @classmethod 

227 def fromFile(cls, filename: str) -> Pipeline: 

228 """Load a pipeline defined in a pipeline yaml file. 

229 

230 Parameters 

231 ---------- 

232 filename: `str` 

233 A path that points to a pipeline defined in yaml format. This 

234 filename may also supply additional labels to be used in 

235 subsetting the loaded Pipeline. These labels are separated from 

236 the path by a \\#, and may be specified as a comma separated 

237 list, or a range denoted as beginning..end. Beginning or end may 

238 be empty, in which case the range will be a half open interval. 

239 Unlike python iteration bounds, end bounds are *INCLUDED*. Note 

240 that range based selection is not well defined for pipelines that 

241 are not linear in nature, and correct behavior is not guaranteed, 

242 or may vary from run to run. 

243 

244 Returns 

245 ------- 

246 pipeline: `Pipeline` 

247 The pipeline loaded from specified location with appropriate (if 

248 any) subsetting 

249 

250 Notes 

251 ----- 

252 This method attempts to prune any contracts that contain labels which 

253 are not in the declared subset of labels. This pruning is done using a 

254 string based matching due to the nature of contracts and may prune more 

255 than it should. 

256 """ 

257 return cls.from_uri(filename) 

258 

259 @classmethod 

260 def from_uri(cls, uri: ResourcePathExpression) -> Pipeline: 

261 """Load a pipeline defined in a pipeline yaml file at a location 

262 specified by a URI. 

263 

264 Parameters 

265 ---------- 

266 uri: convertible to `ResourcePath` 

267 If a string is supplied this should be a URI path that points to a 

268 pipeline defined in yaml format, either as a direct path to the 

269 yaml file, or as a directory containing a "pipeline.yaml" file (the 

270 form used by `write_to_uri` with ``expand=True``). This uri may 

271 also supply additional labels to be used in subsetting the loaded 

272 Pipeline. These labels are separated from the path by a \\#, and 

273 may be specified as a comma separated list, or a range denoted as 

274 beginning..end. Beginning or end may be empty, in which case the 

275 range will be a half open interval. Unlike python iteration bounds, 

276 end bounds are *INCLUDED*. Note that range based selection is not 

277 well defined for pipelines that are not linear in nature, and 

278 correct behavior is not guaranteed, or may vary from run to run. 

279 The same specifiers can be used with a `ResourcePath` object, by 

280 being the sole contents of the fragment attribute. 

281 

282 Returns 

283 ------- 

284 pipeline: `Pipeline` 

285 The pipeline loaded from specified location with appropriate (if 

286 any) subsetting 

287 

288 Notes 

289 ----- 

290 This method attempts to prune any contracts that contain labels which 

291 are not in the declared subset of labels. This pruning is done using a 

292 string based matching due to the nature of contracts and may prune more 

293 than it should. 

294 """ 

295 # Split up the uri and any labels that were supplied 

296 uri, label_specifier = cls._parse_file_specifier(uri) 

297 pipeline: Pipeline = cls.fromIR(pipelineIR.PipelineIR.from_uri(uri)) 

298 

299 # If there are labels supplied, only keep those 

300 if label_specifier is not None: 

301 pipeline = pipeline.subsetFromLabels(label_specifier) 

302 return pipeline 

303 
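    # Illustrative sketch (not part of the original module): the fragment
    # syntax accepted by Pipeline.from_uri / Pipeline.fromFile. File names and
    # labels are hypothetical.
    #
    #     Pipeline.from_uri("pipeline.yaml")                 # whole pipeline
    #     Pipeline.from_uri("pipeline.yaml#isr,calibrate")   # explicit label subset
    #     Pipeline.from_uri("pipeline.yaml#isr..calibrate")  # inclusive label range
    #     Pipeline.from_uri("pipeline.yaml#..calibrate")     # everything up to "calibrate"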

304 def subsetFromLabels(self, labelSpecifier: LabelSpecifier) -> Pipeline: 

305 """Subset a pipeline to contain only labels specified in labelSpecifier 

306 

307 Parameters 

308 ---------- 

309 labelSpecifier : `labelSpecifier` 

310 Object containing labels that describes how to subset a pipeline. 

311 

312 Returns 

313 ------- 

314 pipeline : `Pipeline` 

315 A new pipeline object that is a subset of the old pipeline 

316 

317 Raises 

318 ------ 

319 ValueError 

320 Raised if there is an issue with specified labels 

321 

322 Notes 

323 ----- 

324 This method attempts to prune any contracts that contain labels which 

325 are not in the declared subset of labels. This pruning is done using a 

326 string based matching due to the nature of contracts and may prune more 

327 than it should. 

328 """ 

329 # Labels supplied as a set 

330 if labelSpecifier.labels: 

331 labelSet = labelSpecifier.labels 

332 # Labels supplied as a range, first create a list of all the labels 

333 # in the pipeline sorted according to task dependency. Then only 

334 # keep labels that lie between the supplied bounds 

335 else: 

336 # Create a copy of the pipeline to use when assessing the label 

337 # ordering. Use a dict for fast searching while preserving order. 

338 # Remove contracts so they do not fail in the expansion step. This 

339 # is needed because a user may only configure the tasks they intend 

340 # to run, which may cause some contracts to fail if they will later 

341 # be dropped 

342 pipeline = copy.deepcopy(self) 

343 pipeline._pipelineIR.contracts = [] 

344 labels = {taskdef.label: True for taskdef in pipeline.toExpandedPipeline()} 

345 

346 # Verify the bounds are in the labels 

347 if labelSpecifier.begin is not None: 

348 if labelSpecifier.begin not in labels: 

349 raise ValueError( 

350 f"Beginning of range subset, {labelSpecifier.begin}, not found in " 

351 "pipeline definition" 

352 ) 

353 if labelSpecifier.end is not None: 

354 if labelSpecifier.end not in labels: 

355 raise ValueError( 

356 f"End of range subset, {labelSpecifier.end}, not found in pipeline definition" 

357 ) 

358 

359 labelSet = set() 

360 for label in labels: 

361 if labelSpecifier.begin is not None: 

362 if label != labelSpecifier.begin: 

363 continue 

364 else: 

365 labelSpecifier.begin = None 

366 labelSet.add(label) 

367 if labelSpecifier.end is not None and label == labelSpecifier.end: 

368 break 

369 return Pipeline.fromIR(self._pipelineIR.subset_from_labels(labelSet)) 

370 

371 @staticmethod 

372 def _parse_file_specifier(uri: ResourcePathExpression) -> Tuple[ResourcePath, Optional[LabelSpecifier]]: 

373 """Split appart a uri and any possible label subsets""" 

374 if isinstance(uri, str): 

375 # This is to support legacy pipelines during transition 

376 uri, num_replace = re.subn("[:](?!\\/\\/)", "#", uri) 

377 if num_replace: 

378 warnings.warn( 

379 f"The pipeline file {uri} seems to use the legacy : to separate " 

380 "labels, this is deprecated and will be removed after June 2021, please use " 

381 "# instead.", 

382 category=FutureWarning, 

383 ) 

384 if uri.count("#") > 1: 

385 raise ValueError("Only one set of labels is allowed when specifying a pipeline to load") 

386 # Everything else can be converted directly to ResourcePath. 

387 uri = ResourcePath(uri) 

388 label_subset = uri.fragment or None 

389 

390 specifier: Optional[LabelSpecifier] 

391 if label_subset is not None: 

392 label_subset = urllib.parse.unquote(label_subset) 

393 args: Dict[str, Union[Set[str], str, None]] 

394 # labels supplied as a list 

395 if "," in label_subset: 

396 if ".." in label_subset: 

397 raise ValueError( 

398 "Can only specify a list of labels or a rangewhen loading a Pipline not both" 

399 ) 

400 args = {"labels": set(label_subset.split(","))} 

401 # labels supplied as a range 

402 elif ".." in label_subset: 

403 # Try to de-structure the labelSubset, this will fail if more 

404 # than one range is specified 

405 begin, end, *rest = label_subset.split("..") 

406 if rest: 

407 raise ValueError("Only one range can be specified when loading a pipeline") 

408 args = {"begin": begin if begin else None, "end": end if end else None} 

409 # Assume anything else is a single label 

410 else: 

411 args = {"labels": {label_subset}} 

412 

413 # MyPy doesn't like how cavalier kwarg construction is with types. 

414 specifier = LabelSpecifier(**args) # type: ignore 

415 else: 

416 specifier = None 

417 

418 return uri, specifier 

419 

420 @classmethod 

421 def fromString(cls, pipeline_string: str) -> Pipeline: 

422 """Create a pipeline from string formatted as a pipeline document. 

423 

424 Parameters 

425 ---------- 

426 pipeline_string : `str` 

427 A string that is formatted like a pipeline document. 

428 

429 Returns 

430 ------- 

431 pipeline: `Pipeline` 

432 """ 

433 pipeline = cls.fromIR(pipelineIR.PipelineIR.from_string(pipeline_string)) 

434 return pipeline 

435 

436 @classmethod 

437 def fromIR(cls, deserialized_pipeline: pipelineIR.PipelineIR) -> Pipeline: 

438 """Create a pipeline from an already created `PipelineIR` object. 

439 

440 Parameters 

441 ---------- 

442 deserialized_pipeline: `PipelineIR` 

443 An already created pipeline intermediate representation object 

444 

445 Returns 

446 ------- 

447 pipeline: `Pipeline` 

448 """ 

449 pipeline = cls.__new__(cls) 

450 pipeline._pipelineIR = deserialized_pipeline 

451 return pipeline 

452 

453 @classmethod 

454 def fromPipeline(cls, pipeline: Pipeline) -> Pipeline: 

455 """Create a new pipeline by copying an already existing `Pipeline`. 

456 

457 Parameters 

458 ---------- 

459 pipeline: `Pipeline` 

460 An already created `Pipeline` object to copy. 

461 

462 Returns 

463 ------- 

464 pipeline: `Pipeline` 

465 """ 

466 return cls.fromIR(copy.deepcopy(pipeline._pipelineIR)) 

467 

468 def __str__(self) -> str: 

469 # tasks need to be sorted on each call because someone might have 

470 # added or removed a task, and caching changes does not seem worth 

471 # the small overhead 

472 labels = [td.label for td in self._toExpandedPipelineImpl(checkContracts=False)] 

473 self._pipelineIR.reorder_tasks(labels) 

474 return str(self._pipelineIR) 

475 

476 def addInstrument(self, instrument: Union[Instrument, str]) -> None: 

477 """Add an instrument to the pipeline, or replace an instrument that is 

478 already defined. 

479 

480 Parameters 

481 ---------- 

482 instrument : `~lsst.obs.base.Instrument` or `str` 

483 Either a derived class object of `lsst.obs.base.Instrument` or 

484 a string corresponding to a fully qualified 

485 `lsst.obs.base.Instrument` name. 

486 """ 

487 if isinstance(instrument, str): 

488 pass 

489 else: 

490 # TODO: assume that this is a subclass of Instrument, no type 

491 # checking 

492 instrument = get_full_type_name(instrument) 

493 self._pipelineIR.instrument = instrument 

494 

495 def getInstrument(self) -> Optional[str]: 

496 """Get the instrument from the pipeline. 

497 

498 Returns 

499 ------- 

500 instrument : `str`, or None 

501 The fully qualified name of a `lsst.obs.base.Instrument` subclass 

502 name, or None if the pipeline does not have an instrument. 

503 """ 

504 return self._pipelineIR.instrument 

505 

506 def addTask(self, task: Union[Type[PipelineTask], str], label: str) -> None: 

507 """Add a new task to the pipeline, or replace a task that is already 

508 associated with the supplied label. 

509 

510 Parameters 

511 ---------- 

512 task: `PipelineTask` or `str` 

513 Either a derived class object of a `PipelineTask` or a string 

514 corresponding to a fully qualified `PipelineTask` name. 

515 label: `str` 

516 A label that is used to identify the `PipelineTask` being added 

517 """ 

518 if isinstance(task, str): 

519 taskName = task 

520 elif issubclass(task, PipelineTask): 

521 taskName = get_full_type_name(task) 

522 else: 

523 raise ValueError( 

524 "task must be either a child class of PipelineTask or a string containing" 

525 " a fully qualified name to one" 

526 ) 

527 if not label: 

528 # in some cases (with a command line-generated pipeline) tasks can 

529 # be defined without a label, which is not acceptable; use the task's 

530 # _DefaultName in that case 

531 if isinstance(task, str): 

532 task_class = doImportType(task) 

533 label = task_class._DefaultName 

534 self._pipelineIR.tasks[label] = pipelineIR.TaskIR(label, taskName) 

535 

536 def removeTask(self, label: str) -> None: 

537 """Remove a task from the pipeline. 

538 

539 Parameters 

540 ---------- 

541 label : `str` 

542 The label used to identify the task that is to be removed 

543 

544 Raises 

545 ------ 

546 KeyError 

547 If no task with that label exists in the pipeline 

548 

549 """ 

550 self._pipelineIR.tasks.pop(label) 

551 

552 def addConfigOverride(self, label: str, key: str, value: object) -> None: 

553 """Apply single config override. 

554 

555 Parameters 

556 ---------- 

557 label : `str` 

558 Label of the task. 

559 key: `str` 

560 Fully-qualified field name. 

561 value : object 

562 Value to be given to a field. 

563 """ 

564 self._addConfigImpl(label, pipelineIR.ConfigIR(rest={key: value})) 

565 

566 def addConfigFile(self, label: str, filename: str) -> None: 

567 """Add overrides from a specified file. 

568 

569 Parameters 

570 ---------- 

571 label : `str` 

572 The label used to identify the task associated with config to 

573 modify 

574 filename : `str` 

575 Path to the override file. 

576 """ 

577 self._addConfigImpl(label, pipelineIR.ConfigIR(file=[filename])) 

578 

579 def addConfigPython(self, label: str, pythonString: str) -> None: 

580 """Add Overrides by running a snippet of python code against a config. 

581 

582 Parameters 

583 ---------- 

584 label : `str` 

585 The label used to identify the task associated with config to 

586 modify. 

587 pythonString: `str` 

588 A string which is valid python code to be executed. This is done 

589 with config as the only local accessible value. 

590 """ 

591 self._addConfigImpl(label, pipelineIR.ConfigIR(python=pythonString)) 

592 
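    # Illustrative sketch (not part of the original module): building a
    # pipeline programmatically and layering config overrides. The task name,
    # label, field names, and file path are hypothetical.
    #
    #     pipeline = Pipeline("example pipeline")
    #     pipeline.addTask("lsst.example.module.ExampleTask", label="example")
    #     pipeline.addConfigOverride("example", "someField", 42)
    #     pipeline.addConfigFile("example", "overrides.py")
    #     pipeline.addConfigPython("example", "config.someField = 42")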

593 def _addConfigImpl(self, label: str, newConfig: pipelineIR.ConfigIR) -> None: 

594 if label == "parameters": 

595 if newConfig.rest.keys() - self._pipelineIR.parameters.mapping.keys(): 

596 raise ValueError("Cannot override parameters that are not defined in pipeline") 

597 self._pipelineIR.parameters.mapping.update(newConfig.rest) 

598 if newConfig.file: 

599 raise ValueError("Setting parameters section with config file is not supported") 

600 if newConfig.python: 

601 raise ValueError("Setting parameters section using python block in unsupported") 

602 return 

603 if label not in self._pipelineIR.tasks: 

604 raise LookupError(f"There are no tasks labeled '{label}' in the pipeline") 

605 self._pipelineIR.tasks[label].add_or_update_config(newConfig) 

606 

607 def toFile(self, filename: str) -> None: 

608 self._pipelineIR.to_file(filename) 

609 

610 def write_to_uri(self, uri: ResourcePathExpression) -> None: 

611 """Write the pipeline to a file or directory. 

612 

613 Parameters 

614 ---------- 

615 uri : convertible to `ResourcePath` 

616 URI to write to; may have any scheme with `ResourcePath` write 

617 support or no scheme for a local file/directory. Should have a 

618 ``.yaml`` extension. 

619 """ 

620 labels = [td.label for td in self._toExpandedPipelineImpl(checkContracts=False)] 

621 self._pipelineIR.reorder_tasks(labels) 

622 self._pipelineIR.write_to_uri(uri) 

623 

624 def toExpandedPipeline(self) -> Generator[TaskDef, None, None]: 

625 """Returns a generator of TaskDefs which can be used to create quantum 

626 graphs. 

627 

628 Returns 

629 ------- 

630 generator : generator of `TaskDef` 

631 The generator returned will be the sorted iterator of tasks which 

632 are to be used in constructing a quantum graph. 

633 

634 Raises 

635 ------ 

636 NotImplementedError 

637 If a dataId is supplied in a config block. This is in place for 

638 future use 

639 """ 

640 yield from self._toExpandedPipelineImpl() 

641 

642 def _toExpandedPipelineImpl(self, checkContracts: bool = True) -> Iterable[TaskDef]: 

643 taskDefs = [] 

644 for label in self._pipelineIR.tasks: 

645 taskDefs.append(self._buildTaskDef(label)) 

646 

647 # let's evaluate the contracts 

648 if self._pipelineIR.contracts is not None: 

649 label_to_config = {x.label: x.config for x in taskDefs} 

650 for contract in self._pipelineIR.contracts: 

651 # execute this in its own line so it can raise a good error 

652 # message if there were problems with the eval 

653 success = eval(contract.contract, None, label_to_config) 

654 if not success: 

655 extra_info = f": {contract.msg}" if contract.msg is not None else "" 

656 raise pipelineIR.ContractError( 

657 f"Contract(s) '{contract.contract}' were not satisfied{extra_info}" 

658 ) 

659 

660 taskDefs = sorted(taskDefs, key=lambda x: x.label) 

661 yield from pipeTools.orderPipeline(taskDefs) 

662 

663 def _buildTaskDef(self, label: str) -> TaskDef: 

664 if (taskIR := self._pipelineIR.tasks.get(label)) is None: 

665 raise NameError(f"Label {label} does not appear in this pipeline") 

666 taskClass: Type[PipelineTask] = doImportType(taskIR.klass) 

667 taskName = taskClass.__qualname__ 

668 config = taskClass.ConfigClass() 

669 overrides = ConfigOverrides() 

670 if self._pipelineIR.instrument is not None: 

671 overrides.addInstrumentOverride(self._pipelineIR.instrument, taskClass._DefaultName) 

672 if taskIR.config is not None: 

673 for configIR in (configIr.formatted(self._pipelineIR.parameters) for configIr in taskIR.config): 

674 if configIR.dataId is not None: 

675 raise NotImplementedError( 

676 "Specializing a config on a partial data id is not yet " 

677 "supported in Pipeline definition" 

678 ) 

679 # only apply override if it applies to everything 

680 if configIR.dataId is None: 

681 if configIR.file: 

682 for configFile in configIR.file: 

683 overrides.addFileOverride(os.path.expandvars(configFile)) 

684 if configIR.python is not None: 

685 overrides.addPythonOverride(configIR.python) 

686 for key, value in configIR.rest.items(): 

687 overrides.addValueOverride(key, value) 

688 overrides.applyTo(config) 

689 return TaskDef(taskName=taskName, config=config, taskClass=taskClass, label=label) 

690 

691 def __iter__(self) -> Generator[TaskDef, None, None]: 

692 return self.toExpandedPipeline() 

693 

694 def __getitem__(self, item: str) -> TaskDef: 

695 return self._buildTaskDef(item) 

696 

697 def __len__(self) -> int: 

698 return len(self._pipelineIR.tasks) 

699 

700 def __eq__(self, other: object) -> bool: 

701 if not isinstance(other, Pipeline): 

702 return False 

703 return self._pipelineIR == other._pipelineIR 

704 

705 

706@dataclass(frozen=True) 

707class TaskDatasetTypes: 

708 """An immutable struct that extracts and classifies the dataset types used 

709 by a `PipelineTask` 

710 """ 

711 

712 initInputs: NamedValueSet[DatasetType] 

713 """Dataset types that are needed as inputs in order to construct this Task. 

714 

715 Task-level `initInputs` may be classified as either 

716 `~PipelineDatasetTypes.initInputs` or 

717 `~PipelineDatasetTypes.initIntermediates` at the Pipeline level. 

718 """ 

719 

720 initOutputs: NamedValueSet[DatasetType] 

721 """Dataset types that may be written after constructing this Task. 

722 

723 Task-level `initOutputs` may be classified as either 

724 `~PipelineDatasetTypes.initOutputs` or 

725 `~PipelineDatasetTypes.initIntermediates` at the Pipeline level. 

726 """ 

727 

728 inputs: NamedValueSet[DatasetType] 

729 """Dataset types that are regular inputs to this Task. 

730 

731 If an input dataset needed for a Quantum cannot be found in the input 

732 collection(s) or produced by another Task in the Pipeline, that Quantum 

733 (and all dependent Quanta) will not be produced. 

734 

735 Task-level `inputs` may be classified as either 

736 `~PipelineDatasetTypes.inputs` or `~PipelineDatasetTypes.intermediates` 

737 at the Pipeline level. 

738 """ 

739 

740 prerequisites: NamedValueSet[DatasetType] 

741 """Dataset types that are prerequisite inputs to this Task. 

742 

743 Prerequisite inputs must exist in the input collection(s) before the 

744 pipeline is run, but do not constrain the graph - if a prerequisite is 

745 missing for a Quantum, `PrerequisiteMissingError` is raised. 

746 

747 Prerequisite inputs are not resolved until the second stage of 

748 QuantumGraph generation. 

749 """ 

750 

751 outputs: NamedValueSet[DatasetType] 

752 """Dataset types that are produced by this Task. 

753 

754 Task-level `outputs` may be classified as either 

755 `~PipelineDatasetTypes.outputs` or `~PipelineDatasetTypes.intermediates` 

756 at the Pipeline level. 

757 """ 

758 

759 @classmethod 

760 def fromTaskDef( 

761 cls, 

762 taskDef: TaskDef, 

763 *, 

764 registry: Registry, 

765 include_configs: bool = True, 

766 storage_class_mapping: Optional[Mapping[str, str]] = None, 

767 ) -> TaskDatasetTypes: 

768 """Extract and classify the dataset types from a single `PipelineTask`. 

769 

770 Parameters 

771 ---------- 

772 taskDef: `TaskDef` 

773 An instance of a `TaskDef` class for a particular `PipelineTask`. 

774 registry: `Registry` 

775 Registry used to construct normalized `DatasetType` objects and 

776 retrieve those that are incomplete. 

777 include_configs : `bool`, optional 

778 If `True` (default) include config dataset types as 

779 ``initOutputs``. 

780 storage_class_mapping : `Mapping` of `str` to `StorageClass`, optional 

781 If a taskdef contains a component dataset type that is unknown 

782 to the registry, its parent StorageClass will be looked up in this 

783 mapping if it is supplied. If the mapping does not contain the 

784 composite dataset type, or the mapping is not supplied, an exception 

785 will be raised. 

786 

787 Returns 

788 ------- 

789 types: `TaskDatasetTypes` 

790 The dataset types used by this task. 

791 

792 Raises 

793 ------ 

794 ValueError 

795 Raised if dataset type connection definition differs from 

796 registry definition. 

797 LookupError 

798 Raised if component parent StorageClass could not be determined 

799 and storage_class_mapping does not contain the composite type, or 

800 is set to None. 

801 """ 

802 

803 def makeDatasetTypesSet( 

804 connectionType: str, 

805 is_input: bool, 

806 freeze: bool = True, 

807 ) -> NamedValueSet[DatasetType]: 

808 """Constructs a set of true `DatasetType` objects 

809 

810 Parameters 

811 ---------- 

812 connectionType : `str` 

813 Name of the connection type to produce a set for, corresponds 

814 to an attribute of type `list` on the connection class instance 

815 is_input : `bool` 

816 If `True`, these are input dataset types, else they are output 

817 dataset types. 

818 freeze : `bool`, optional 

819 If `True`, call `NamedValueSet.freeze` on the object returned. 

820 

821 Returns 

822 ------- 

823 datasetTypes : `NamedValueSet` 

824 A set of all datasetTypes which correspond to the input 

825 connection type specified in the connection class of this 

826 `PipelineTask` 

827 

828 Raises 

829 ------ 

830 ValueError 

831 Raised if dataset type connection definition differs from 

832 registry definition. 

833 LookupError 

834 Raised if component parent StorageClass could not be determined 

835 and storage_class_mapping does not contain the composite type, 

836 or is set to None. 

837 

838 Notes 

839 ----- 

840 This function is a closure over the variables ``registry``, 

841 ``taskDef``, and ``storage_class_mapping``. 

842 """ 

843 datasetTypes = NamedValueSet[DatasetType]() 

844 for c in iterConnections(taskDef.connections, connectionType): 

845 dimensions = set(getattr(c, "dimensions", set())) 

846 if "skypix" in dimensions: 

847 try: 

848 datasetType = registry.getDatasetType(c.name) 

849 except LookupError as err: 

850 raise LookupError( 

851 f"DatasetType '{c.name}' referenced by " 

852 f"{type(taskDef.connections).__name__} uses 'skypix' as a dimension " 

853 f"placeholder, but does not already exist in the registry. " 

854 f"Note that reference catalog names are now used as the dataset " 

855 f"type name instead of 'ref_cat'." 

856 ) from err 

857 rest1 = set(registry.dimensions.extract(dimensions - set(["skypix"])).names) 

858 rest2 = set( 

859 dim.name for dim in datasetType.dimensions if not isinstance(dim, SkyPixDimension) 

860 ) 

861 if rest1 != rest2: 

862 raise ValueError( 

863 f"Non-skypix dimensions for dataset type {c.name} declared in " 

864 f"connections ({rest1}) are inconsistent with those in " 

865 f"registry's version of this dataset ({rest2})." 

866 ) 

867 else: 

868 # Component dataset types are not explicitly in the 

869 # registry. This complicates consistency checks with 

870 # registry and requires we work out the composite storage 

871 # class. 

872 registryDatasetType = None 

873 try: 

874 registryDatasetType = registry.getDatasetType(c.name) 

875 except KeyError: 

876 compositeName, componentName = DatasetType.splitDatasetTypeName(c.name) 

877 if componentName: 

878 if storage_class_mapping is None or compositeName not in storage_class_mapping: 

879 raise LookupError( 

880 "Component parent class cannot be determined, and " 

881 "composite name was not in storage class mapping, or no " 

882 "storage_class_mapping was supplied" 

883 ) 

884 else: 

885 parentStorageClass = storage_class_mapping[compositeName] 

886 else: 

887 parentStorageClass = None 

888 datasetType = c.makeDatasetType( 

889 registry.dimensions, parentStorageClass=parentStorageClass 

890 ) 

891 registryDatasetType = datasetType 

892 else: 

893 datasetType = c.makeDatasetType( 

894 registry.dimensions, parentStorageClass=registryDatasetType.parentStorageClass 

895 ) 

896 

897 if registryDatasetType and datasetType != registryDatasetType: 

898 # The dataset types differ but first check to see if 

899 # they are compatible before raising. 

900 if is_input: 

901 # This DatasetType must be compatible on get. 

902 is_compatible = datasetType.is_compatible_with(registryDatasetType) 

903 else: 

904 # Has to be able to be converted to the expected type 

905 # on put. 

906 is_compatible = registryDatasetType.is_compatible_with(datasetType) 

907 if is_compatible: 

908 # For inputs we want the pipeline to use the 

909 # pipeline definition, for outputs it should use 

910 # the registry definition. 

911 if not is_input: 

912 datasetType = registryDatasetType 

913 _LOG.debug( 

914 "Dataset types differ (task %s != registry %s) but are compatible" 

915 " for %s in %s.", 

916 datasetType, 

917 registryDatasetType, 

918 "input" if is_input else "output", 

919 taskDef.label, 

920 ) 

921 else: 

922 try: 

923 # Explicitly check for storage class just to 

924 # make more specific message. 

925 _ = datasetType.storageClass 

926 except KeyError: 

927 raise ValueError( 

928 "Storage class does not exist for supplied dataset type " 

929 f"{datasetType} for {taskDef.label}." 

930 ) from None 

931 raise ValueError( 

932 f"Supplied dataset type ({datasetType}) inconsistent with " 

933 f"registry definition ({registryDatasetType}) " 

934 f"for {taskDef.label}." 

935 ) 

936 datasetTypes.add(datasetType) 

937 if freeze: 

938 datasetTypes.freeze() 

939 return datasetTypes 

940 

941 # optionally add initOutput dataset for config 

942 initOutputs = makeDatasetTypesSet("initOutputs", is_input=False, freeze=False) 

943 if include_configs: 

944 initOutputs.add( 

945 DatasetType( 

946 taskDef.configDatasetName, 

947 registry.dimensions.empty, 

948 storageClass="Config", 

949 ) 

950 ) 

951 initOutputs.freeze() 

952 

953 # optionally add output dataset for metadata 

954 outputs = makeDatasetTypesSet("outputs", is_input=False, freeze=False) 

955 if taskDef.metadataDatasetName is not None: 

956 # Metadata is supposed to be of the TaskMetadata type, its 

957 # dimensions correspond to a task quantum. 

958 dimensions = registry.dimensions.extract(taskDef.connections.dimensions) 

959 

960 # Allow the storage class definition to be read from the existing 

961 # dataset type definition if present. 

962 try: 

963 current = registry.getDatasetType(taskDef.metadataDatasetName) 

964 except KeyError: 

965 # No previous definition so use the default. 

966 storageClass = "TaskMetadata" if _TASK_METADATA_TYPE is TaskMetadata else "PropertySet" 

967 else: 

968 storageClass = current.storageClass.name 

969 

970 outputs.update({DatasetType(taskDef.metadataDatasetName, dimensions, storageClass)}) 

971 if taskDef.logOutputDatasetName is not None: 

972 # Log output dimensions correspond to a task quantum. 

973 dimensions = registry.dimensions.extract(taskDef.connections.dimensions) 

974 outputs.update({DatasetType(taskDef.logOutputDatasetName, dimensions, "ButlerLogRecords")}) 

975 

976 outputs.freeze() 

977 

978 return cls( 

979 initInputs=makeDatasetTypesSet("initInputs", is_input=True), 

980 initOutputs=initOutputs, 

981 inputs=makeDatasetTypesSet("inputs", is_input=True), 

982 prerequisites=makeDatasetTypesSet("prerequisiteInputs", is_input=True), 

983 outputs=outputs, 

984 ) 

985 

986 

987@dataclass(frozen=True) 

988class PipelineDatasetTypes: 

989 """An immutable struct that classifies the dataset types used in a 

990 `Pipeline`. 

991 """ 

992 

993 packagesDatasetName: ClassVar[str] = "packages" 

994 """Name of a dataset type used to save package versions. 

995 """ 

996 

997 initInputs: NamedValueSet[DatasetType] 

998 """Dataset types that are needed as inputs in order to construct the Tasks 

999 in this Pipeline. 

1000 

1001 This does not include dataset types that are produced when constructing 

1002 other Tasks in the Pipeline (these are classified as `initIntermediates`). 

1003 """ 

1004 

1005 initOutputs: NamedValueSet[DatasetType] 

1006 """Dataset types that may be written after constructing the Tasks in this 

1007 Pipeline. 

1008 

1009 This does not include dataset types that are also used as inputs when 

1010 constructing other Tasks in the Pipeline (these are classified as 

1011 `initIntermediates`). 

1012 """ 

1013 

1014 initIntermediates: NamedValueSet[DatasetType] 

1015 """Dataset types that are both used when constructing one or more Tasks 

1016 in the Pipeline and produced as a side-effect of constructing another 

1017 Task in the Pipeline. 

1018 """ 

1019 

1020 inputs: NamedValueSet[DatasetType] 

1021 """Dataset types that are regular inputs for the full pipeline. 

1022 

1023 If an input dataset needed for a Quantum cannot be found in the input 

1024 collection(s), that Quantum (and all dependent Quanta) will not be 

1025 produced. 

1026 """ 

1027 

1028 prerequisites: NamedValueSet[DatasetType] 

1029 """Dataset types that are prerequisite inputs for the full Pipeline. 

1030 

1031 Prerequisite inputs must exist in the input collection(s) before the 

1032 pipeline is run, but do not constrain the graph - if a prerequisite is 

1033 missing for a Quantum, `PrerequisiteMissingError` is raised. 

1034 

1035 Prerequisite inputs are not resolved until the second stage of 

1036 QuantumGraph generation. 

1037 """ 

1038 

1039 intermediates: NamedValueSet[DatasetType] 

1040 """Dataset types that are output by one Task in the Pipeline and consumed 

1041 as inputs by one or more other Tasks in the Pipeline. 

1042 """ 

1043 

1044 outputs: NamedValueSet[DatasetType] 

1045 """Dataset types that are output by a Task in the Pipeline and not consumed 

1046 by any other Task in the Pipeline. 

1047 """ 

1048 

1049 byTask: Mapping[str, TaskDatasetTypes] 

1050 """Per-Task dataset types, keyed by label in the `Pipeline`. 

1051 

1052 This is guaranteed to be zip-iterable with the `Pipeline` itself (assuming 

1053 neither has been modified since the dataset types were extracted, of 

1054 course). 

1055 """ 

1056 

1057 @classmethod 

1058 def fromPipeline( 

1059 cls, 

1060 pipeline: Union[Pipeline, Iterable[TaskDef]], 

1061 *, 

1062 registry: Registry, 

1063 include_configs: bool = True, 

1064 include_packages: bool = True, 

1065 ) -> PipelineDatasetTypes: 

1066 """Extract and classify the dataset types from all tasks in a 

1067 `Pipeline`. 

1068 

1069 Parameters 

1070 ---------- 

1071 pipeline: `Pipeline` or `Iterable` [ `TaskDef` ] 

1072 A collection of tasks that can be run together. 

1073 registry: `Registry` 

1074 Registry used to construct normalized `DatasetType` objects and 

1075 retrieve those that are incomplete. 

1076 include_configs : `bool`, optional 

1077 If `True` (default) include config dataset types as 

1078 ``initOutputs``. 

1079 include_packages : `bool`, optional 

1080 If `True` (default) include the dataset type for software package 

1081 versions in ``initOutputs``. 

1082 

1083 Returns 

1084 ------- 

1085 types: `PipelineDatasetTypes` 

1086 The dataset types used by this `Pipeline`. 

1087 

1088 Raises 

1089 ------ 

1090 ValueError 

1091 Raised if Tasks are inconsistent about which datasets are marked 

1092 prerequisite. This indicates that the Tasks cannot be run as part 

1093 of the same `Pipeline`. 

1094 """ 

1095 allInputs = NamedValueSet[DatasetType]() 

1096 allOutputs = NamedValueSet[DatasetType]() 

1097 allInitInputs = NamedValueSet[DatasetType]() 

1098 allInitOutputs = NamedValueSet[DatasetType]() 

1099 prerequisites = NamedValueSet[DatasetType]() 

1100 byTask = dict() 

1101 if include_packages: 

1102 allInitOutputs.add( 

1103 DatasetType( 

1104 cls.packagesDatasetName, 

1105 registry.dimensions.empty, 

1106 storageClass="Packages", 

1107 ) 

1108 ) 

1109 # create a list of TaskDefs in case the input is a generator 

1110 pipeline = list(pipeline) 

1111 

1112 # collect all the output dataset types 

1113 typeStorageclassMap: Dict[str, str] = {} 

1114 for taskDef in pipeline: 

1115 for outConnection in iterConnections(taskDef.connections, "outputs"): 

1116 typeStorageclassMap[outConnection.name] = outConnection.storageClass 

1117 

1118 for taskDef in pipeline: 

1119 thisTask = TaskDatasetTypes.fromTaskDef( 

1120 taskDef, 

1121 registry=registry, 

1122 include_configs=include_configs, 

1123 storage_class_mapping=typeStorageclassMap, 

1124 ) 

1125 allInitInputs.update(thisTask.initInputs) 

1126 allInitOutputs.update(thisTask.initOutputs) 

1127 allInputs.update(thisTask.inputs) 

1128 prerequisites.update(thisTask.prerequisites) 

1129 allOutputs.update(thisTask.outputs) 

1130 byTask[taskDef.label] = thisTask 

1131 if not prerequisites.isdisjoint(allInputs): 

1132 raise ValueError( 

1133 "{} marked as both prerequisites and regular inputs".format( 

1134 {dt.name for dt in allInputs & prerequisites} 

1135 ) 

1136 ) 

1137 if not prerequisites.isdisjoint(allOutputs): 

1138 raise ValueError( 

1139 "{} marked as both prerequisites and outputs".format( 

1140 {dt.name for dt in allOutputs & prerequisites} 

1141 ) 

1142 ) 

1143 # Make sure that components which are marked as inputs get treated as 

1144 # intermediates if there is an output which produces the composite 

1145 # containing the component 

1146 intermediateComponents = NamedValueSet[DatasetType]() 

1147 intermediateComposites = NamedValueSet[DatasetType]() 

1148 outputNameMapping = {dsType.name: dsType for dsType in allOutputs} 

1149 for dsType in allInputs: 

1150 # get the name of a possible component 

1151 name, component = dsType.nameAndComponent() 

1152 # if there is a component name, that means this is a component 

1153 # DatasetType, if there is an output which produces the parent of 

1154 # this component, treat this input as an intermediate 

1155 if component is not None: 

1156 # This needs to be in this if block, because someone might have 

1157 # a composite that is a pure input from existing data 

1158 if name in outputNameMapping: 

1159 intermediateComponents.add(dsType) 

1160 intermediateComposites.add(outputNameMapping[name]) 

1161 

1162 def checkConsistency(a: NamedValueSet, b: NamedValueSet) -> None: 

1163 common = a.names & b.names 

1164 for name in common: 

1165 # Any compatibility is allowed. This function does not know 

1166 # if a dataset type is to be used for input or output. 

1167 if not (a[name].is_compatible_with(b[name]) or b[name].is_compatible_with(a[name])): 

1168 raise ValueError(f"Conflicting definitions for dataset type: {a[name]} != {b[name]}.") 

1169 

1170 checkConsistency(allInitInputs, allInitOutputs) 

1171 checkConsistency(allInputs, allOutputs) 

1172 checkConsistency(allInputs, intermediateComposites) 

1173 checkConsistency(allOutputs, intermediateComposites) 

1174 

1175 def frozen(s: AbstractSet[DatasetType]) -> NamedValueSet[DatasetType]: 

1176 assert isinstance(s, NamedValueSet) 

1177 s.freeze() 

1178 return s 

1179 

1180 return cls( 

1181 initInputs=frozen(allInitInputs - allInitOutputs), 

1182 initIntermediates=frozen(allInitInputs & allInitOutputs), 

1183 initOutputs=frozen(allInitOutputs - allInitInputs), 

1184 inputs=frozen(allInputs - allOutputs - intermediateComponents), 

1185 # If there are storage class differences in inputs and outputs 

1186 # the intermediates have to choose priority. Here choose that 

1187 # inputs to tasks must match the requested storage class by 

1188 # applying the inputs over the top of the outputs. 

1189 intermediates=frozen(allOutputs & allInputs | intermediateComponents), 

1190 outputs=frozen(allOutputs - allInputs - intermediateComposites), 

1191 prerequisites=frozen(prerequisites), 

1192 byTask=MappingProxyType(byTask), # MappingProxyType -> frozen view of dict for immutability 

1193 ) 

1194 
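    # Illustrative sketch (not part of the original module): classifying the
    # dataset types of a pipeline against a butler registry. The repository
    # path and task label are hypothetical.
    #
    #     from lsst.daf.butler import Butler
    #     butler = Butler("/path/to/repo")
    #     dataset_types = PipelineDatasetTypes.fromPipeline(pipeline, registry=butler.registry)
    #     dataset_types.inputs           # overall inputs of the pipeline
    #     dataset_types.intermediates    # produced and consumed within the pipeline
    #     dataset_types.byTask["isr"]    # TaskDatasetTypes for one task label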

1195 @classmethod 

1196 def initOutputNames( 

1197 cls, 

1198 pipeline: Union[Pipeline, Iterable[TaskDef]], 

1199 *, 

1200 include_configs: bool = True, 

1201 include_packages: bool = True, 

1202 ) -> Iterator[str]: 

1203 """Return the names of dataset types ot task initOutputs, Configs, 

1204 and package versions for a pipeline. 

1205 

1206 Parameters 

1207 ---------- 

1208 pipeline: `Pipeline` or `Iterable` [ `TaskDef` ] 

1209 A `Pipeline` instance or collection of `TaskDef` instances. 

1210 include_configs : `bool`, optional 

1211 If `True` (default) include config dataset types. 

1212 include_packages : `bool`, optional 

1213 If `True` (default) include the dataset type for package versions. 

1214 

1215 Yields 

1216 ------ 

1217 datasetTypeName : `str` 

1218 Name of the dataset type. 

1219 """ 

1220 if include_packages: 

1221 # Package versions dataset type 

1222 yield cls.packagesDatasetName 

1223 

1224 if isinstance(pipeline, Pipeline): 

1225 pipeline = pipeline.toExpandedPipeline() 

1226 

1227 for taskDef in pipeline: 

1228 

1229 # all task InitOutputs 

1230 for name in taskDef.connections.initOutputs: 

1231 attribute = getattr(taskDef.connections, name) 

1232 yield attribute.name 

1233 

1234 # config dataset name 

1235 if include_configs: 

1236 yield taskDef.configDatasetName
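
# Illustrative sketch (not part of the original module): collecting the init
# output dataset type names (software packages, task initOutputs, and per-task
# configs) for a loaded pipeline.
#
#     names = list(PipelineDatasetTypes.initOutputNames(pipeline))
#     # e.g. ["packages", "isr_config", ...] for a hypothetical "isr" task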