Coverage for python/lsst/pipe/base/pipeline.py: 18%

1# This file is part of pipe_base. 

2# 

3# Developed for the LSST Data Management System. 

4# This product includes software developed by the LSST Project 

5# (http://www.lsst.org). 

6# See the COPYRIGHT file at the top-level directory of this distribution 

7# for details of code ownership. 

8# 

9# This program is free software: you can redistribute it and/or modify 

10# it under the terms of the GNU General Public License as published by 

11# the Free Software Foundation, either version 3 of the License, or 

12# (at your option) any later version. 

13# 

14# This program is distributed in the hope that it will be useful, 

15# but WITHOUT ANY WARRANTY; without even the implied warranty of 

16# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the 

17# GNU General Public License for more details. 

18# 

19# You should have received a copy of the GNU General Public License 

20# along with this program. If not, see <http://www.gnu.org/licenses/>. 

21from __future__ import annotations 

22 

23"""Module defining Pipeline class and related methods. 

24""" 

25 

26__all__ = ["Pipeline", "TaskDef", "TaskDatasetTypes", "PipelineDatasetTypes", "LabelSpecifier"] 

27 

28import copy 

29import logging 

30import os 

31import re 

32import urllib.parse 

33import warnings 

34 

35# ------------------------------- 

36# Imports of standard modules -- 

37# ------------------------------- 

38from dataclasses import dataclass 

39from types import MappingProxyType 

40from typing import ( 

41 TYPE_CHECKING, 

42 ClassVar, 

43 Dict, 

44 Generator, 

45 Iterable, 

46 Iterator, 

47 Mapping, 

48 Optional, 

49 Set, 

50 Tuple, 

51 Union, 

52) 

53 

54# ----------------------------- 

55# Imports for other modules -- 

56from lsst.daf.butler import ButlerURI, DatasetType, NamedValueSet, Registry, SkyPixDimension 

57from lsst.utils import doImport 

58 

59from . import pipelineIR, pipeTools 

60from ._task_metadata import TaskMetadata 

61from .configOverrides import ConfigOverrides 

62from .connections import iterConnections 

63from .pipelineTask import PipelineTask 

64from .task import _TASK_METADATA_TYPE 

65 

66if TYPE_CHECKING: # Imports needed only for type annotations; may be circular. 

67 from lsst.obs.base import Instrument 

68 

69# ---------------------------------- 

70# Local non-exported definitions -- 

71# ---------------------------------- 

72 

73_LOG = logging.getLogger(__name__) 

74 

75# ------------------------ 

76# Exported definitions -- 

77# ------------------------ 

78 

79 

80@dataclass 

81class LabelSpecifier: 

82 """A structure to specify a subset of labels to load 

83 

84 This structure may contain a set of labels to be used in subsetting a 

85 pipeline, or a beginning and end point. Beginning or end may be empty, 

86 in which case the range will be a half open interval. Unlike python 

87 iteration bounds, end bounds are *INCLUDED*. Note that range based 

88 selection is not well defined for pipelines that are not linear in nature, 

89 and correct behavior is not guaranteed, or may vary from run to run. 

90 """ 

91 

92 labels: Optional[Set[str]] = None 

93 begin: Optional[str] = None 

94 end: Optional[str] = None 

95 

96 def __post_init__(self): 

97 if self.labels is not None and (self.begin or self.end): 

98 raise ValueError( 

99 "This struct can only be initialized with a labels set or a begin (and/or) end specifier" 

100 ) 

101 

102 
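# Example (illustrative sketch, not part of the module above): the two ways a
# LabelSpecifier can be populated.  The task labels "isr" and "calibrate" are
# hypothetical placeholders.
#
#     from lsst.pipe.base.pipeline import LabelSpecifier
#
#     # An explicit set of labels to keep.
#     spec = LabelSpecifier(labels={"isr", "calibrate"})
#
#     # An inclusive begin..end range; either bound may be omitted to leave
#     # the interval half open.
#     spec = LabelSpecifier(begin="isr", end="calibrate")
#
#     # Mixing the two forms raises ValueError in __post_init__.
#     LabelSpecifier(labels={"isr"}, begin="isr")  # raises ValueError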

103class TaskDef: 

104 """TaskDef is a collection of information about a task needed by Pipeline. 

105 

106 The information includes task name, configuration object and optional 

107 task class. This class is just a collection of attributes and it exposes 

108 all of them so that attributes could potentially be modified in place 

109 (e.g. if configuration needs extra overrides). 

110 

111 Attributes 

112 ---------- 

113 taskName : `str`, optional 

114 `PipelineTask` class name, currently it is not specified whether this 

115 is a fully-qualified name or partial name (e.g. ``module.TaskClass``). 

116 Framework should be prepared to handle all cases. If not provided, 

117 ``taskClass`` must be, and ``taskClass.__name__`` is used. 

118 config : `lsst.pex.config.Config`, optional 

119 Instance of the configuration class corresponding to this task class, 

120 usually with all overrides applied. This config will be frozen. If 

121 not provided, ``taskClass`` must be provided and 

122 ``taskClass.ConfigClass()`` will be used. 

123 taskClass : `type`, optional 

124 `PipelineTask` class object, can be ``None``. If ``None`` then 

125 framework will have to locate and load class. 

126 label : `str`, optional 

127 Task label, usually a short string unique in a pipeline. If not 

128 provided, ``taskClass`` must be, and ``taskClass._DefaultName`` will 

129 be used. 

130 """ 

131 

132 def __init__(self, taskName=None, config=None, taskClass=None, label=None): 

133 if taskName is None: 

134 if taskClass is None: 

135 raise ValueError("At least one of `taskName` and `taskClass` must be provided.") 

136 taskName = taskClass.__name__ 

137 if config is None: 

138 if taskClass is None: 

139 raise ValueError("`taskClass` must be provided if `config` is not.") 

140 config = taskClass.ConfigClass() 

141 if label is None: 

142 if taskClass is None: 

143 raise ValueError("`taskClass` must be provided if `label` is not.") 

144 label = taskClass._DefaultName 

145 self.taskName = taskName 

146 try: 

147 config.validate() 

148 except Exception: 

149 _LOG.error("Configuration validation failed for task %s (%s)", label, taskName) 

150 raise 

151 config.freeze() 

152 self.config = config 

153 self.taskClass = taskClass 

154 self.label = label 

155 self.connections = config.connections.ConnectionsClass(config=config) 

156 

157 @property 

158 def configDatasetName(self) -> str: 

159 """Name of a dataset type for configuration of this task (`str`)""" 

160 return self.label + "_config" 

161 

162 @property 

163 def metadataDatasetName(self) -> Optional[str]: 

164 """Name of a dataset type for metadata of this task, `None` if 

165 metadata is not to be saved (`str`) 

166 """ 

167 if self.config.saveMetadata: 

168 return self.label + "_metadata" 

169 else: 

170 return None 

171 

172 @property 

173 def logOutputDatasetName(self) -> Optional[str]: 

174 """Name of a dataset type for log output from this task, `None` if 

175 logs are not to be saved (`str`) 

176 """ 

177 if self.config.saveLogOutput: 

178 return self.label + "_log" 

179 else: 

180 return None 

181 

182 def __str__(self): 

183 rep = "TaskDef(" + self.taskName 

184 if self.label: 

185 rep += ", label=" + self.label 

186 rep += ")" 

187 return rep 

188 

189 def __eq__(self, other: object) -> bool: 

190 if not isinstance(other, TaskDef): 

191 return False 

192 # This does not consider equality of configs when determining equality 

193 # as config equality is a difficult thing to define. Should be updated 

194 # after DM-27847 

195 return self.taskClass == other.taskClass and self.label == other.label 

196 

197 def __hash__(self): 

198 return hash((self.taskClass, self.label)) 

199 

200 
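# Example (illustrative sketch): constructing a TaskDef directly from a task
# class.  ``MyTask`` is a hypothetical PipelineTask subclass; when only
# ``taskClass`` is given, ``taskName``, ``config`` and ``label`` are derived
# from the class as described above.
#
#     from mypackage.tasks import MyTask  # hypothetical import
#
#     taskDef = TaskDef(taskClass=MyTask)
#     assert taskDef.label == MyTask._DefaultName
#     assert taskDef.configDatasetName == taskDef.label + "_config"
#
#     # Equality and hashing consider only (taskClass, label), not config.
#     assert taskDef == TaskDef(taskClass=MyTask, config=MyTask.ConfigClass())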

201class Pipeline: 

202 """A `Pipeline` is a representation of a series of tasks to run, and the 

203 configuration for those tasks. 

204 

205 Parameters 

206 ---------- 

207 description : `str` 

208 A description of what this pipeline does. 

209 """ 

210 

211 def __init__(self, description: str): 

212 pipeline_dict = {"description": description, "tasks": {}} 

213 self._pipelineIR = pipelineIR.PipelineIR(pipeline_dict) 

214 

215 @classmethod 

216 def fromFile(cls, filename: str) -> Pipeline: 

217 """Load a pipeline defined in a pipeline yaml file. 

218 

219 Parameters 

220 ---------- 

221 filename: `str` 

222 A path that points to a pipeline defined in yaml format. This 

223 filename may also supply additional labels to be used in 

224 subsetting the loaded Pipeline. These labels are separated from 

225 the path by a \\#, and may be specified as a comma separated 

226 list, or a range denoted as beginning..end. Beginning or end may 

227 be empty, in which case the range will be a half open interval. 

228 Unlike python iteration bounds, end bounds are *INCLUDED*. Note 

229 that range based selection is not well defined for pipelines that 

230 are not linear in nature, and correct behavior is not guaranteed, 

231 or may vary from run to run. 

232 

233 Returns 

234 ------- 

235 pipeline: `Pipeline` 

236 The pipeline loaded from specified location with appropriate (if 

237 any) subsetting 

238 

239 Notes 

240 ----- 

241 This method attempts to prune any contracts that contain labels which 

242 are not in the declared subset of labels. This pruning is done using 

243 string-based matching due to the nature of contracts and may prune more 

244 than it should. 

245 """ 

246 return cls.from_uri(filename) 

247 

248 @classmethod 

249 def from_uri(cls, uri: Union[str, ButlerURI]) -> Pipeline: 

250 """Load a pipeline defined in a pipeline yaml file at a location 

251 specified by a URI. 

252 

253 Parameters 

254 ---------- 

255 uri: `str` or `ButlerURI` 

256 If a string is supplied this should be a URI path that points to a 

257 pipeline defined in yaml format. This uri may also supply 

258 additional labels to be used in subsetting the loaded Pipeline. 

259 These labels are separated from the path by a \\#, and may be 

260 specified as a comma separated list, or a range denoted as 

261 beginning..end. Beginning or end may be empty, in which case the 

262 range will be a half open interval. Unlike python iteration 

263 bounds, end bounds are *INCLUDED*. Note that range based selection 

264 is not well defined for pipelines that are not linear in nature, 

265 and correct behavior is not guaranteed, or may vary from run to 

266 run. The same specifiers can be used with a ButlerURI object, by 

267 being the sole contents in the fragments attribute. 

268 

269 Returns 

270 ------- 

271 pipeline: `Pipeline` 

272 The pipeline loaded from specified location with appropriate (if 

273 any) subsetting 

274 

275 Notes 

276 ----- 

277 This method attempts to prune any contracts that contain labels which 

278 are not in the declared subset of labels. This pruning is done using 

279 string-based matching due to the nature of contracts and may prune more 

280 than it should. 

281 """ 

282 # Split up the uri and any labels that were supplied 

283 uri, label_specifier = cls._parse_file_specifier(uri) 

284 pipeline: Pipeline = cls.fromIR(pipelineIR.PipelineIR.from_uri(uri)) 

285 

286 # If there are labels supplied, only keep those 

287 if label_specifier is not None: 

288 pipeline = pipeline.subsetFromLabels(label_specifier) 

289 return pipeline 

290 
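# Example (illustrative sketch): loading a pipeline with a label subset
# encoded in the URI fragment.  The file path and labels are hypothetical.
#
#     from lsst.pipe.base import Pipeline
#
#     # Keep only two explicitly named labels.
#     pipeline = Pipeline.from_uri("pipelines/DRP.yaml#isr,calibrate")
#
#     # Keep an inclusive range of labels; the end bound is INCLUDED.
#     pipeline = Pipeline.from_uri("pipelines/DRP.yaml#isr..calibrate")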

291 def subsetFromLabels(self, labelSpecifier: LabelSpecifier) -> Pipeline: 

292 """Subset a pipeline to contain only labels specified in labelSpecifier 

293 

294 Parameters 

295 ---------- 

296 labelSpecifier : `LabelSpecifier` 

297 Object containing labels that describes how to subset a pipeline. 

298 

299 Returns 

300 ------- 

301 pipeline : `Pipeline` 

302 A new pipeline object that is a subset of the old pipeline 

303 

304 Raises 

305 ------ 

306 ValueError 

307 Raised if there is an issue with specified labels 

308 

309 Notes 

310 ----- 

311 This method attempts to prune any contracts that contain labels which 

312 are not in the declared subset of labels. This pruning is done using 

313 string-based matching due to the nature of contracts and may prune more 

314 than it should. 

315 """ 

316 # Labels supplied as a set 

317 if labelSpecifier.labels: 

318 labelSet = labelSpecifier.labels 

319 # Labels supplied as a range, first create a list of all the labels 

320 # in the pipeline sorted according to task dependency. Then only 

321 # keep labels that lie between the supplied bounds 

322 else: 

323 # Create a copy of the pipeline to use when assessing the label 

324 # ordering. Use a dict for fast searching while preserving order. 

325 # Remove contracts so they do not fail in the expansion step. This 

326 # is needed because a user may only configure the tasks they intend 

327 # to run, which may cause some contracts to fail if they will later 

328 # be dropped 

329 pipeline = copy.deepcopy(self) 

330 pipeline._pipelineIR.contracts = [] 

331 labels = {taskdef.label: True for taskdef in pipeline.toExpandedPipeline()} 

332 

333 # Verify the bounds are in the labels 

334 if labelSpecifier.begin is not None: 

335 if labelSpecifier.begin not in labels: 

336 raise ValueError( 

337 f"Beginning of range subset, {labelSpecifier.begin}, not found in " 

338 "pipeline definition" 

339 ) 

340 if labelSpecifier.end is not None: 

341 if labelSpecifier.end not in labels: 

342 raise ValueError( 

343 f"End of range subset, {labelSpecifier.end}, not found in pipeline definition" 

344 ) 

345 

346 labelSet = set() 

347 for label in labels: 

348 if labelSpecifier.begin is not None: 

349 if label != labelSpecifier.begin: 

350 continue 

351 else: 

352 labelSpecifier.begin = None 

353 labelSet.add(label) 

354 if labelSpecifier.end is not None and label == labelSpecifier.end: 

355 break 

356 return Pipeline.fromIR(self._pipelineIR.subset_from_labels(labelSet)) 

357 
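# Example (illustrative sketch): the programmatic equivalent of the URI
# fragment forms above, using hypothetical labels.
#
#     subset = pipeline.subsetFromLabels(LabelSpecifier(labels={"isr"}))
#     subset = pipeline.subsetFromLabels(LabelSpecifier(begin="isr", end="calibrate"))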

358 @staticmethod 

359 def _parse_file_specifier(uri: Union[str, ButlerURI]) -> Tuple[ButlerURI, Optional[LabelSpecifier]]: 

360 """Split apart a URI and any possible label subsets.""" 

361 if isinstance(uri, str): 

362 # This is to support legacy pipelines during transition 

363 uri, num_replace = re.subn("[:](?!\\/\\/)", "#", uri) 

364 if num_replace: 

365 warnings.warn( 

366 f"The pipeline file {uri} seems to use the legacy : to separate " 

367 "labels, this is deprecated and will be removed after June 2021, please use " 

368 "# instead.", 

369 category=FutureWarning, 

370 ) 

371 if uri.count("#") > 1: 

372 raise ValueError("Only one set of labels is allowed when specifying a pipeline to load") 

373 uri = ButlerURI(uri) 

374 label_subset = uri.fragment or None 

375 

376 specifier: Optional[LabelSpecifier] 

377 if label_subset is not None: 

378 label_subset = urllib.parse.unquote(label_subset) 

379 args: Dict[str, Union[Set[str], str, None]] 

380 # labels supplied as a list 

381 if "," in label_subset: 

382 if ".." in label_subset: 

383 raise ValueError( 

384 "Can only specify a list of labels or a range when loading a Pipeline, not both" 

385 ) 

386 args = {"labels": set(label_subset.split(","))} 

387 # labels supplied as a range 

388 elif ".." in label_subset: 

389 # Try to de-structure the labelSubset, this will fail if more 

390 # than one range is specified 

391 begin, end, *rest = label_subset.split("..") 

392 if rest: 

393 raise ValueError("Only one range can be specified when loading a pipeline") 

394 args = {"begin": begin if begin else None, "end": end if end else None} 

395 # Assume anything else is a single label 

396 else: 

397 args = {"labels": {label_subset}} 

398 

399 specifier = LabelSpecifier(**args) 

400 else: 

401 specifier = None 

402 

403 return uri, specifier 

404 

405 @classmethod 

406 def fromString(cls, pipeline_string: str) -> Pipeline: 

407 """Create a pipeline from string formatted as a pipeline document. 

408 

409 Parameters 

410 ---------- 

411 pipeline_string : `str` 

412 A string that is formatted like a pipeline document. 

413 

414 Returns 

415 ------- 

416 pipeline: `Pipeline` 

417 """ 

418 pipeline = cls.fromIR(pipelineIR.PipelineIR.from_string(pipeline_string)) 

419 return pipeline 

420 
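# Example (illustrative sketch): creating a pipeline from an in-memory YAML
# document.  The task class path is hypothetical, and the per-task ``class``
# key is assumed to follow the usual pipeline YAML layout.
#
#     pipeline = Pipeline.fromString(
#         "description: A one-task example pipeline\n"
#         "tasks:\n"
#         "  isr:\n"
#         "    class: lsst.ip.isr.IsrTask\n"
#     )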

421 @classmethod 

422 def fromIR(cls, deserialized_pipeline: pipelineIR.PipelineIR) -> Pipeline: 

423 """Create a pipeline from an already created `PipelineIR` object. 

424 

425 Parameters 

426 ---------- 

427 deserialized_pipeline: `PipelineIR` 

428 An already created pipeline intermediate representation object 

429 

430 Returns 

431 ------- 

432 pipeline: `Pipeline` 

433 """ 

434 pipeline = cls.__new__(cls) 

435 pipeline._pipelineIR = deserialized_pipeline 

436 return pipeline 

437 

438 @classmethod 

439 def fromPipeline(cls, pipeline: Pipeline) -> Pipeline: 

440 """Create a new pipeline by copying an already existing `Pipeline`. 

441 

442 Parameters 

443 ---------- 

444 pipeline: `Pipeline` 

445 The `Pipeline` object to copy. 

446 

447 Returns 

448 ------- 

449 pipeline: `Pipeline` 

450 """ 

451 return cls.fromIR(copy.deepcopy(pipeline._pipelineIR)) 

452 

453 def __str__(self) -> str: 

454 # tasks need to be sorted on each call because someone might have 

455 # added or removed a task, and caching the order does not seem worth 

456 # the small overhead 

457 labels = [td.label for td in self._toExpandedPipelineImpl(checkContracts=False)] 

458 self._pipelineIR.reorder_tasks(labels) 

459 return str(self._pipelineIR) 

460 

461 def addInstrument(self, instrument: Union[Instrument, str]) -> None: 

462 """Add an instrument to the pipeline, or replace an instrument that is 

463 already defined. 

464 

465 Parameters 

466 ---------- 

467 instrument : `~lsst.obs.base.Instrument` or `str` 

468 Either a subclass object of `lsst.obs.base.Instrument` or a string 

469 corresponding to a fully qualified 

470 `lsst.obs.base.Instrument` name. 

471 """ 

472 if isinstance(instrument, str): 

473 pass 

474 else: 

475 # TODO: assume that this is a subclass of Instrument, no type 

476 # checking 

477 instrument = f"{instrument.__module__}.{instrument.__qualname__}" 

478 self._pipelineIR.instrument = instrument 

479 

480 def getInstrument(self) -> Instrument: 

481 """Get the instrument from the pipeline. 

482 

483 Returns 

484 ------- 

485 instrument : `~lsst.obs.base.Instrument`, `str`, or None 

486 A subclass object of `lsst.obs.base.Instrument`, a string 

487 corresponding to a fully qualified `lsst.obs.base.Instrument` 

488 name, or None if the pipeline does not have an instrument. 

489 """ 

490 return self._pipelineIR.instrument 

491 

492 def addTask(self, task: Union[PipelineTask, str], label: str) -> None: 

493 """Add a new task to the pipeline, or replace a task that is already 

494 associated with the supplied label. 

495 

496 Parameters 

497 ---------- 

498 task: `PipelineTask` or `str` 

499 Either a derived class object of a `PipelineTask` or a string 

500 corresponding to a fully qualified `PipelineTask` name. 

501 label: `str` 

502 A label that is used to identify the `PipelineTask` being added 

503 """ 

504 if isinstance(task, str): 

505 taskName = task 

506 elif issubclass(task, PipelineTask): 

507 taskName = f"{task.__module__}.{task.__qualname__}" 

508 else: 

509 raise ValueError( 

510 "task must be either a child class of PipelineTask or a string containing" 

511 " a fully qualified name to one" 

512 ) 

513 if not label: 

514 # in some cases (e.g. a command-line-generated pipeline) tasks can 

515 # be defined without a label, which is not acceptable; use the task 

516 # _DefaultName in that case 

517 if isinstance(task, str): 

518 task = doImport(task) 

519 label = task._DefaultName 

520 self._pipelineIR.tasks[label] = pipelineIR.TaskIR(label, taskName) 

521 

522 def removeTask(self, label: str) -> None: 

523 """Remove a task from the pipeline. 

524 

525 Parameters 

526 ---------- 

527 label : `str` 

528 The label used to identify the task that is to be removed 

529 

530 Raises 

531 ------ 

532 KeyError 

533 If no task with that label exists in the pipeline 

534 

535 """ 

536 self._pipelineIR.tasks.pop(label) 

537 

538 def addConfigOverride(self, label: str, key: str, value: object) -> None: 

539 """Apply single config override. 

540 

541 Parameters 

542 ---------- 

543 label : `str` 

544 Label of the task. 

545 key: `str` 

546 Fully-qualified field name. 

547 value : object 

548 Value to be given to a field. 

549 """ 

550 self._addConfigImpl(label, pipelineIR.ConfigIR(rest={key: value})) 

551 

552 def addConfigFile(self, label: str, filename: str) -> None: 

553 """Add overrides from a specified file. 

554 

555 Parameters 

556 ---------- 

557 label : `str` 

558 The label used to identify the task associated with config to 

559 modify 

560 filename : `str` 

561 Path to the override file. 

562 """ 

563 self._addConfigImpl(label, pipelineIR.ConfigIR(file=[filename])) 

564 

565 def addConfigPython(self, label: str, pythonString: str) -> None: 

566 """Add Overrides by running a snippet of python code against a config. 

567 

568 Parameters 

569 ---------- 

570 label : `str` 

571 The label used to identify the task associated with config to 

572 modify. 

573 pythonString: `str` 

574 A string which is valid python code to be executed. This is done 

575 with config as the only local accessible value. 

576 """ 

577 self._addConfigImpl(label, pipelineIR.ConfigIR(python=pythonString)) 

578 
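# Example (illustrative sketch): assembling a pipeline in code and layering
# config overrides.  The task class path, config field and file names are
# hypothetical.
#
#     pipeline = Pipeline("ISR-only example")
#     pipeline.addTask("lsst.ip.isr.IsrTask", "isr")
#     pipeline.addConfigOverride("isr", "doWrite", False)
#     pipeline.addConfigFile("isr", "config/isrOverrides.py")
#     pipeline.addConfigPython("isr", "config.doWrite = False")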

579 def _addConfigImpl(self, label: str, newConfig: pipelineIR.ConfigIR) -> None: 

580 if label == "parameters": 

581 if newConfig.rest.keys() - self._pipelineIR.parameters.mapping.keys(): 

582 raise ValueError("Cannot override parameters that are not defined in pipeline") 

583 self._pipelineIR.parameters.mapping.update(newConfig.rest) 

584 if newConfig.file: 

585 raise ValueError("Setting parameters section with config file is not supported") 

586 if newConfig.python: 

587 raise ValueError("Setting parameters section using python block is unsupported") 

588 return 

589 if label not in self._pipelineIR.tasks: 

590 raise LookupError(f"There are no tasks labeled '{label}' in the pipeline") 

591 self._pipelineIR.tasks[label].add_or_update_config(newConfig) 

592 

593 def toFile(self, filename: str) -> None: 

594 self._pipelineIR.to_file(filename) 

595 

596 def write_to_uri(self, uri: Union[str, ButlerURI]) -> None: 

597 # tasks need to be sorted on each call because someone might have 

598 # added or removed a task, and caching the order does not seem worth 

599 # the small overhead 

600 labels = [td.label for td in self._toExpandedPipelineImpl(checkContracts=False)] 

601 self._pipelineIR.reorder_tasks(labels) 

602 self._pipelineIR.write_to_uri(uri) 

603 

604 def toExpandedPipeline(self) -> Generator[TaskDef, None, None]: 

605 """Returns a generator of TaskDefs which can be used to create quantum 

606 graphs. 

607 

608 Returns 

609 ------- 

610 generator : generator of `TaskDef` 

611 The generator returned will be the sorted iterator of tasks which 

612 are to be used in constructing a quantum graph. 

613 

614 Raises 

615 ------ 

616 NotImplementedError 

617 If a dataId is supplied in a config block. This is in place for 

618 future use 

619 """ 

620 yield from self._toExpandedPipelineImpl() 

621 
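# Example (illustrative sketch): iterating the expanded pipeline to get the
# fully configured TaskDefs in dependency-sorted order.
#
#     for taskDef in pipeline.toExpandedPipeline():
#         print(taskDef.label, taskDef.taskName, taskDef.configDatasetName)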

622 def _toExpandedPipelineImpl(self, checkContracts=True) -> Iterable[TaskDef]: 

623 taskDefs = [] 

624 for label in self._pipelineIR.tasks: 

625 taskDefs.append(self._buildTaskDef(label)) 

626 

627 # lets evaluate the contracts 

628 if self._pipelineIR.contracts is not None: 

629 label_to_config = {x.label: x.config for x in taskDefs} 

630 for contract in self._pipelineIR.contracts: 

631 # execute this on its own line so it can raise a good error 

632 # message if there were problems with the eval 

633 success = eval(contract.contract, None, label_to_config) 

634 if not success: 

635 extra_info = f": {contract.msg}" if contract.msg is not None else "" 

636 raise pipelineIR.ContractError( 

637 f"Contract(s) '{contract.contract}' were not satisfied{extra_info}" 

638 ) 

639 

640 taskDefs = sorted(taskDefs, key=lambda x: x.label) 

641 yield from pipeTools.orderPipeline(taskDefs) 

642 

643 def _buildTaskDef(self, label: str) -> TaskDef: 

644 if (taskIR := self._pipelineIR.tasks.get(label)) is None: 

645 raise NameError(f"Label {label} does not appear in this pipeline") 

646 taskClass = doImport(taskIR.klass) 

647 taskName = taskClass.__qualname__ 

648 config = taskClass.ConfigClass() 

649 overrides = ConfigOverrides() 

650 if self._pipelineIR.instrument is not None: 

651 overrides.addInstrumentOverride(self._pipelineIR.instrument, taskClass._DefaultName) 

652 if taskIR.config is not None: 

653 for configIR in (configIr.formatted(self._pipelineIR.parameters) for configIr in taskIR.config): 

654 if configIR.dataId is not None: 

655 raise NotImplementedError( 

656 "Specializing a config on a partial data id is not yet " 

657 "supported in Pipeline definition" 

658 ) 

659 # only apply override if it applies to everything 

660 if configIR.dataId is None: 

661 if configIR.file: 

662 for configFile in configIR.file: 

663 overrides.addFileOverride(os.path.expandvars(configFile)) 

664 if configIR.python is not None: 

665 overrides.addPythonOverride(configIR.python) 

666 for key, value in configIR.rest.items(): 

667 overrides.addValueOverride(key, value) 

668 overrides.applyTo(config) 

669 return TaskDef(taskName=taskName, config=config, taskClass=taskClass, label=label) 

670 

671 def __iter__(self) -> Generator[TaskDef, None, None]: 

672 return self.toExpandedPipeline() 

673 

674 def __getitem__(self, item: str) -> TaskDef: 

675 return self._buildTaskDef(item) 

676 

677 def __len__(self): 

678 return len(self._pipelineIR.tasks) 

679 

680 def __eq__(self, other: object): 

681 if not isinstance(other, Pipeline): 

682 return False 

683 return self._pipelineIR == other._pipelineIR 

684 

685 

686@dataclass(frozen=True) 

687class TaskDatasetTypes: 

688 """An immutable struct that extracts and classifies the dataset types used 

689 by a `PipelineTask` 

690 """ 

691 

692 initInputs: NamedValueSet[DatasetType] 

693 """Dataset types that are needed as inputs in order to construct this Task. 

694 

695 Task-level `initInputs` may be classified as either 

696 `~PipelineDatasetTypes.initInputs` or 

697 `~PipelineDatasetTypes.initIntermediates` at the Pipeline level. 

698 """ 

699 

700 initOutputs: NamedValueSet[DatasetType] 

701 """Dataset types that may be written after constructing this Task. 

702 

703 Task-level `initOutputs` may be classified as either 

704 `~PipelineDatasetTypes.initOutputs` or 

705 `~PipelineDatasetTypes.initIntermediates` at the Pipeline level. 

706 """ 

707 

708 inputs: NamedValueSet[DatasetType] 

709 """Dataset types that are regular inputs to this Task. 

710 

711 If an input dataset needed for a Quantum cannot be found in the input 

712 collection(s) or produced by another Task in the Pipeline, that Quantum 

713 (and all dependent Quanta) will not be produced. 

714 

715 Task-level `inputs` may be classified as either 

716 `~PipelineDatasetTypes.inputs` or `~PipelineDatasetTypes.intermediates` 

717 at the Pipeline level. 

718 """ 

719 

720 prerequisites: NamedValueSet[DatasetType] 

721 """Dataset types that are prerequisite inputs to this Task. 

722 

723 Prerequisite inputs must exist in the input collection(s) before the 

724 pipeline is run, but do not constrain the graph - if a prerequisite is 

725 missing for a Quantum, `PrerequisiteMissingError` is raised. 

726 

727 Prerequisite inputs are not resolved until the second stage of 

728 QuantumGraph generation. 

729 """ 

730 

731 outputs: NamedValueSet[DatasetType] 

732 """Dataset types that are produced by this Task. 

733 

734 Task-level `outputs` may be classified as either 

735 `~PipelineDatasetTypes.outputs` or `~PipelineDatasetTypes.intermediates` 

736 at the Pipeline level. 

737 """ 

738 

739 @classmethod 

740 def fromTaskDef( 

741 cls, 

742 taskDef: TaskDef, 

743 *, 

744 registry: Registry, 

745 include_configs: bool = True, 

746 storage_class_mapping: Optional[Mapping[str, str]] = None, 

747 ) -> TaskDatasetTypes: 

748 """Extract and classify the dataset types from a single `PipelineTask`. 

749 

750 Parameters 

751 ---------- 

752 taskDef: `TaskDef` 

753 An instance of a `TaskDef` class for a particular `PipelineTask`. 

754 registry: `Registry` 

755 Registry used to construct normalized `DatasetType` objects and 

756 retrieve those that are incomplete. 

757 include_configs : `bool`, optional 

758 If `True` (default) include config dataset types as 

759 ``initOutputs``. 

760 storage_class_mapping : `Mapping` of `str` to `StorageClass`, optional 

761 If a taskdef contains a component dataset type that is unknown 

762 to the registry, its parent StorageClass will be looked up in this 

763 mapping if it is supplied. If the mapping does not contain the 

764 composite dataset type, or the mapping is not supplied, an exception 

765 will be raised. 

766 

767 Returns 

768 ------- 

769 types: `TaskDatasetTypes` 

770 The dataset types used by this task. 

771 

772 Raises 

773 ------ 

774 ValueError 

775 Raised if dataset type connection definition differs from 

776 registry definition. 

777 LookupError 

778 Raised if component parent StorageClass could not be determined 

779 and storage_class_mapping does not contain the composite type, or 

780 is set to None. 

781 """ 

782 

783 def makeDatasetTypesSet(connectionType: str, freeze: bool = True) -> NamedValueSet[DatasetType]: 

784 """Constructs a set of true `DatasetType` objects 

785 

786 Parameters 

787 ---------- 

788 connectionType : `str` 

789 Name of the connection type to produce a set for, corresponds 

790 to an attribute of type `list` on the connection class instance 

791 freeze : `bool`, optional 

792 If `True`, call `NamedValueSet.freeze` on the object returned. 

793 

794 Returns 

795 ------- 

796 datasetTypes : `NamedValueSet` 

797 A set of all datasetTypes which correspond to the input 

798 connection type specified in the connection class of this 

799 `PipelineTask` 

800 

801 Raises 

802 ------ 

803 ValueError 

804 Raised if dataset type connection definition differs from 

805 registry definition. 

806 LookupError 

807 Raised if component parent StorageClass could not be determined 

808 and storage_class_mapping does not contain the composite type, 

809 or is set to None. 

810 

811 Notes 

812 ----- 

813 This function is a closure over the variables ``registry`` and 

814 ``taskDef``, and ``storage_class_mapping``. 

815 """ 

816 datasetTypes = NamedValueSet() 

817 for c in iterConnections(taskDef.connections, connectionType): 

818 dimensions = set(getattr(c, "dimensions", set())) 

819 if "skypix" in dimensions: 

820 try: 

821 datasetType = registry.getDatasetType(c.name) 

822 except LookupError as err: 

823 raise LookupError( 

824 f"DatasetType '{c.name}' referenced by " 

825 f"{type(taskDef.connections).__name__} uses 'skypix' as a dimension " 

826 f"placeholder, but does not already exist in the registry. " 

827 f"Note that reference catalog names are now used as the dataset " 

828 f"type name instead of 'ref_cat'." 

829 ) from err 

830 rest1 = set(registry.dimensions.extract(dimensions - set(["skypix"])).names) 

831 rest2 = set( 

832 dim.name for dim in datasetType.dimensions if not isinstance(dim, SkyPixDimension) 

833 ) 

834 if rest1 != rest2: 

835 raise ValueError( 

836 f"Non-skypix dimensions for dataset type {c.name} declared in " 

837 f"connections ({rest1}) are inconsistent with those in " 

838 f"registry's version of this dataset ({rest2})." 

839 ) 

840 else: 

841 # Component dataset types are not explicitly in the 

842 # registry. This complicates consistency checks with 

843 # registry and requires we work out the composite storage 

844 # class. 

845 registryDatasetType = None 

846 try: 

847 registryDatasetType = registry.getDatasetType(c.name) 

848 except KeyError: 

849 compositeName, componentName = DatasetType.splitDatasetTypeName(c.name) 

850 if componentName: 

851 if storage_class_mapping is None or compositeName not in storage_class_mapping: 

852 raise LookupError( 

853 "Component parent class cannot be determined, and " 

854 "composite name was not in storage class mapping, or no " 

855 "storage_class_mapping was supplied" 

856 ) 

857 else: 

858 parentStorageClass = storage_class_mapping[compositeName] 

859 else: 

860 parentStorageClass = None 

861 datasetType = c.makeDatasetType( 

862 registry.dimensions, parentStorageClass=parentStorageClass 

863 ) 

864 registryDatasetType = datasetType 

865 else: 

866 datasetType = c.makeDatasetType( 

867 registry.dimensions, parentStorageClass=registryDatasetType.parentStorageClass 

868 ) 

869 

870 if registryDatasetType and datasetType != registryDatasetType: 

871 try: 

872 # Explicitly check for storage class just to make 

873 # more specific message. 

874 _ = datasetType.storageClass 

875 except KeyError: 

876 raise ValueError( 

877 "Storage class does not exist for supplied dataset type " 

878 f"{datasetType} for {taskDef.label}." 

879 ) from None 

880 raise ValueError( 

881 f"Supplied dataset type ({datasetType}) inconsistent with " 

882 f"registry definition ({registryDatasetType}) " 

883 f"for {taskDef.label}." 

884 ) 

885 datasetTypes.add(datasetType) 

886 if freeze: 

887 datasetTypes.freeze() 

888 return datasetTypes 

889 

890 # optionally add initOutput dataset for config 

891 initOutputs = makeDatasetTypesSet("initOutputs", freeze=False) 

892 if include_configs: 

893 initOutputs.add( 

894 DatasetType( 

895 taskDef.configDatasetName, 

896 registry.dimensions.empty, 

897 storageClass="Config", 

898 ) 

899 ) 

900 initOutputs.freeze() 

901 

902 # optionally add output dataset for metadata 

903 outputs = makeDatasetTypesSet("outputs", freeze=False) 

904 if taskDef.metadataDatasetName is not None: 

905 # Metadata is supposed to be of the PropertySet type; its 

906 # dimensions correspond to a task quantum 

907 dimensions = registry.dimensions.extract(taskDef.connections.dimensions) 

908 if _TASK_METADATA_TYPE is TaskMetadata: 

909 storageClass = "TaskMetadata" 

910 else: 

911 storageClass = "PropertySet" 

912 outputs |= {DatasetType(taskDef.metadataDatasetName, dimensions, storageClass)} 

913 if taskDef.logOutputDatasetName is not None: 

914 # Log output dimensions correspond to a task quantum. 

915 dimensions = registry.dimensions.extract(taskDef.connections.dimensions) 

916 outputs |= {DatasetType(taskDef.logOutputDatasetName, dimensions, "ButlerLogRecords")} 

917 

918 outputs.freeze() 

919 

920 return cls( 

921 initInputs=makeDatasetTypesSet("initInputs"), 

922 initOutputs=initOutputs, 

923 inputs=makeDatasetTypesSet("inputs"), 

924 prerequisites=makeDatasetTypesSet("prerequisiteInputs"), 

925 outputs=outputs, 

926 ) 

927 

928 
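# Example (illustrative sketch): classifying the dataset types of a single
# task.  ``butler`` is assumed to be an existing lsst.daf.butler.Butler whose
# registry defines the relevant dataset types.
#
#     types = TaskDatasetTypes.fromTaskDef(taskDef, registry=butler.registry)
#     print(sorted(dt.name for dt in types.inputs))
#     print(sorted(dt.name for dt in types.outputs))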

929@dataclass(frozen=True) 

930class PipelineDatasetTypes: 

931 """An immutable struct that classifies the dataset types used in a 

932 `Pipeline`. 

933 """ 

934 

935 packagesDatasetName: ClassVar[str] = "packages" 

936 """Name of a dataset type used to save package versions. 

937 """ 

938 

939 initInputs: NamedValueSet[DatasetType] 

940 """Dataset types that are needed as inputs in order to construct the Tasks 

941 in this Pipeline. 

942 

943 This does not include dataset types that are produced when constructing 

944 other Tasks in the Pipeline (these are classified as `initIntermediates`). 

945 """ 

946 

947 initOutputs: NamedValueSet[DatasetType] 

948 """Dataset types that may be written after constructing the Tasks in this 

949 Pipeline. 

950 

951 This does not include dataset types that are also used as inputs when 

952 constructing other Tasks in the Pipeline (these are classified as 

953 `initIntermediates`). 

954 """ 

955 

956 initIntermediates: NamedValueSet[DatasetType] 

957 """Dataset types that are both used when constructing one or more Tasks 

958 in the Pipeline and produced as a side-effect of constructing another 

959 Task in the Pipeline. 

960 """ 

961 

962 inputs: NamedValueSet[DatasetType] 

963 """Dataset types that are regular inputs for the full pipeline. 

964 

965 If an input dataset needed for a Quantum cannot be found in the input 

966 collection(s), that Quantum (and all dependent Quanta) will not be 

967 produced. 

968 """ 

969 

970 prerequisites: NamedValueSet[DatasetType] 

971 """Dataset types that are prerequisite inputs for the full Pipeline. 

972 

973 Prerequisite inputs must exist in the input collection(s) before the 

974 pipeline is run, but do not constrain the graph - if a prerequisite is 

975 missing for a Quantum, `PrerequisiteMissingError` is raised. 

976 

977 Prerequisite inputs are not resolved until the second stage of 

978 QuantumGraph generation. 

979 """ 

980 

981 intermediates: NamedValueSet[DatasetType] 

982 """Dataset types that are output by one Task in the Pipeline and consumed 

983 as inputs by one or more other Tasks in the Pipeline. 

984 """ 

985 

986 outputs: NamedValueSet[DatasetType] 

987 """Dataset types that are output by a Task in the Pipeline and not consumed 

988 by any other Task in the Pipeline. 

989 """ 

990 

991 byTask: Mapping[str, TaskDatasetTypes] 

992 """Per-Task dataset types, keyed by label in the `Pipeline`. 

993 

994 This is guaranteed to be zip-iterable with the `Pipeline` itself (assuming 

995 neither has been modified since the dataset types were extracted, of 

996 course). 

997 """ 

998 

999 @classmethod 

1000 def fromPipeline( 

1001 cls, 

1002 pipeline: Union[Pipeline, Iterable[TaskDef]], 

1003 *, 

1004 registry: Registry, 

1005 include_configs: bool = True, 

1006 include_packages: bool = True, 

1007 ) -> PipelineDatasetTypes: 

1008 """Extract and classify the dataset types from all tasks in a 

1009 `Pipeline`. 

1010 

1011 Parameters 

1012 ---------- 

1013 pipeline: `Pipeline` or `Iterable` [ `TaskDef` ] 

1014 A collection of tasks that can be run together. 

1015 registry: `Registry` 

1016 Registry used to construct normalized `DatasetType` objects and 

1017 retrieve those that are incomplete. 

1018 include_configs : `bool`, optional 

1019 If `True` (default) include config dataset types as 

1020 ``initOutputs``. 

1021 include_packages : `bool`, optional 

1022 If `True` (default) include the dataset type for software package 

1023 versions in ``initOutputs``. 

1024 

1025 Returns 

1026 ------- 

1027 types: `PipelineDatasetTypes` 

1028 The dataset types used by this `Pipeline`. 

1029 

1030 Raises 

1031 ------ 

1032 ValueError 

1033 Raised if Tasks are inconsistent about which datasets are marked 

1034 prerequisite. This indicates that the Tasks cannot be run as part 

1035 of the same `Pipeline`. 

1036 """ 

1037 allInputs = NamedValueSet() 

1038 allOutputs = NamedValueSet() 

1039 allInitInputs = NamedValueSet() 

1040 allInitOutputs = NamedValueSet() 

1041 prerequisites = NamedValueSet() 

1042 byTask = dict() 

1043 if include_packages: 

1044 allInitOutputs.add( 

1045 DatasetType( 

1046 cls.packagesDatasetName, 

1047 registry.dimensions.empty, 

1048 storageClass="Packages", 

1049 ) 

1050 ) 

1051 # create a list of TaskDefs in case the input is a generator 

1052 pipeline = list(pipeline) 

1053 

1054 # collect all the output dataset types 

1055 typeStorageclassMap: Dict[str, str] = {} 

1056 for taskDef in pipeline: 

1057 for outConnection in iterConnections(taskDef.connections, "outputs"): 

1058 typeStorageclassMap[outConnection.name] = outConnection.storageClass 

1059 

1060 for taskDef in pipeline: 

1061 thisTask = TaskDatasetTypes.fromTaskDef( 

1062 taskDef, 

1063 registry=registry, 

1064 include_configs=include_configs, 

1065 storage_class_mapping=typeStorageclassMap, 

1066 ) 

1067 allInitInputs |= thisTask.initInputs 

1068 allInitOutputs |= thisTask.initOutputs 

1069 allInputs |= thisTask.inputs 

1070 prerequisites |= thisTask.prerequisites 

1071 allOutputs |= thisTask.outputs 

1072 byTask[taskDef.label] = thisTask 

1073 if not prerequisites.isdisjoint(allInputs): 

1074 raise ValueError( 

1075 "{} marked as both prerequisites and regular inputs".format( 

1076 {dt.name for dt in allInputs & prerequisites} 

1077 ) 

1078 ) 

1079 if not prerequisites.isdisjoint(allOutputs): 

1080 raise ValueError( 

1081 "{} marked as both prerequisites and outputs".format( 

1082 {dt.name for dt in allOutputs & prerequisites} 

1083 ) 

1084 ) 

1085 # Make sure that components which are marked as inputs get treated as 

1086 # intermediates if there is an output which produces the composite 

1087 # containing the component 

1088 intermediateComponents = NamedValueSet() 

1089 intermediateComposites = NamedValueSet() 

1090 outputNameMapping = {dsType.name: dsType for dsType in allOutputs} 

1091 for dsType in allInputs: 

1092 # get the name of a possible component 

1093 name, component = dsType.nameAndComponent() 

1094 # if there is a component name, that means this is a component 

1095 # DatasetType, if there is an output which produces the parent of 

1096 # this component, treat this input as an intermediate 

1097 if component is not None: 

1098 # This needs to be in this if block, because someone might have 

1099 # a composite that is a pure input from existing data 

1100 if name in outputNameMapping: 

1101 intermediateComponents.add(dsType) 

1102 intermediateComposites.add(outputNameMapping[name]) 

1103 

1104 def checkConsistency(a: NamedValueSet, b: NamedValueSet): 

1105 common = a.names & b.names 

1106 for name in common: 

1107 if a[name] != b[name]: 

1108 raise ValueError(f"Conflicting definitions for dataset type: {a[name]} != {b[name]}.") 

1109 

1110 checkConsistency(allInitInputs, allInitOutputs) 

1111 checkConsistency(allInputs, allOutputs) 

1112 checkConsistency(allInputs, intermediateComposites) 

1113 checkConsistency(allOutputs, intermediateComposites) 

1114 

1115 def frozen(s: NamedValueSet) -> NamedValueSet: 

1116 s.freeze() 

1117 return s 

1118 

1119 return cls( 

1120 initInputs=frozen(allInitInputs - allInitOutputs), 

1121 initIntermediates=frozen(allInitInputs & allInitOutputs), 

1122 initOutputs=frozen(allInitOutputs - allInitInputs), 

1123 inputs=frozen(allInputs - allOutputs - intermediateComponents), 

1124 intermediates=frozen(allInputs & allOutputs | intermediateComponents), 

1125 outputs=frozen(allOutputs - allInputs - intermediateComposites), 

1126 prerequisites=frozen(prerequisites), 

1127 byTask=MappingProxyType(byTask), # MappingProxyType -> frozen view of dict for immutability 

1128 ) 

1129 
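# Example (illustrative sketch): classifying dataset types across a whole
# pipeline.  ``butler`` is again assumed to be an existing Butler, and the
# task label "isr" is a hypothetical placeholder.
#
#     dataset_types = PipelineDatasetTypes.fromPipeline(pipeline, registry=butler.registry)
#     print(sorted(dataset_types.inputs.names))         # overall inputs
#     print(sorted(dataset_types.intermediates.names))  # produced and consumed internally
#     print(sorted(dataset_types.outputs.names))        # final outputs
#     isr_types = dataset_types.byTask["isr"]           # per-task view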

1130 @classmethod 

1131 def initOutputNames( 

1132 cls, 

1133 pipeline: Union[Pipeline, Iterable[TaskDef]], 

1134 *, 

1135 include_configs: bool = True, 

1136 include_packages: bool = True, 

1137 ) -> Iterator[str]: 

1138 """Return the names of dataset types of task initOutputs, Configs, 

1139 and package versions for a pipeline. 

1140 

1141 Parameters 

1142 ---------- 

1143 pipeline: `Pipeline` or `Iterable` [ `TaskDef` ] 

1144 A `Pipeline` instance or collection of `TaskDef` instances. 

1145 include_configs : `bool`, optional 

1146 If `True` (default) include config dataset types. 

1147 include_packages : `bool`, optional 

1148 If `True` (default) include the dataset type for package versions. 

1149 

1150 Yields 

1151 ------ 

1152 datasetTypeName : `str` 

1153 Name of the dataset type. 

1154 """ 

1155 if include_packages: 

1156 # Package versions dataset type 

1157 yield cls.packagesDatasetName 

1158 

1159 if isinstance(pipeline, Pipeline): 

1160 pipeline = pipeline.toExpandedPipeline() 

1161 

1162 for taskDef in pipeline: 

1163 

1164 # all task InitOutputs 

1165 for name in taskDef.connections.initOutputs: 

1166 attribute = getattr(taskDef.connections, name) 

1167 yield attribute.name 

1168 

1169 # config dataset name 

1170 if include_configs: 

1171 yield taskDef.configDatasetName
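
# Example (illustrative sketch): listing the init-output dataset type names
# (task initOutputs plus, by default, per-task config datasets and the
# package-versions dataset) without touching a registry.
#
#     for name in PipelineDatasetTypes.initOutputNames(pipeline):
#         print(name)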