Coverage for python/lsst/pipe/base/pipeline.py: 19%

384 statements  

1# This file is part of pipe_base. 

2# 

3# Developed for the LSST Data Management System. 

4# This product includes software developed by the LSST Project 

5# (http://www.lsst.org). 

6# See the COPYRIGHT file at the top-level directory of this distribution 

7# for details of code ownership. 

8# 

9# This program is free software: you can redistribute it and/or modify 

10# it under the terms of the GNU General Public License as published by 

11# the Free Software Foundation, either version 3 of the License, or 

12# (at your option) any later version. 

13# 

14# This program is distributed in the hope that it will be useful, 

15# but WITHOUT ANY WARRANTY; without even the implied warranty of 

16# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the 

17# GNU General Public License for more details. 

18# 

19# You should have received a copy of the GNU General Public License 

20# along with this program. If not, see <http://www.gnu.org/licenses/>. 

21from __future__ import annotations 

22 

23"""Module defining Pipeline class and related methods. 

24""" 

25 

26__all__ = ["Pipeline", "TaskDef", "TaskDatasetTypes", "PipelineDatasetTypes", "LabelSpecifier"] 

27 

28# ------------------------------- 

29# Imports of standard modules -- 

30# ------------------------------- 

31from dataclasses import dataclass 

32import logging 

33from types import MappingProxyType 

34from typing import (ClassVar, Dict, Iterable, Iterator, Mapping, Set, Union, 

35 Generator, TYPE_CHECKING, Optional, Tuple) 

36 

37import copy 

38import re 

39import os 

40import urllib.parse 

41import warnings 

42 

43# ----------------------------- 

44# Imports for other modules -- 

45from lsst.daf.butler import DatasetType, NamedValueSet, Registry, SkyPixDimension, ButlerURI 

46from lsst.utils import doImport 

47from .configOverrides import ConfigOverrides 

48from .connections import iterConnections 

49from .pipelineTask import PipelineTask 

50from .task import _TASK_METADATA_TYPE 

51from ._task_metadata import TaskMetadata 

52 

53from . import pipelineIR 

54from . import pipeTools 

55 

56if TYPE_CHECKING: # Imports needed only for type annotations; may be circular. 

57 from lsst.obs.base import Instrument 

58 

59# ---------------------------------- 

60# Local non-exported definitions -- 

61# ---------------------------------- 

62 

63_LOG = logging.getLogger(__name__) 

64 

65# ------------------------ 

66# Exported definitions -- 

67# ------------------------ 

68 

69 

70@dataclass 

71class LabelSpecifier: 

72 """A structure to specify a subset of labels to load 

73 

74 This structure may contain a set of labels to be used in subsetting a 

75 pipeline, or a beginning and end point. Beginning or end may be empty, 

76 in which case the range will be a half open interval. Unlike python 

77 iteration bounds, end bounds are *INCLUDED*. Note that range based 

78 selection is not well defined for pipelines that are not linear in nature, 

79 and correct behavior is not guaranteed, or may vary from run to run. 

80 """ 

81 labels: Optional[Set[str]] = None 

82 begin: Optional[str] = None 

83 end: Optional[str] = None 

84 

85 def __post_init__(self): 

86 if self.labels is not None and (self.begin or self.end): 

87 raise ValueError("This struct can only be initialized with a labels set or " 

88 "a begin (and/or) end specifier") 

89 

90 
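A minimal usage sketch of LabelSpecifier may help here; the label names "isr" and "calibrate" below are hypothetical placeholders for a pipeline's own task labels.

from lsst.pipe.base.pipeline import LabelSpecifier

# Select an explicit set of labels (hypothetical label names).
by_set = LabelSpecifier(labels={"isr", "calibrate"})

# Or select an inclusive range; an open bound is expressed as None.
by_range = LabelSpecifier(begin="isr", end=None)

# Mixing the two forms is rejected in __post_init__.
try:
    LabelSpecifier(labels={"isr"}, begin="calibrate")
except ValueError as exc:
    print(exc)
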

91class TaskDef: 

92 """TaskDef is a collection of information about task needed by Pipeline. 

93 

94 The information includes task name, configuration object and optional 

95 task class. This class is just a collection of attributes and it exposes 

96 all of them so that attributes could potentially be modified in place 

97 (e.g. if configuration needs extra overrides). 

98 

99 Attributes 

100 ---------- 

101 taskName : `str`, optional 

102 `PipelineTask` class name, currently it is not specified whether this 

103 is a fully-qualified name or partial name (e.g. ``module.TaskClass``). 

104 Framework should be prepared to handle all cases. If not provided, 

105 ``taskClass`` must be, and ``taskClass.__name__`` is used. 

106 config : `lsst.pex.config.Config`, optional 

107 Instance of the configuration class corresponding to this task class, 

108 usually with all overrides applied. This config will be frozen. If 

109 not provided, ``taskClass`` must be provided and 

110 ``taskClass.ConfigClass()`` will be used. 

111 taskClass : `type`, optional 

112 `PipelineTask` class object, can be ``None``. If ``None`` then 

113 framework will have to locate and load class. 

114 label : `str`, optional 

115 Task label, usually a short string unique in a pipeline. If not 

116 provided, ``taskClass`` must be, and ``taskClass._DefaultName`` will 

117 be used. 

118 """ 

119 def __init__(self, taskName=None, config=None, taskClass=None, label=None): 

120 if taskName is None: 

121 if taskClass is None: 

122 raise ValueError("At least one of `taskName` and `taskClass` must be provided.") 

123 taskName = taskClass.__name__ 

124 if config is None: 

125 if taskClass is None: 

126 raise ValueError("`taskClass` must be provided if `config` is not.") 

127 config = taskClass.ConfigClass() 

128 if label is None: 

129 if taskClass is None: 

130 raise ValueError("`taskClass` must be provided if `label` is not.") 

131 label = taskClass._DefaultName 

132 self.taskName = taskName 

133 config.freeze() 

134 self.config = config 

135 self.taskClass = taskClass 

136 self.label = label 

137 self.connections = config.connections.ConnectionsClass(config=config) 

138 

139 @property 

140 def configDatasetName(self) -> str: 

141 """Name of a dataset type for configuration of this task (`str`) 

142 """ 

143 return self.label + "_config" 

144 

145 @property 

146 def metadataDatasetName(self) -> Optional[str]: 

147 """Name of a dataset type for metadata of this task, `None` if 

148 metadata is not to be saved (`str`) 

149 """ 

150 if self.config.saveMetadata: 

151 return self.label + "_metadata" 

152 else: 

153 return None 

154 

155 @property 

156 def logOutputDatasetName(self) -> Optional[str]: 

157 """Name of a dataset type for log output from this task, `None` if 

158 logs are not to be saved (`str`) 

159 """ 

160 if self.config.saveLogOutput: 

161 return self.label + "_log" 

162 else: 

163 return None 

164 

165 def __str__(self): 

166 rep = "TaskDef(" + self.taskName 

167 if self.label: 

168 rep += ", label=" + self.label 

169 rep += ")" 

170 return rep 

171 

172 def __eq__(self, other: object) -> bool: 

173 if not isinstance(other, TaskDef): 

174 return False 

175 # This does not consider equality of configs when determining equality 

176 # as config equality is a difficult thing to define. Should be updated 

177 # after DM-27847 

178 return self.taskClass == other.taskClass and self.label == other.label 

179 

180 def __hash__(self): 

181 return hash((self.taskClass, self.label)) 

182 

183 
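A small sketch of the TaskDef fallback rules described above; ``mypkg.MyTask`` is a hypothetical task name, and in practice a real `PipelineTask` subclass would usually be passed as ``taskClass``.

from lsst.pipe.base.pipeline import TaskDef

# With only a task name and no taskClass, TaskDef cannot build a default
# config or label, so it raises ValueError.
try:
    TaskDef(taskName="mypkg.MyTask")
except ValueError as exc:
    print(exc)  # "`taskClass` must be provided if `config` is not."
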

184class Pipeline: 

185 """A `Pipeline` is a representation of a series of tasks to run, and the 

186 configuration for those tasks. 

187 

188 Parameters 

189 ---------- 

190 description : `str` 

191 A description of what this pipeline does. 

192 """ 

193 def __init__(self, description: str): 

194 pipeline_dict = {"description": description, "tasks": {}} 

195 self._pipelineIR = pipelineIR.PipelineIR(pipeline_dict) 

196 

197 @classmethod 

198 def fromFile(cls, filename: str) -> Pipeline: 

199 """Load a pipeline defined in a pipeline yaml file. 

200 

201 Parameters 

202 ---------- 

203 filename: `str` 

204 A path that points to a pipeline defined in yaml format. This 

205 filename may also supply additional labels to be used in 

206 subsetting the loaded Pipeline. These labels are separated from 

207 the path by a \\#, and may be specified as a comma separated 

208 list, or a range denoted as beginning..end. Beginning or end may 

209 be empty, in which case the range will be a half open interval. 

210 Unlike python iteration bounds, end bounds are *INCLUDED*. Note 

211 that range based selection is not well defined for pipelines that 

212 are not linear in nature, and correct behavior is not guaranteed, 

213 or may vary from run to run. 

214 

215 Returns 

216 ------- 

217 pipeline: `Pipeline` 

218 The pipeline loaded from specified location with appropriate (if 

219 any) subsetting 

220 

221 Notes 

222 ----- 

223 This method attempts to prune any contracts that contain labels which 

224 are not in the declared subset of labels. This pruning is done using a 

225 string based matching due to the nature of contracts and may prune more 

226 than it should. 

227 """ 

228 return cls.from_uri(filename) 

229 

230 @classmethod 

231 def from_uri(cls, uri: Union[str, ButlerURI]) -> Pipeline: 

232 """Load a pipeline defined in a pipeline yaml file at a location 

233 specified by a URI. 

234 

235 Parameters 

236 ---------- 

237 uri: `str` or `ButlerURI` 

238 If a string is supplied this should be a URI path that points to a 

239 pipeline defined in yaml format. This uri may also supply 

240 additional labels to be used in subsetting the loaded Pipeline. 

241 These labels are separated from the path by a \\#, and may be 

242 specified as a comma separated list, or a range denoted as 

243 beginning..end. Beginning or end may be empty, in which case the 

244 range will be a half open interval. Unlike python iteration 

245 bounds, end bounds are *INCLUDED*. Note that range based selection 

246 is not well defined for pipelines that are not linear in nature, 

247 and correct behavior is not guaranteed, or may vary from run to 

248 run. The same specifiers can be used with a ButlerURI object by 

249 placing them as the sole contents of its fragment attribute. 

250 

251 Returns 

252 ------- 

253 pipeline: `Pipeline` 

254 The pipeline loaded from specified location with appropriate (if 

255 any) subsetting 

256 

257 Notes 

258 ----- 

259 This method attempts to prune any contracts that contain labels which 

260 are not in the declared subset of labels. This pruning is done using a 

261 string based matching due to the nature of contracts and may prune more 

262 than it should. 

263 """ 

264 # Split up the uri and any labels that were supplied 

265 uri, label_specifier = cls._parse_file_specifier(uri) 

266 pipeline: Pipeline = cls.fromIR(pipelineIR.PipelineIR.from_uri(uri)) 

267 

268 # If there are labels supplied, only keep those 

269 if label_specifier is not None: 

270 pipeline = pipeline.subsetFromLabels(label_specifier) 

271 return pipeline 

272 
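A hedged sketch of the fragment syntax accepted by `fromFile` and `from_uri`; the file path and label names below are hypothetical.

from lsst.pipe.base.pipeline import Pipeline

full = Pipeline.from_uri("pipelines/DRP.yaml")                  # whole pipeline
listed = Pipeline.from_uri("pipelines/DRP.yaml#isr,calibrate")  # explicit labels
ranged = Pipeline.from_uri("pipelines/DRP.yaml#isr..makeWarp")  # inclusive range
tail = Pipeline.from_uri("pipelines/DRP.yaml#..calibrate")      # open begin bound
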

273 def subsetFromLabels(self, labelSpecifier: LabelSpecifier) -> Pipeline: 

274 """Subset a pipeline to contain only labels specified in labelSpecifier 

275 

276 Parameters 

277 ---------- 

278 labelSpecifier : `LabelSpecifier` 

279 Object containing labels that describes how to subset a pipeline. 

280 

281 Returns 

282 ------- 

283 pipeline : `Pipeline` 

284 A new pipeline object that is a subset of the old pipeline 

285 

286 Raises 

287 ------ 

288 ValueError 

289 Raised if there is an issue with specified labels 

290 

291 Notes 

292 ----- 

293 This method attempts to prune any contracts that contain labels which 

294 are not in the declared subset of labels. This pruning is done using a 

295 string based matching due to the nature of contracts and may prune more 

296 than it should. 

297 """ 

298 # Labels supplied as a set 

299 if labelSpecifier.labels: 

300 labelSet = labelSpecifier.labels 

301 # Labels supplied as a range, first create a list of all the labels 

302 # in the pipeline sorted according to task dependency. Then only 

303 # keep labels that lie between the supplied bounds 

304 else: 

305 # Create a copy of the pipeline to use when assessing the label 

306 # ordering. Use a dict for fast searching while preserving order. 

307 # Remove contracts so they do not fail in the expansion step. This 

308 # is needed because a user may only configure the tasks they intend 

309 # to run, which may cause contracts involving tasks that will later 

310 # be dropped to fail 

311 pipeline = copy.deepcopy(self) 

312 pipeline._pipelineIR.contracts = [] 

313 labels = {taskdef.label: True for taskdef in pipeline.toExpandedPipeline()} 

314 

315 # Verify the bounds are in the labels 

316 if labelSpecifier.begin is not None: 

317 if labelSpecifier.begin not in labels: 

318 raise ValueError(f"Beginning of range subset, {labelSpecifier.begin}, not found in " 

319 "pipeline definition") 

320 if labelSpecifier.end is not None: 

321 if labelSpecifier.end not in labels: 

322 raise ValueError(f"End of range subset, {labelSpecifier.end}, not found in pipeline " 

323 "definition") 

324 

325 labelSet = set() 

326 for label in labels: 

327 if labelSpecifier.begin is not None: 

328 if label != labelSpecifier.begin: 

329 continue 

330 else: 

331 labelSpecifier.begin = None 

332 labelSet.add(label) 

333 if labelSpecifier.end is not None and label == labelSpecifier.end: 

334 break 

335 return Pipeline.fromIR(self._pipelineIR.subset_from_labels(labelSet)) 

336 
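The same subsetting can be done on an already loaded pipeline; a sketch assuming a hypothetical file and labels.

from lsst.pipe.base.pipeline import LabelSpecifier, Pipeline

full = Pipeline.from_uri("pipelines/DRP.yaml")
# Keep only the tasks between "isr" and "calibrate", end bound included.
subset = full.subsetFromLabels(LabelSpecifier(begin="isr", end="calibrate"))
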

337 @staticmethod 

338 def _parse_file_specifier(uri: Union[str, ButlerURI] 

339 ) -> Tuple[ButlerURI, Optional[LabelSpecifier]]: 

340 """Split appart a uri and any possible label subsets 

341 """ 

342 if isinstance(uri, str): 

343 # This is to support legacy pipelines during transition 

344 uri, num_replace = re.subn("[:](?!\\/\\/)", "#", uri) 

345 if num_replace: 

346 warnings.warn(f"The pipeline file {uri} seems to use the legacy : to separate " 

347 "labels, this is deprecated and will be removed after June 2021, please use " 

348 "# instead.", 

349 category=FutureWarning) 

350 if uri.count("#") > 1: 

351 raise ValueError("Only one set of labels is allowed when specifying a pipeline to load") 

352 uri = ButlerURI(uri) 

353 label_subset = uri.fragment or None 

354 

355 specifier: Optional[LabelSpecifier] 

356 if label_subset is not None: 

357 label_subset = urllib.parse.unquote(label_subset) 

358 args: Dict[str, Union[Set[str], str, None]] 

359 # labels supplied as a list 

360 if ',' in label_subset: 

361 if '..' in label_subset: 

362 raise ValueError("Can only specify a list of labels or a range" 

363 "when loading a Pipline not both") 

364 args = {"labels": set(label_subset.split(","))} 

365 # labels supplied as a range 

366 elif '..' in label_subset: 

367 # Try to de-structure the labelSubset, this will fail if more 

368 # than one range is specified 

369 begin, end, *rest = label_subset.split("..") 

370 if rest: 

371 raise ValueError("Only one range can be specified when loading a pipeline") 

372 args = {"begin": begin if begin else None, "end": end if end else None} 

373 # Assume anything else is a single label 

374 else: 

375 args = {"labels": {label_subset}} 

376 

377 specifier = LabelSpecifier(**args) 

378 else: 

379 specifier = None 

380 

381 return uri, specifier 

382 

383 @classmethod 

384 def fromString(cls, pipeline_string: str) -> Pipeline: 

385 """Create a pipeline from string formatted as a pipeline document. 

386 

387 Parameters 

388 ---------- 

389 pipeline_string : `str` 

390 A string that is formatted like a pipeline document 

391 

392 Returns 

393 ------- 

394 pipeline: `Pipeline` 

395 """ 

396 pipeline = cls.fromIR(pipelineIR.PipelineIR.from_string(pipeline_string)) 

397 return pipeline 

398 

399 @classmethod 

400 def fromIR(cls, deserialized_pipeline: pipelineIR.PipelineIR) -> Pipeline: 

401 """Create a pipeline from an already created `PipelineIR` object. 

402 

403 Parameters 

404 ---------- 

405 deserialized_pipeline: `PipelineIR` 

406 An already created pipeline intermediate representation object 

407 

408 Returns 

409 ------- 

410 pipeline: `Pipeline` 

411 """ 

412 pipeline = cls.__new__(cls) 

413 pipeline._pipelineIR = deserialized_pipeline 

414 return pipeline 

415 

416 @classmethod 

417 def fromPipeline(cls, pipeline: Pipeline) -> Pipeline: 

418 """Create a new pipeline by copying an already existing `Pipeline`. 

419 

420 Parameters 

421 ---------- 

422 pipeline: `Pipeline` 

423 The existing `Pipeline` object to copy. 

424 

425 Returns 

426 ------- 

427 pipeline: `Pipeline` 

428 """ 

429 return cls.fromIR(copy.deepcopy(pipeline._pipelineIR)) 

430 

431 def __str__(self) -> str: 

432 # Tasks need to be sorted on each call because someone might have 

433 # added or removed a task, and caching does not seem worth the small 

434 # overhead. 

435 labels = [td.label for td in self._toExpandedPipelineImpl(checkContracts=False)] 

436 self._pipelineIR.reorder_tasks(labels) 

437 return str(self._pipelineIR) 

438 

439 def addInstrument(self, instrument: Union[Instrument, str]) -> None: 

440 """Add an instrument to the pipeline, or replace an instrument that is 

441 already defined. 

442 

443 Parameters 

444 ---------- 

445 instrument : `~lsst.obs.base.Instrument` or `str` 

446 Either an `lsst.obs.base.Instrument` subclass object or a string 

447 corresponding to a fully qualified `lsst.obs.base.Instrument` 

448 name. 

449 """ 

450 if isinstance(instrument, str): 

451 pass 

452 else: 

453 # TODO: assume that this is a subclass of Instrument, no type 

454 # checking 

455 instrument = f"{instrument.__module__}.{instrument.__qualname__}" 

456 self._pipelineIR.instrument = instrument 

457 

458 def getInstrument(self) -> Instrument: 

459 """Get the instrument from the pipeline. 

460 

461 Returns 

462 ------- 

463 instrument : `~lsst.obs.base.Instrument`, `str`, or None 

464 An `lsst.obs.base.Instrument` subclass object, a string 

465 corresponding to a fully qualified `lsst.obs.base.Instrument` 

466 name, or None if the pipeline does not have an instrument. 

467 """ 

468 return self._pipelineIR.instrument 

469 
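A sketch of the instrument handling shown above; ``myobs.MyCam`` is a hypothetical fully qualified instrument class name.

from lsst.pipe.base.pipeline import Pipeline

pipeline = Pipeline("Example pipeline")
# Either a string or an Instrument subclass may be passed; a class is
# converted to its fully qualified name before being stored.
pipeline.addInstrument("myobs.MyCam")
print(pipeline.getInstrument())  # "myobs.MyCam"
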

470 def addTask(self, task: Union[PipelineTask, str], label: str) -> None: 

471 """Add a new task to the pipeline, or replace a task that is already 

472 associated with the supplied label. 

473 

474 Parameters 

475 ---------- 

476 task: `PipelineTask` or `str` 

477 Either a derived class object of a `PipelineTask` or a string 

478 corresponding to a fully qualified `PipelineTask` name. 

479 label: `str` 

480 A label that is used to identify the `PipelineTask` being added 

481 """ 

482 if isinstance(task, str): 

483 taskName = task 

484 elif issubclass(task, PipelineTask): 

485 taskName = f"{task.__module__}.{task.__qualname__}" 

486 else: 

487 raise ValueError("task must be either a child class of PipelineTask or a string containing" 

488 " a fully qualified name to one") 

489 if not label: 

490 # In some cases (e.g. a command line-generated pipeline) tasks can 

491 # be defined without a label, which is not acceptable; use the 

492 # task's _DefaultName in that case. 

493 if isinstance(task, str): 

494 task = doImport(task) 

495 label = task._DefaultName 

496 self._pipelineIR.tasks[label] = pipelineIR.TaskIR(label, taskName) 

497 

498 def removeTask(self, label: str) -> None: 

499 """Remove a task from the pipeline. 

500 

501 Parameters 

502 ---------- 

503 label : `str` 

504 The label used to identify the task that is to be removed 

505 

506 Raises 

507 ------ 

508 KeyError 

509 If no task with that label exists in the pipeline 

510 

511 """ 

512 self._pipelineIR.tasks.pop(label) 

513 

514 def addConfigOverride(self, label: str, key: str, value: object) -> None: 

515 """Apply single config override. 

516 

517 Parameters 

518 ---------- 

519 label : `str` 

520 Label of the task. 

521 key: `str` 

522 Fully-qualified field name. 

523 value : object 

524 Value to be given to a field. 

525 """ 

526 self._addConfigImpl(label, pipelineIR.ConfigIR(rest={key: value})) 

527 

528 def addConfigFile(self, label: str, filename: str) -> None: 

529 """Add overrides from a specified file. 

530 

531 Parameters 

532 ---------- 

533 label : `str` 

534 The label used to identify the task associated with config to 

535 modify 

536 filename : `str` 

537 Path to the override file. 

538 """ 

539 self._addConfigImpl(label, pipelineIR.ConfigIR(file=[filename])) 

540 

541 def addConfigPython(self, label: str, pythonString: str) -> None: 

542 """Add Overrides by running a snippet of python code against a config. 

543 

544 Parameters 

545 ---------- 

546 label : `str` 

547 The label used to identify the task associated with config to 

548 modify. 

549 pythonString: `str` 

550 A string which is valid python code to be executed. This is done 

551 with config as the only local accessible value. 

552 """ 

553 self._addConfigImpl(label, pipelineIR.ConfigIR(python=pythonString)) 

554 

555 def _addConfigImpl(self, label: str, newConfig: pipelineIR.ConfigIR) -> None: 

556 if label == "parameters": 

557 if newConfig.rest.keys() - self._pipelineIR.parameters.mapping.keys(): 

558 raise ValueError("Cannot override parameters that are not defined in pipeline") 

559 self._pipelineIR.parameters.mapping.update(newConfig.rest) 

560 if newConfig.file: 

561 raise ValueError("Setting parameters section with config file is not supported") 

562 if newConfig.python: 

563 raise ValueError("Setting parameters section using python block in unsupported") 

564 return 

565 if label not in self._pipelineIR.tasks: 

566 raise LookupError(f"There are no tasks labeled '{label}' in the pipeline") 

567 self._pipelineIR.tasks[label].add_or_update_config(newConfig) 

568 
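The config-override methods above compose as follows; the task path, label, and field names are hypothetical.

from lsst.pipe.base.pipeline import Pipeline

pipeline = Pipeline("A hand-built example pipeline")
pipeline.addTask("mypkg.tasks.MyTask", label="myTask")
# Single value override, an external override file, and a python snippet.
pipeline.addConfigOverride("myTask", "someField", 42)
pipeline.addConfigFile("myTask", "overrides/myTask.py")
pipeline.addConfigPython("myTask", "config.someField = 7")
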

569 def toFile(self, filename: str) -> None: 

570 self._pipelineIR.to_file(filename) 

571 

572 def write_to_uri(self, uri: Union[str, ButlerURI]) -> None: 

573 # Tasks need to be sorted on each call because someone might have 

574 # added or removed a task, and caching does not seem worth the small 

575 # overhead. 

576 labels = [td.label for td in self._toExpandedPipelineImpl(checkContracts=False)] 

577 self._pipelineIR.reorder_tasks(labels) 

578 self._pipelineIR.write_to_uri(uri) 

579 

580 def toExpandedPipeline(self) -> Generator[TaskDef, None, None]: 

581 """Returns a generator of TaskDefs which can be used to create quantum 

582 graphs. 

583 

584 Returns 

585 ------- 

586 generator : generator of `TaskDef` 

587 The generator returned will be the sorted iterator of tasks which 

588 are to be used in constructing a quantum graph. 

589 

590 Raises 

591 ------ 

592 NotImplementedError 

593 If a dataId is supplied in a config block. This is in place for 

594 future use 

595 """ 

596 yield from self._toExpandedPipelineImpl() 

597 

598 def _toExpandedPipelineImpl(self, checkContracts=True) -> Iterable[TaskDef]: 

599 taskDefs = [] 

600 for label in self._pipelineIR.tasks: 

601 taskDefs.append(self._buildTaskDef(label)) 

602 

603 # Evaluate the contracts 

604 if self._pipelineIR.contracts is not None: 

605 label_to_config = {x.label: x.config for x in taskDefs} 

606 for contract in self._pipelineIR.contracts: 

607 # execute this in its own line so it can raise a good error 

608 # message if there were problems with the eval 

609 success = eval(contract.contract, None, label_to_config) 

610 if not success: 

611 extra_info = f": {contract.msg}" if contract.msg is not None else "" 

612 raise pipelineIR.ContractError(f"Contract(s) '{contract.contract}' were not " 

613 f"satisfied{extra_info}") 

614 

615 taskDefs = sorted(taskDefs, key=lambda x: x.label) 

616 yield from pipeTools.orderPipeline(taskDefs) 

617 

618 def _buildTaskDef(self, label: str) -> TaskDef: 

619 if (taskIR := self._pipelineIR.tasks.get(label)) is None: 

620 raise NameError(f"Label {label} does not appear in this pipeline") 

621 taskClass = doImport(taskIR.klass) 

622 taskName = taskClass.__qualname__ 

623 config = taskClass.ConfigClass() 

624 overrides = ConfigOverrides() 

625 if self._pipelineIR.instrument is not None: 

626 overrides.addInstrumentOverride(self._pipelineIR.instrument, taskClass._DefaultName) 

627 if taskIR.config is not None: 

628 for configIR in (configIr.formatted(self._pipelineIR.parameters) 

629 for configIr in taskIR.config): 

630 if configIR.dataId is not None: 

631 raise NotImplementedError("Specializing a config on a partial data id is not yet " 

632 "supported in Pipeline definition") 

633 # only apply override if it applies to everything 

634 if configIR.dataId is None: 

635 if configIR.file: 

636 for configFile in configIR.file: 

637 overrides.addFileOverride(os.path.expandvars(configFile)) 

638 if configIR.python is not None: 

639 overrides.addPythonOverride(configIR.python) 

640 for key, value in configIR.rest.items(): 

641 overrides.addValueOverride(key, value) 

642 overrides.applyTo(config) 

643 # This may need to be revisited 

644 try: 

645 config.validate() 

646 except Exception: 

647 _LOG.error("Configuration validation failed for task %s (%s)", label, taskName) 

648 raise 

649 return TaskDef(taskName=taskName, config=config, taskClass=taskClass, label=label) 

650 

651 def __iter__(self) -> Generator[TaskDef, None, None]: 

652 return self.toExpandedPipeline() 

653 

654 def __getitem__(self, item: str) -> TaskDef: 

655 return self._buildTaskDef(item) 

656 

657 def __len__(self): 

658 return len(self._pipelineIR.tasks) 

659 

660 def __eq__(self, other: object): 

661 if not isinstance(other, Pipeline): 

662 return False 

663 return self._pipelineIR == other._pipelineIR 

664 

665 
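Iterating a `Pipeline` goes through `toExpandedPipeline`, which imports, configures, and dependency-sorts every task; a sketch assuming a hypothetical pipeline file whose tasks are importable.

from lsst.pipe.base.pipeline import Pipeline

pipeline = Pipeline.from_uri("pipelines/DRP.yaml")
print(len(pipeline))            # number of task labels
for taskDef in pipeline:        # equivalent to pipeline.toExpandedPipeline()
    print(taskDef.label, taskDef.taskName)
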

666@dataclass(frozen=True) 

667class TaskDatasetTypes: 

668 """An immutable struct that extracts and classifies the dataset types used 

669 by a `PipelineTask` 

670 """ 

671 

672 initInputs: NamedValueSet[DatasetType] 

673 """Dataset types that are needed as inputs in order to construct this Task. 

674 

675 Task-level `initInputs` may be classified as either 

676 `~PipelineDatasetTypes.initInputs` or 

677 `~PipelineDatasetTypes.initIntermediates` at the Pipeline level. 

678 """ 

679 

680 initOutputs: NamedValueSet[DatasetType] 

681 """Dataset types that may be written after constructing this Task. 

682 

683 Task-level `initOutputs` may be classified as either 

684 `~PipelineDatasetTypes.initOutputs` or 

685 `~PipelineDatasetTypes.initIntermediates` at the Pipeline level. 

686 """ 

687 

688 inputs: NamedValueSet[DatasetType] 

689 """Dataset types that are regular inputs to this Task. 

690 

691 If an input dataset needed for a Quantum cannot be found in the input 

692 collection(s) or produced by another Task in the Pipeline, that Quantum 

693 (and all dependent Quanta) will not be produced. 

694 

695 Task-level `inputs` may be classified as either 

696 `~PipelineDatasetTypes.inputs` or `~PipelineDatasetTypes.intermediates` 

697 at the Pipeline level. 

698 """ 

699 

700 prerequisites: NamedValueSet[DatasetType] 

701 """Dataset types that are prerequisite inputs to this Task. 

702 

703 Prerequisite inputs must exist in the input collection(s) before the 

704 pipeline is run, but do not constrain the graph - if a prerequisite is 

705 missing for a Quantum, `PrerequisiteMissingError` is raised. 

706 

707 Prerequisite inputs are not resolved until the second stage of 

708 QuantumGraph generation. 

709 """ 

710 

711 outputs: NamedValueSet[DatasetType] 

712 """Dataset types that are produced by this Task. 

713 

714 Task-level `outputs` may be classified as either 

715 `~PipelineDatasetTypes.outputs` or `~PipelineDatasetTypes.intermediates` 

716 at the Pipeline level. 

717 """ 

718 

719 @classmethod 

720 def fromTaskDef( 

721 cls, 

722 taskDef: TaskDef, 

723 *, 

724 registry: Registry, 

725 include_configs: bool = True, 

726 storage_class_mapping: Optional[Mapping[str, str]] = None 

727 ) -> TaskDatasetTypes: 

728 """Extract and classify the dataset types from a single `PipelineTask`. 

729 

730 Parameters 

731 ---------- 

732 taskDef: `TaskDef` 

733 An instance of a `TaskDef` class for a particular `PipelineTask`. 

734 registry: `Registry` 

735 Registry used to construct normalized `DatasetType` objects and 

736 retrieve those that are incomplete. 

737 include_configs : `bool`, optional 

738 If `True` (default) include config dataset types as 

739 ``initOutputs``. 

740 storage_class_mapping : `Mapping` of `str` to `StorageClass`, optional 

741 If a taskdef contains a component dataset type that is unknown 

742 to the registry, its parent StorageClass will be looked up in this 

743 mapping if it is supplied. If the mapping does not contain the 

744 composite dataset type, or the mapping is not supplied, an exception 

745 will be raised. 

746 

747 Returns 

748 ------- 

749 types: `TaskDatasetTypes` 

750 The dataset types used by this task. 

751 

752 Raises 

753 ------ 

754 ValueError 

755 Raised if dataset type connection definition differs from 

756 registry definition. 

757 LookupError 

758 Raised if component parent StorageClass could not be determined 

759 and storage_class_mapping does not contain the composite type, or 

760 is set to None. 

761 """ 

762 def makeDatasetTypesSet(connectionType: str, freeze: bool = True) -> NamedValueSet[DatasetType]: 

763 """Constructs a set of true `DatasetType` objects 

764 

765 Parameters 

766 ---------- 

767 connectionType : `str` 

768 Name of the connection type to produce a set for, corresponds 

769 to an attribute of type `list` on the connection class instance 

770 freeze : `bool`, optional 

771 If `True`, call `NamedValueSet.freeze` on the object returned. 

772 

773 Returns 

774 ------- 

775 datasetTypes : `NamedValueSet` 

776 A set of all datasetTypes which correspond to the input 

777 connection type specified in the connection class of this 

778 `PipelineTask` 

779 

780 Raises 

781 ------ 

782 ValueError 

783 Raised if dataset type connection definition differs from 

784 registry definition. 

785 LookupError 

786 Raised if component parent StorageClass could not be determined 

787 and storage_class_mapping does not contain the composite type, 

788 or is set to None. 

789 

790 Notes 

791 ----- 

792 This function is a closure over the variables ``registry``, 

793 ``taskDef``, and ``storage_class_mapping``. 

794 """ 

795 datasetTypes = NamedValueSet() 

796 for c in iterConnections(taskDef.connections, connectionType): 

797 dimensions = set(getattr(c, 'dimensions', set())) 

798 if "skypix" in dimensions: 

799 try: 

800 datasetType = registry.getDatasetType(c.name) 

801 except LookupError as err: 

802 raise LookupError( 

803 f"DatasetType '{c.name}' referenced by " 

804 f"{type(taskDef.connections).__name__} uses 'skypix' as a dimension " 

805 f"placeholder, but does not already exist in the registry. " 

806 f"Note that reference catalog names are now used as the dataset " 

807 f"type name instead of 'ref_cat'." 

808 ) from err 

809 rest1 = set(registry.dimensions.extract(dimensions - set(["skypix"])).names) 

810 rest2 = set(dim.name for dim in datasetType.dimensions 

811 if not isinstance(dim, SkyPixDimension)) 

812 if rest1 != rest2: 

813 raise ValueError(f"Non-skypix dimensions for dataset type {c.name} declared in " 

814 f"connections ({rest1}) are inconsistent with those in " 

815 f"registry's version of this dataset ({rest2}).") 

816 else: 

817 # Component dataset types are not explicitly in the 

818 # registry. This complicates consistency checks with 

819 # registry and requires we work out the composite storage 

820 # class. 

821 registryDatasetType = None 

822 try: 

823 registryDatasetType = registry.getDatasetType(c.name) 

824 except KeyError: 

825 compositeName, componentName = DatasetType.splitDatasetTypeName(c.name) 

826 if componentName: 

827 if storage_class_mapping is None or compositeName not in storage_class_mapping: 

828 raise LookupError("Component parent class cannot be determined, and " 

829 "composite name was not in storage class mapping, or no " 

830 "storage_class_mapping was supplied") 

831 else: 

832 parentStorageClass = storage_class_mapping[compositeName] 

833 else: 

834 parentStorageClass = None 

835 datasetType = c.makeDatasetType( 

836 registry.dimensions, 

837 parentStorageClass=parentStorageClass 

838 ) 

839 registryDatasetType = datasetType 

840 else: 

841 datasetType = c.makeDatasetType( 

842 registry.dimensions, 

843 parentStorageClass=registryDatasetType.parentStorageClass 

844 ) 

845 

846 if registryDatasetType and datasetType != registryDatasetType: 

847 try: 

848 # Explicitly check for storage class just to make 

849 # more specific message. 

850 _ = datasetType.storageClass 

851 except KeyError: 

852 raise ValueError("Storage class does not exist for supplied dataset type " 

853 f"{datasetType} for {taskDef.label}.") from None 

854 raise ValueError(f"Supplied dataset type ({datasetType}) inconsistent with " 

855 f"registry definition ({registryDatasetType}) " 

856 f"for {taskDef.label}.") 

857 datasetTypes.add(datasetType) 

858 if freeze: 

859 datasetTypes.freeze() 

860 return datasetTypes 

861 

862 # optionally add initOutput dataset for config 

863 initOutputs = makeDatasetTypesSet("initOutputs", freeze=False) 

864 if include_configs: 

865 initOutputs.add( 

866 DatasetType( 

867 taskDef.configDatasetName, 

868 registry.dimensions.empty, 

869 storageClass="Config", 

870 ) 

871 ) 

872 initOutputs.freeze() 

873 

874 # optionally add output dataset for metadata 

875 outputs = makeDatasetTypesSet("outputs", freeze=False) 

876 if taskDef.metadataDatasetName is not None: 

877 # Metadata is stored as either TaskMetadata or PropertySet, depending 

878 # on the task metadata type; its dimensions correspond to a task quantum 

879 dimensions = registry.dimensions.extract(taskDef.connections.dimensions) 

880 if _TASK_METADATA_TYPE is TaskMetadata: 

881 storageClass = "TaskMetadata" 

882 else: 

883 storageClass = "PropertySet" 

884 outputs |= {DatasetType(taskDef.metadataDatasetName, dimensions, storageClass)} 

885 if taskDef.logOutputDatasetName is not None: 

886 # Log output dimensions correspond to a task quantum. 

887 dimensions = registry.dimensions.extract(taskDef.connections.dimensions) 

888 outputs |= {DatasetType(taskDef.logOutputDatasetName, dimensions, "ButlerLogRecords")} 

889 

890 outputs.freeze() 

891 

892 return cls( 

893 initInputs=makeDatasetTypesSet("initInputs"), 

894 initOutputs=initOutputs, 

895 inputs=makeDatasetTypesSet("inputs"), 

896 prerequisites=makeDatasetTypesSet("prerequisiteInputs"), 

897 outputs=outputs, 

898 ) 

899 

900 
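A hedged sketch of `TaskDatasetTypes.fromTaskDef`; the butler repository path and pipeline file are hypothetical, and a data repository with the relevant dataset types registered is assumed.

from lsst.daf.butler import Butler
from lsst.pipe.base.pipeline import Pipeline, TaskDatasetTypes

butler = Butler("REPO_PATH")
pipeline = Pipeline.from_uri("pipelines/DRP.yaml")
taskDef = next(iter(pipeline))
types = TaskDatasetTypes.fromTaskDef(taskDef, registry=butler.registry)
print(sorted(types.inputs.names), sorted(types.outputs.names))
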

901@dataclass(frozen=True) 

902class PipelineDatasetTypes: 

903 """An immutable struct that classifies the dataset types used in a 

904 `Pipeline`. 

905 """ 

906 

907 packagesDatasetName: ClassVar[str] = "packages" 

908 """Name of a dataset type used to save package versions. 

909 """ 

910 

911 initInputs: NamedValueSet[DatasetType] 

912 """Dataset types that are needed as inputs in order to construct the Tasks 

913 in this Pipeline. 

914 

915 This does not include dataset types that are produced when constructing 

916 other Tasks in the Pipeline (these are classified as `initIntermediates`). 

917 """ 

918 

919 initOutputs: NamedValueSet[DatasetType] 

920 """Dataset types that may be written after constructing the Tasks in this 

921 Pipeline. 

922 

923 This does not include dataset types that are also used as inputs when 

924 constructing other Tasks in the Pipeline (these are classified as 

925 `initIntermediates`). 

926 """ 

927 

928 initIntermediates: NamedValueSet[DatasetType] 

929 """Dataset types that are both used when constructing one or more Tasks 

930 in the Pipeline and produced as a side-effect of constructing another 

931 Task in the Pipeline. 

932 """ 

933 

934 inputs: NamedValueSet[DatasetType] 

935 """Dataset types that are regular inputs for the full pipeline. 

936 

937 If an input dataset needed for a Quantum cannot be found in the input 

938 collection(s), that Quantum (and all dependent Quanta) will not be 

939 produced. 

940 """ 

941 

942 prerequisites: NamedValueSet[DatasetType] 

943 """Dataset types that are prerequisite inputs for the full Pipeline. 

944 

945 Prerequisite inputs must exist in the input collection(s) before the 

946 pipeline is run, but do not constrain the graph - if a prerequisite is 

947 missing for a Quantum, `PrerequisiteMissingError` is raised. 

948 

949 Prerequisite inputs are not resolved until the second stage of 

950 QuantumGraph generation. 

951 """ 

952 

953 intermediates: NamedValueSet[DatasetType] 

954 """Dataset types that are output by one Task in the Pipeline and consumed 

955 as inputs by one or more other Tasks in the Pipeline. 

956 """ 

957 

958 outputs: NamedValueSet[DatasetType] 

959 """Dataset types that are output by a Task in the Pipeline and not consumed 

960 by any other Task in the Pipeline. 

961 """ 

962 

963 byTask: Mapping[str, TaskDatasetTypes] 

964 """Per-Task dataset types, keyed by label in the `Pipeline`. 

965 

966 This is guaranteed to be zip-iterable with the `Pipeline` itself (assuming 

967 neither has been modified since the dataset types were extracted, of 

968 course). 

969 """ 

970 

971 @classmethod 

972 def fromPipeline( 

973 cls, 

974 pipeline: Union[Pipeline, Iterable[TaskDef]], 

975 *, 

976 registry: Registry, 

977 include_configs: bool = True, 

978 include_packages: bool = True, 

979 ) -> PipelineDatasetTypes: 

980 """Extract and classify the dataset types from all tasks in a 

981 `Pipeline`. 

982 

983 Parameters 

984 ---------- 

985 pipeline: `Pipeline` or `Iterable` [ `TaskDef` ] 

986 A collection of tasks that can be run together. 

987 registry: `Registry` 

988 Registry used to construct normalized `DatasetType` objects and 

989 retrieve those that are incomplete. 

990 include_configs : `bool`, optional 

991 If `True` (default) include config dataset types as 

992 ``initOutputs``. 

993 include_packages : `bool`, optional 

994 If `True` (default) include the dataset type for software package 

995 versions in ``initOutputs``. 

996 

997 Returns 

998 ------- 

999 types: `PipelineDatasetTypes` 

1000 The dataset types used by this `Pipeline`. 

1001 

1002 Raises 

1003 ------ 

1004 ValueError 

1005 Raised if Tasks are inconsistent about which datasets are marked 

1006 prerequisite. This indicates that the Tasks cannot be run as part 

1007 of the same `Pipeline`. 

1008 """ 

1009 allInputs = NamedValueSet() 

1010 allOutputs = NamedValueSet() 

1011 allInitInputs = NamedValueSet() 

1012 allInitOutputs = NamedValueSet() 

1013 prerequisites = NamedValueSet() 

1014 byTask = dict() 

1015 if include_packages: 

1016 allInitOutputs.add( 

1017 DatasetType( 

1018 cls.packagesDatasetName, 

1019 registry.dimensions.empty, 

1020 storageClass="Packages", 

1021 ) 

1022 ) 

1023 # create a list of TaskDefs in case the input is a generator 

1024 pipeline = list(pipeline) 

1025 

1026 # collect all the output dataset types 

1027 typeStorageclassMap: Dict[str, str] = {} 

1028 for taskDef in pipeline: 

1029 for outConnection in iterConnections(taskDef.connections, 'outputs'): 

1030 typeStorageclassMap[outConnection.name] = outConnection.storageClass 

1031 

1032 for taskDef in pipeline: 

1033 thisTask = TaskDatasetTypes.fromTaskDef( 

1034 taskDef, 

1035 registry=registry, 

1036 include_configs=include_configs, 

1037 storage_class_mapping=typeStorageclassMap 

1038 ) 

1039 allInitInputs |= thisTask.initInputs 

1040 allInitOutputs |= thisTask.initOutputs 

1041 allInputs |= thisTask.inputs 

1042 prerequisites |= thisTask.prerequisites 

1043 allOutputs |= thisTask.outputs 

1044 byTask[taskDef.label] = thisTask 

1045 if not prerequisites.isdisjoint(allInputs): 

1046 raise ValueError("{} marked as both prerequisites and regular inputs".format( 

1047 {dt.name for dt in allInputs & prerequisites} 

1048 )) 

1049 if not prerequisites.isdisjoint(allOutputs): 

1050 raise ValueError("{} marked as both prerequisites and outputs".format( 

1051 {dt.name for dt in allOutputs & prerequisites} 

1052 )) 

1053 # Make sure that components which are marked as inputs get treated as 

1054 # intermediates if there is an output which produces the composite 

1055 # containing the component 

1056 intermediateComponents = NamedValueSet() 

1057 intermediateComposites = NamedValueSet() 

1058 outputNameMapping = {dsType.name: dsType for dsType in allOutputs} 

1059 for dsType in allInputs: 

1060 # get the name of a possible component 

1061 name, component = dsType.nameAndComponent() 

1062 # if there is a component name, that means this is a component 

1063 # DatasetType, if there is an output which produces the parent of 

1064 # this component, treat this input as an intermediate 

1065 if component is not None: 

1066 # This needs to be in this if block, because someone might have 

1067 # a composite that is a pure input from existing data 

1068 if name in outputNameMapping: 

1069 intermediateComponents.add(dsType) 

1070 intermediateComposites.add(outputNameMapping[name]) 

1071 

1072 def checkConsistency(a: NamedValueSet, b: NamedValueSet): 

1073 common = a.names & b.names 

1074 for name in common: 

1075 if a[name] != b[name]: 

1076 raise ValueError(f"Conflicting definitions for dataset type: {a[name]} != {b[name]}.") 

1077 

1078 checkConsistency(allInitInputs, allInitOutputs) 

1079 checkConsistency(allInputs, allOutputs) 

1080 checkConsistency(allInputs, intermediateComposites) 

1081 checkConsistency(allOutputs, intermediateComposites) 

1082 

1083 def frozen(s: NamedValueSet) -> NamedValueSet: 

1084 s.freeze() 

1085 return s 

1086 

1087 return cls( 

1088 initInputs=frozen(allInitInputs - allInitOutputs), 

1089 initIntermediates=frozen(allInitInputs & allInitOutputs), 

1090 initOutputs=frozen(allInitOutputs - allInitInputs), 

1091 inputs=frozen(allInputs - allOutputs - intermediateComponents), 

1092 intermediates=frozen(allInputs & allOutputs | intermediateComponents), 

1093 outputs=frozen(allOutputs - allInputs - intermediateComposites), 

1094 prerequisites=frozen(prerequisites), 

1095 byTask=MappingProxyType(byTask), # MappingProxyType -> frozen view of dict for immutability 

1096 ) 

1097 
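The pipeline-level classification can be inspected as in the sketch below; the repository path and pipeline file are hypothetical.

from lsst.daf.butler import Butler
from lsst.pipe.base.pipeline import Pipeline, PipelineDatasetTypes

butler = Butler("REPO_PATH")
pipeline = Pipeline.from_uri("pipelines/DRP.yaml")
datasetTypes = PipelineDatasetTypes.fromPipeline(pipeline, registry=butler.registry)
# Overall inputs, purely internal intermediates, and final outputs.
print(sorted(datasetTypes.inputs.names))
print(sorted(datasetTypes.intermediates.names))
print(sorted(datasetTypes.outputs.names))
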

1098 @classmethod 

1099 def initOutputNames(cls, pipeline: Union[Pipeline, Iterable[TaskDef]], *, 

1100 include_configs: bool = True, include_packages: bool = True) -> Iterator[str]: 

1101 """Return the names of dataset types ot task initOutputs, Configs, 

1102 and package versions for a pipeline. 

1103 

1104 Parameters 

1105 ---------- 

1106 pipeline: `Pipeline` or `Iterable` [ `TaskDef` ] 

1107 A `Pipeline` instance or collection of `TaskDef` instances. 

1108 include_configs : `bool`, optional 

1109 If `True` (default) include config dataset types. 

1110 include_packages : `bool`, optional 

1111 If `True` (default) include the dataset type for package versions. 

1112 

1113 Yields 

1114 ------ 

1115 datasetTypeName : `str` 

1116 Name of the dataset type. 

1117 """ 

1118 if include_packages: 

1119 # Package versions dataset type 

1120 yield cls.packagesDatasetName 

1121 

1122 if isinstance(pipeline, Pipeline): 

1123 pipeline = pipeline.toExpandedPipeline() 

1124 

1125 for taskDef in pipeline: 

1126 

1127 # all task InitOutputs 

1128 for name in taskDef.connections.initOutputs: 

1129 attribute = getattr(taskDef.connections, name) 

1130 yield attribute.name 

1131 

1132 # config dataset name 

1133 if include_configs: 

1134 yield taskDef.configDatasetName
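
`initOutputNames` needs no registry, so it is a cheap way to list init-output dataset type names; the pipeline file below is hypothetical and its tasks are assumed importable.

from lsst.pipe.base.pipeline import Pipeline, PipelineDatasetTypes

pipeline = Pipeline.from_uri("pipelines/DRP.yaml")
for name in PipelineDatasetTypes.initOutputNames(pipeline):
    print(name)  # "packages", per-task init-outputs, then "<label>_config" names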