Coverage for python/lsst/pipe/base/pipeline.py: 19%

348 statements  

1# This file is part of pipe_base. 

2# 

3# Developed for the LSST Data Management System. 

4# This product includes software developed by the LSST Project 

5# (http://www.lsst.org). 

6# See the COPYRIGHT file at the top-level directory of this distribution 

7# for details of code ownership. 

8# 

9# This program is free software: you can redistribute it and/or modify 

10# it under the terms of the GNU General Public License as published by 

11# the Free Software Foundation, either version 3 of the License, or 

12# (at your option) any later version. 

13# 

14# This program is distributed in the hope that it will be useful, 

15# but WITHOUT ANY WARRANTY; without even the implied warranty of 

16# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the 

17# GNU General Public License for more details. 

18# 

19# You should have received a copy of the GNU General Public License 

20# along with this program. If not, see <http://www.gnu.org/licenses/>. 

21from __future__ import annotations 

22 

23"""Module defining Pipeline class and related methods. 

24""" 

25 

26__all__ = ["Pipeline", "TaskDef", "TaskDatasetTypes", "PipelineDatasetTypes", "LabelSpecifier"] 

27 

28# ------------------------------- 

29# Imports of standard modules -- 

30# ------------------------------- 

31from dataclasses import dataclass 

32import logging 

33from types import MappingProxyType 

34from typing import (ClassVar, Dict, Iterable, Iterator, Mapping, Set, Union, 

35 Generator, TYPE_CHECKING, Optional, Tuple) 

36 

37import copy 

38import re 

39import os 

40import urllib.parse 

41import warnings 

42 

43# ----------------------------- 

44# Imports for other modules -- 

45from lsst.daf.butler import DatasetType, NamedValueSet, Registry, SkyPixDimension, ButlerURI 

46from lsst.utils import doImport 

47from .configOverrides import ConfigOverrides 

48from .connections import iterConnections 

49from .pipelineTask import PipelineTask 

50 

51from . import pipelineIR 

52from . import pipeTools 

53 

54if TYPE_CHECKING: # Imports needed only for type annotations; may be circular. 

55 from lsst.obs.base import Instrument 

56 

57# ---------------------------------- 

58# Local non-exported definitions -- 

59# ---------------------------------- 

60 

61_LOG = logging.getLogger(__name__) 

62 

63# ------------------------ 

64# Exported definitions -- 

65# ------------------------ 

66 

67 

68@dataclass 

69class LabelSpecifier: 

70 """A structure to specify a subset of labels to load 

71 

72 This structure may contain a set of labels to be used in subsetting a 

73 pipeline, or a beginning and end point. Beginning or end may be empty, 

74 in which case the range will be a half open interval. Unlike python 

75 iteration bounds, end bounds are *INCLUDED*. Note that range based 

76 selection is not well defined for pipelines that are not linear in nature, 

77 and correct behavior is not guaranteed, or may vary from run to run. 

78 """ 

79 labels: Optional[Set[str]] = None 

80 begin: Optional[str] = None 

81 end: Optional[str] = None 

82 

83 def __post_init__(self): 

84 if self.labels is not None and (self.begin or self.end): 

85 raise ValueError("This struct can only be initialized with a labels set or " 

86 "a begin (and/or) end specifier") 

87 

88 
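# Editor's note: a minimal usage sketch for LabelSpecifier (not part of the
# original module); the task labels "isr" and "calibrate" are illustrative.
#
#     # Keep only two explicitly named tasks:
#     spec = LabelSpecifier(labels={"isr", "calibrate"})
#
#     # Or keep the dependency-ordered range from "isr" up to and including
#     # "calibrate" (end bound is INCLUDED, unlike Python slices):
#     spec = LabelSpecifier(begin="isr", end="calibrate")
#
#     subset = pipeline.subsetFromLabels(spec)  # `pipeline` is an existing Pipeline
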

89class TaskDef: 

90 """TaskDef is a collection of information about task needed by Pipeline. 

91 

92 The information includes task name, configuration object and optional 

93 task class. This class is just a collection of attributes and it exposes 

94 all of them so that attributes could potentially be modified in place 

95 (e.g. if configuration needs extra overrides). 

96 

97 Attributes 

98 ---------- 

99 taskName : `str` 

100 `PipelineTask` class name; currently it is not specified whether this 

101 is a fully-qualified name or a partial name (e.g. ``module.TaskClass``). 

102 The framework should be prepared to handle all cases. 

103 config : `lsst.pex.config.Config` 

104 Instance of the configuration class corresponding to this task class, 

105 usually with all overrides applied. This config will be frozen. 

106 taskClass : `type` or ``None`` 

107 `PipelineTask` class object, can be ``None``. If ``None`` then the 

108 framework will have to locate and load the class. 

109 label : `str`, optional 

110 Task label, usually a short string unique in a pipeline. 

111 """ 

112 def __init__(self, taskName, config, taskClass=None, label=""): 

113 self.taskName = taskName 

114 config.freeze() 

115 self.config = config 

116 self.taskClass = taskClass 

117 self.label = label 

118 self.connections = config.connections.ConnectionsClass(config=config) 

119 

120 @property 

121 def configDatasetName(self) -> str: 

122 """Name of a dataset type for configuration of this task (`str`) 

123 """ 

124 return self.label + "_config" 

125 

126 @property 

127 def metadataDatasetName(self) -> Optional[str]: 

128 """Name of a dataset type for metadata of this task, `None` if 

129 metadata is not to be saved (`str`) 

130 """ 

131 if self.config.saveMetadata: 

132 return self.label + "_metadata" 

133 else: 

134 return None 

135 

136 @property 

137 def logOutputDatasetName(self) -> Optional[str]: 

138 """Name of a dataset type for log output from this task, `None` if 

139 logs are not to be saved (`str`) 

140 """ 

141 if self.config.saveLogOutput: 

142 return self.label + "_log" 

143 else: 

144 return None 

145 

146 def __str__(self): 

147 rep = "TaskDef(" + self.taskName 

148 if self.label: 

149 rep += ", label=" + self.label 

150 rep += ")" 

151 return rep 

152 

153 def __eq__(self, other: object) -> bool: 

154 if not isinstance(other, TaskDef): 

155 return False 

156 # This does not consider equality of configs when determining equality 

157 # as config equality is a difficult thing to define. Should be updated 

158 # after DM-27847 

159 return self.taskClass == other.taskClass and self.label == other.label 

160 

161 def __hash__(self): 

162 return hash((self.taskClass, self.label)) 

163 

164 
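# Editor's note: a small sketch (not part of the original module) showing the
# dataset type names a TaskDef derives from its label; `SomeTask` and its
# config are hypothetical stand-ins for a real PipelineTask.
#
#     taskDef = TaskDef("mypackage.SomeTask", config=SomeTask.ConfigClass(),
#                       taskClass=SomeTask, label="someLabel")
#     taskDef.configDatasetName      # -> "someLabel_config"
#     taskDef.metadataDatasetName    # -> "someLabel_metadata" (or None)
#     taskDef.logOutputDatasetName   # -> "someLabel_log" (or None)
#
#     # Equality and hashing consider only (taskClass, label), not the config.
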

165class Pipeline: 

166 """A `Pipeline` is a representation of a series of tasks to run, and the 

167 configuration for those tasks. 

168 

169 Parameters 

170 ---------- 

171 description : `str` 

172 A description of what this pipeline does. 

173 """ 

174 def __init__(self, description: str): 

175 pipeline_dict = {"description": description, "tasks": {}} 

176 self._pipelineIR = pipelineIR.PipelineIR(pipeline_dict) 

177 

178 @classmethod 

179 def fromFile(cls, filename: str) -> Pipeline: 

180 """Load a pipeline defined in a pipeline yaml file. 

181 

182 Parameters 

183 ---------- 

184 filename: `str` 

185 A path that points to a pipeline defined in yaml format. This 

186 filename may also supply additional labels to be used in 

187 subsetting the loaded Pipeline. These labels are separated from 

188 the path by a \\#, and may be specified as a comma separated 

189 list, or a range denoted as beginning..end. Beginning or end may 

190 be empty, in which case the range will be a half open interval. 

191 Unlike python iteration bounds, end bounds are *INCLUDED*. Note 

192 that range based selection is not well defined for pipelines that 

193 are not linear in nature, and correct behavior is not guaranteed, 

194 or may vary from run to run. 

195 

196 Returns 

197 ------- 

198 pipeline: `Pipeline` 

199 The pipeline loaded from specified location with appropriate (if 

200 any) subsetting 

201 

202 Notes 

203 ----- 

204 This method attempts to prune any contracts that contain labels which 

205 are not in the declared subset of labels. This pruning is done using 

206 string-based matching due to the nature of contracts and may prune more 

207 than it should. 

208 """ 

209 return cls.from_uri(filename) 

210 

211 @classmethod 

212 def from_uri(cls, uri: Union[str, ButlerURI]) -> Pipeline: 

213 """Load a pipeline defined in a pipeline yaml file at a location 

214 specified by a URI. 

215 

216 Parameters 

217 ---------- 

218 uri: `str` or `ButlerURI` 

219 If a string is supplied this should be a URI path that points to a 

220 pipeline defined in yaml format. This uri may also supply 

221 additional labels to be used in subsetting the loaded Pipeline. 

222 These labels are separated from the path by a \\#, and may be 

223 specified as a comma separated list, or a range denoted as 

224 beginning..end. Beginning or end may be empty, in which case the 

225 range will be a half open interval. Unlike python iteration 

226 bounds, end bounds are *INCLUDED*. Note that range based selection 

227 is not well defined for pipelines that are not linear in nature, 

228 and correct behavior is not guaranteed, or may vary from run to 

229 run. The same specifiers can be used with a ButlerURI object, by 

230 being the sole contents of the fragment attribute. 

231 

232 Returns 

233 ------- 

234 pipeline: `Pipeline` 

235 The pipeline loaded from specified location with appropriate (if 

236 any) subsetting 

237 

238 Notes 

239 ----- 

240 This method attempts to prune any contracts that contain labels which 

241 are not in the declared subset of labels. This pruning is done using 

242 string-based matching due to the nature of contracts and may prune more 

243 than it should. 

244 """ 

245 # Split up the uri and any labels that were supplied 

246 uri, label_specifier = cls._parse_file_specifier(uri) 

247 pipeline: Pipeline = cls.fromIR(pipelineIR.PipelineIR.from_uri(uri)) 

248 

249 # If there are labels supplied, only keep those 

250 if label_specifier is not None: 

251 pipeline = pipeline.subsetFromLabels(label_specifier) 

252 return pipeline 

253 
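# Editor's note: illustrative loading examples (not part of the original
# module); the file name and labels are placeholders.
#
#     # Load the full pipeline:
#     full = Pipeline.from_uri("myPipeline.yaml")
#
#     # Load only two labeled tasks, or a label range (end bound INCLUDED):
#     two = Pipeline.from_uri("myPipeline.yaml#isr,calibrate")
#     rng = Pipeline.from_uri("myPipeline.yaml#isr..calibrate")
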

254 def subsetFromLabels(self, labelSpecifier: LabelSpecifier) -> Pipeline: 

255 """Subset a pipeline to contain only labels specified in labelSpecifier 

256 

257 Parameters 

258 ---------- 

259 labelSpecifier : `LabelSpecifier` 

260 Object containing labels that describe how to subset a pipeline. 

261 

262 Returns 

263 ------- 

264 pipeline : `Pipeline` 

265 A new pipeline object that is a subset of the old pipeline 

266 

267 Raises 

268 ------ 

269 ValueError 

270 Raised if there is an issue with specified labels 

271 

272 Notes 

273 ----- 

274 This method attempts to prune any contracts that contain labels which 

275 are not in the declared subset of labels. This pruning is done using 

276 string-based matching due to the nature of contracts and may prune more 

277 than it should. 

278 """ 

279 # Labels supplied as a set 

280 if labelSpecifier.labels: 

281 labelSet = labelSpecifier.labels 

282 # Labels supplied as a range, first create a list of all the labels 

283 # in the pipeline sorted according to task dependency. Then only 

284 # keep labels that lie between the supplied bounds 

285 else: 

286 # Create a copy of the pipeline to use when assessing the label 

287 # ordering. Use a dict for fast searching while preserving order. 

288 # Remove contracts so they do not fail in the expansion step. This 

289 # is needed because a user may only configure the tasks they intend 

290 # to run, which may cause some contracts to fail if the tasks they 

291 # reference are later dropped. 

292 pipeline = copy.deepcopy(self) 

293 pipeline._pipelineIR.contracts = [] 

294 labels = {taskdef.label: True for taskdef in pipeline.toExpandedPipeline()} 

295 

296 # Verify the bounds are in the labels 

297 if labelSpecifier.begin is not None: 

298 if labelSpecifier.begin not in labels: 

299 raise ValueError(f"Beginning of range subset, {labelSpecifier.begin}, not found in " 

300 "pipeline definition") 

301 if labelSpecifier.end is not None: 

302 if labelSpecifier.end not in labels: 

303 raise ValueError(f"End of range subset, {labelSpecifier.end}, not found in pipeline " 

304 "definition") 

305 

306 labelSet = set() 

307 for label in labels: 

308 if labelSpecifier.begin is not None: 

309 if label != labelSpecifier.begin: 

310 continue 

311 else: 

312 labelSpecifier.begin = None 

313 labelSet.add(label) 

314 if labelSpecifier.end is not None and label == labelSpecifier.end: 

315 break 

316 return Pipeline.fromIR(self._pipelineIR.subset_from_labels(labelSet)) 

317 

318 @staticmethod 

319 def _parse_file_specifier(uri: Union[str, ButlerURI] 

320 ) -> Tuple[ButlerURI, Optional[LabelSpecifier]]: 

321 """Split appart a uri and any possible label subsets 

322 """ 

323 if isinstance(uri, str): 

324 # This is to support legacy pipelines during transition 

325 uri, num_replace = re.subn("[:](?!\\/\\/)", "#", uri) 

326 if num_replace: 

327 warnings.warn(f"The pipeline file {uri} seems to use the legacy : to separate " 

328 "labels, this is deprecated and will be removed after June 2021, please use " 

329 "# instead.", 

330 category=FutureWarning) 

331 if uri.count("#") > 1: 

332 raise ValueError("Only one set of labels is allowed when specifying a pipeline to load") 

333 uri = ButlerURI(uri) 

334 label_subset = uri.fragment or None 

335 

336 specifier: Optional[LabelSpecifier] 

337 if label_subset is not None: 

338 label_subset = urllib.parse.unquote(label_subset) 

339 args: Dict[str, Union[Set[str], str, None]] 

340 # labels supplied as a list 

341 if ',' in label_subset: 

342 if '..' in label_subset: 

343 raise ValueError("Can only specify a list of labels or a range" 

344 "when loading a Pipline not both") 

345 args = {"labels": set(label_subset.split(","))} 

346 # labels supplied as a range 

347 elif '..' in label_subset: 

348 # Try to de-structure the labelSubset, this will fail if more 

349 # than one range is specified 

350 begin, end, *rest = label_subset.split("..") 

351 if rest: 

352 raise ValueError("Only one range can be specified when loading a pipeline") 

353 args = {"begin": begin if begin else None, "end": end if end else None} 

354 # Assume anything else is a single label 

355 else: 

356 args = {"labels": {label_subset}} 

357 

358 specifier = LabelSpecifier(**args) 

359 else: 

360 specifier = None 

361 

362 return uri, specifier 

363 
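# Editor's note: a sketch of how URI fragments map onto LabelSpecifier in
# _parse_file_specifier (labels and file name are illustrative):
#
#     "pipeline.yaml#isr,calibrate"  -> LabelSpecifier(labels={"isr", "calibrate"})
#     "pipeline.yaml#isr..calibrate" -> LabelSpecifier(begin="isr", end="calibrate")
#     "pipeline.yaml#isr"            -> LabelSpecifier(labels={"isr"})
#     "pipeline.yaml"                -> no specifier (None)
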

364 @classmethod 

365 def fromString(cls, pipeline_string: str) -> Pipeline: 

366 """Create a pipeline from string formatted as a pipeline document. 

367 

368 Parameters 

369 ---------- 

370 pipeline_string : `str` 

371 A string that is formatted like a pipeline document 

372 

373 Returns 

374 ------- 

375 pipeline: `Pipeline` 

376 """ 

377 pipeline = cls.fromIR(pipelineIR.PipelineIR.from_string(pipeline_string)) 

378 return pipeline 

379 

380 @classmethod 

381 def fromIR(cls, deserialized_pipeline: pipelineIR.PipelineIR) -> Pipeline: 

382 """Create a pipeline from an already created `PipelineIR` object. 

383 

384 Parameters 

385 ---------- 

386 deserialized_pipeline: `PipelineIR` 

387 An already created pipeline intermediate representation object 

388 

389 Returns 

390 ------- 

391 pipeline: `Pipeline` 

392 """ 

393 pipeline = cls.__new__(cls) 

394 pipeline._pipelineIR = deserialized_pipeline 

395 return pipeline 

396 

397 @classmethod 

398 def fromPipeline(cls, pipeline: Pipeline) -> Pipeline: 

399 """Create a new pipeline by copying an already existing `Pipeline`. 

400 

401 Parameters 

402 ---------- 

403 pipeline: `Pipeline` 

404 An already created `Pipeline` object to copy 

405 

406 Returns 

407 ------- 

408 pipeline: `Pipeline` 

409 """ 

410 return cls.fromIR(copy.deepcopy(pipeline._pipelineIR)) 

411 

412 def __str__(self) -> str: 

413 return str(self._pipelineIR) 

414 

415 def addInstrument(self, instrument: Union[Instrument, str]) -> None: 

416 """Add an instrument to the pipeline, or replace an instrument that is 

417 already defined. 

418 

419 Parameters 

420 ---------- 

421 instrument : `~lsst.obs.base.Instrument` or `str` 

422 Either an `~lsst.obs.base.Instrument` subclass object or a string 

423 corresponding to a fully qualified `~lsst.obs.base.Instrument` 

424 name. 

425 """ 

426 if isinstance(instrument, str): 

427 pass 

428 else: 

429 # TODO: assume that this is a subclass of Instrument, no type 

430 # checking 

431 instrument = f"{instrument.__module__}.{instrument.__qualname__}" 

432 self._pipelineIR.instrument = instrument 

433 

434 def getInstrument(self) -> Instrument: 

435 """Get the instrument from the pipeline. 

436 

437 Returns 

438 ------- 

439 instrument : `~lsst.obs.base.Instrument`, `str`, or `None` 

440 An `~lsst.obs.base.Instrument` subclass object, a string 

441 corresponding to a fully qualified `~lsst.obs.base.Instrument` 

442 name, or `None` if the pipeline does not have an instrument. 

443 """ 

444 return self._pipelineIR.instrument 

445 
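# Editor's note: an illustrative sketch (not part of the original module);
# the instrument class path below is a hypothetical fully qualified name.
#
#     pipeline.addInstrument("lsst.obs.example.ExampleInstrument")
#     pipeline.getInstrument()   # -> "lsst.obs.example.ExampleInstrument"
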

446 def addTask(self, task: Union[PipelineTask, str], label: str) -> None: 

447 """Add a new task to the pipeline, or replace a task that is already 

448 associated with the supplied label. 

449 

450 Parameters 

451 ---------- 

452 task: `PipelineTask` or `str` 

453 Either a derived class object of a `PipelineTask` or a string 

454 corresponding to a fully qualified `PipelineTask` name. 

455 label: `str` 

456 A label that is used to identify the `PipelineTask` being added 

457 """ 

458 if isinstance(task, str): 

459 taskName = task 

460 elif issubclass(task, PipelineTask): 

461 taskName = f"{task.__module__}.{task.__qualname__}" 

462 else: 

463 raise ValueError("task must be either a child class of PipelineTask or a string containing" 

464 " a fully qualified name to one") 

465 if not label: 

466 # In some cases (with a command line-generated pipeline) tasks can 

467 # be defined without a label, which is not acceptable; use the task 

468 # _DefaultName in that case. 

469 if isinstance(task, str): 

470 task = doImport(task) 

471 label = task._DefaultName 

472 self._pipelineIR.tasks[label] = pipelineIR.TaskIR(label, taskName) 

473 

474 def removeTask(self, label: str) -> None: 

475 """Remove a task from the pipeline. 

476 

477 Parameters 

478 ---------- 

479 label : `str` 

480 The label used to identify the task that is to be removed 

481 

482 Raises 

483 ------ 

484 KeyError 

485 If no task with that label exists in the pipeline 

486 

487 """ 

488 self._pipelineIR.tasks.pop(label) 

489 

490 def addConfigOverride(self, label: str, key: str, value: object) -> None: 

491 """Apply single config override. 

492 

493 Parameters 

494 ---------- 

495 label : `str` 

496 Label of the task. 

497 key: `str` 

498 Fully-qualified field name. 

499 value : object 

500 Value to be given to a field. 

501 """ 

502 self._addConfigImpl(label, pipelineIR.ConfigIR(rest={key: value})) 

503 

504 def addConfigFile(self, label: str, filename: str) -> None: 

505 """Add overrides from a specified file. 

506 

507 Parameters 

508 ---------- 

509 label : `str` 

510 The label used to identify the task associated with config to 

511 modify 

512 filename : `str` 

513 Path to the override file. 

514 """ 

515 self._addConfigImpl(label, pipelineIR.ConfigIR(file=[filename])) 

516 

517 def addConfigPython(self, label: str, pythonString: str) -> None: 

518 """Add Overrides by running a snippet of python code against a config. 

519 

520 Parameters 

521 ---------- 

522 label : `str` 

523 The label used to identify the task associated with config to 

524 modify. 

525 pythonString: `str` 

526 A string which is valid python code to be executed. This is done 

527 with config as the only local accessible value. 

528 """ 

529 self._addConfigImpl(label, pipelineIR.ConfigIR(python=pythonString)) 

530 
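# Editor's note: a minimal sketch of building a pipeline programmatically
# (not part of the original module); task classes, labels, keys, and file
# paths are illustrative.
#
#     pipeline = Pipeline("A demo pipeline")
#     pipeline.addTask("mypackage.tasks.IsrTask", label="isr")
#     pipeline.addConfigOverride("isr", "doDark", False)
#     pipeline.addConfigFile("isr", "isrOverrides.py")
#     pipeline.addConfigPython("isr", "config.doFlat = False")
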

531 def _addConfigImpl(self, label: str, newConfig: pipelineIR.ConfigIR) -> None: 

532 if label == "parameters": 

533 if newConfig.rest.keys() - self._pipelineIR.parameters.mapping.keys(): 

534 raise ValueError("Cannot override parameters that are not defined in pipeline") 

535 self._pipelineIR.parameters.mapping.update(newConfig.rest) 

536 if newConfig.file: 

537 raise ValueError("Setting parameters section with config file is not supported") 

538 if newConfig.python: 

539 raise ValueError("Setting parameters section using python block in unsupported") 

540 return 

541 if label not in self._pipelineIR.tasks: 

542 raise LookupError(f"There are no tasks labeled '{label}' in the pipeline") 

543 self._pipelineIR.tasks[label].add_or_update_config(newConfig) 

544 

545 def toFile(self, filename: str) -> None: 

546 self._pipelineIR.to_file(filename) 

547 

548 def write_to_uri(self, uri: Union[str, ButlerURI]) -> None: 

549 self._pipelineIR.write_to_uri(uri) 

550 

551 def toExpandedPipeline(self) -> Generator[TaskDef, None, None]: 

552 """Returns a generator of TaskDefs which can be used to create quantum 

553 graphs. 

554 

555 Returns 

556 ------- 

557 generator : generator of `TaskDef` 

558 The generator returned will be the sorted iterator of tasks which 

559 are to be used in constructing a quantum graph. 

560 

561 Raises 

562 ------ 

563 NotImplementedError 

564 If a dataId is supplied in a config block. This is in place for 

565 future use 

566 """ 

567 taskDefs = [] 

568 for label, taskIR in self._pipelineIR.tasks.items(): 

569 taskClass = doImport(taskIR.klass) 

570 taskName = taskClass.__qualname__ 

571 config = taskClass.ConfigClass() 

572 overrides = ConfigOverrides() 

573 if self._pipelineIR.instrument is not None: 

574 overrides.addInstrumentOverride(self._pipelineIR.instrument, taskClass._DefaultName) 

575 if taskIR.config is not None: 

576 for configIR in (configIr.formatted(self._pipelineIR.parameters) 

577 for configIr in taskIR.config): 

578 if configIR.dataId is not None: 

579 raise NotImplementedError("Specializing a config on a partial data id is not yet " 

580 "supported in Pipeline definition") 

581 # only apply override if it applies to everything 

582 if configIR.dataId is None: 

583 if configIR.file: 

584 for configFile in configIR.file: 

585 overrides.addFileOverride(os.path.expandvars(configFile)) 

586 if configIR.python is not None: 

587 overrides.addPythonOverride(configIR.python) 

588 for key, value in configIR.rest.items(): 

589 overrides.addValueOverride(key, value) 

590 overrides.applyTo(config) 

591 # This may need to be revisited 

592 try: 

593 config.validate() 

594 except Exception: 

595 _LOG.error("Configuration validation failed for task %s (%s)", label, taskName) 

596 raise 

597 taskDefs.append(TaskDef(taskName=taskName, config=config, taskClass=taskClass, label=label)) 

598 

599 # Let's evaluate the contracts 

600 if self._pipelineIR.contracts is not None: 

601 label_to_config = {x.label: x.config for x in taskDefs} 

602 for contract in self._pipelineIR.contracts: 

603 # Execute this on its own line so it can raise a good error 

604 # message if there were problems with the eval 

605 success = eval(contract.contract, None, label_to_config) 

606 if not success: 

607 extra_info = f": {contract.msg}" if contract.msg is not None else "" 

608 raise pipelineIR.ContractError(f"Contract(s) '{contract.contract}' were not " 

609 f"satisfied{extra_info}") 

610 

611 yield from pipeTools.orderPipeline(taskDefs) 

612 
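# Editor's note: an illustrative sketch of consuming the expanded pipeline
# (not part of the original module); `pipeline` is an existing Pipeline.
#
#     for taskDef in pipeline.toExpandedPipeline():
#         # Tasks come out in dependency-sorted order with frozen configs.
#         print(taskDef.label, taskDef.taskName, taskDef.configDatasetName)
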

613 def __len__(self): 

614 return len(self._pipelineIR.tasks) 

615 

616 def __eq__(self, other: object): 

617 if not isinstance(other, Pipeline): 

618 return False 

619 return self._pipelineIR == other._pipelineIR 

620 

621 

622@dataclass(frozen=True) 

623class TaskDatasetTypes: 

624 """An immutable struct that extracts and classifies the dataset types used 

625 by a `PipelineTask` 

626 """ 

627 

628 initInputs: NamedValueSet[DatasetType] 

629 """Dataset types that are needed as inputs in order to construct this Task. 

630 

631 Task-level `initInputs` may be classified as either 

632 `~PipelineDatasetTypes.initInputs` or 

633 `~PipelineDatasetTypes.initIntermediates` at the Pipeline level. 

634 """ 

635 

636 initOutputs: NamedValueSet[DatasetType] 

637 """Dataset types that may be written after constructing this Task. 

638 

639 Task-level `initOutputs` may be classified as either 

640 `~PipelineDatasetTypes.initOutputs` or 

641 `~PipelineDatasetTypes.initIntermediates` at the Pipeline level. 

642 """ 

643 

644 inputs: NamedValueSet[DatasetType] 

645 """Dataset types that are regular inputs to this Task. 

646 

647 If an input dataset needed for a Quantum cannot be found in the input 

648 collection(s) or produced by another Task in the Pipeline, that Quantum 

649 (and all dependent Quanta) will not be produced. 

650 

651 Task-level `inputs` may be classified as either 

652 `~PipelineDatasetTypes.inputs` or `~PipelineDatasetTypes.intermediates` 

653 at the Pipeline level. 

654 """ 

655 

656 prerequisites: NamedValueSet[DatasetType] 

657 """Dataset types that are prerequisite inputs to this Task. 

658 

659 Prerequisite inputs must exist in the input collection(s) before the 

660 pipeline is run, but do not constrain the graph - if a prerequisite is 

661 missing for a Quantum, `PrerequisiteMissingError` is raised. 

662 

663 Prerequisite inputs are not resolved until the second stage of 

664 QuantumGraph generation. 

665 """ 

666 

667 outputs: NamedValueSet[DatasetType] 

668 """Dataset types that are produced by this Task. 

669 

670 Task-level `outputs` may be classified as either 

671 `~PipelineDatasetTypes.outputs` or `~PipelineDatasetTypes.intermediates` 

672 at the Pipeline level. 

673 """ 

674 

675 @classmethod 

676 def fromTaskDef( 

677 cls, 

678 taskDef: TaskDef, 

679 *, 

680 registry: Registry, 

681 include_configs: bool = True, 

682 ) -> TaskDatasetTypes: 

683 """Extract and classify the dataset types from a single `PipelineTask`. 

684 

685 Parameters 

686 ---------- 

687 taskDef: `TaskDef` 

688 An instance of a `TaskDef` class for a particular `PipelineTask`. 

689 registry: `Registry` 

690 Registry used to construct normalized `DatasetType` objects and 

691 retrieve those that are incomplete. 

692 include_configs : `bool`, optional 

693 If `True` (default) include config dataset types as 

694 ``initOutputs``. 

695 

696 Returns 

697 ------- 

698 types: `TaskDatasetTypes` 

699 The dataset types used by this task. 

700 """ 

701 def makeDatasetTypesSet(connectionType: str, freeze: bool = True) -> NamedValueSet[DatasetType]: 

702 """Constructs a set of true `DatasetType` objects 

703 

704 Parameters 

705 ---------- 

706 connectionType : `str` 

707 Name of the connection type to produce a set for, corresponds 

708 to an attribute of type `list` on the connection class instance 

709 freeze : `bool`, optional 

710 If `True`, call `NamedValueSet.freeze` on the object returned. 

711 

712 Returns 

713 ------- 

714 datasetTypes : `NamedValueSet` 

715 A set of all datasetTypes which correspond to the input 

716 connection type specified in the connection class of this 

717 `PipelineTask` 

718 

719 Notes 

720 ----- 

721 This function is a closure over the variables ``registry`` and 

722 ``taskDef``. 

723 """ 

724 datasetTypes = NamedValueSet() 

725 for c in iterConnections(taskDef.connections, connectionType): 

726 dimensions = set(getattr(c, 'dimensions', set())) 

727 if "skypix" in dimensions: 

728 try: 

729 datasetType = registry.getDatasetType(c.name) 

730 except LookupError as err: 

731 raise LookupError( 

732 f"DatasetType '{c.name}' referenced by " 

733 f"{type(taskDef.connections).__name__} uses 'skypix' as a dimension " 

734 f"placeholder, but does not already exist in the registry. " 

735 f"Note that reference catalog names are now used as the dataset " 

736 f"type name instead of 'ref_cat'." 

737 ) from err 

738 rest1 = set(registry.dimensions.extract(dimensions - set(["skypix"])).names) 

739 rest2 = set(dim.name for dim in datasetType.dimensions 

740 if not isinstance(dim, SkyPixDimension)) 

741 if rest1 != rest2: 

742 raise ValueError(f"Non-skypix dimensions for dataset type {c.name} declared in " 

743 f"connections ({rest1}) are inconsistent with those in " 

744 f"registry's version of this dataset ({rest2}).") 

745 else: 

746 # Component dataset types are not explicitly in the 

747 # registry. This complicates consistency checks with 

748 # registry and requires we work out the composite storage 

749 # class. 

750 registryDatasetType = None 

751 try: 

752 registryDatasetType = registry.getDatasetType(c.name) 

753 except KeyError: 

754 compositeName, componentName = DatasetType.splitDatasetTypeName(c.name) 

755 parentStorageClass = DatasetType.PlaceholderParentStorageClass \ 

756 if componentName else None 

757 datasetType = c.makeDatasetType( 

758 registry.dimensions, 

759 parentStorageClass=parentStorageClass 

760 ) 

761 registryDatasetType = datasetType 

762 else: 

763 datasetType = c.makeDatasetType( 

764 registry.dimensions, 

765 parentStorageClass=registryDatasetType.parentStorageClass 

766 ) 

767 

768 if registryDatasetType and datasetType != registryDatasetType: 

769 try: 

770 # Explicitly check for storage class just to make 

771 # more specific message. 

772 _ = datasetType.storageClass 

773 except KeyError: 

774 raise ValueError("Storage class does not exist for supplied dataset type " 

775 f"{datasetType} for {taskDef.label}.") from None 

776 raise ValueError(f"Supplied dataset type ({datasetType}) inconsistent with " 

777 f"registry definition ({registryDatasetType}) " 

778 f"for {taskDef.label}.") 

779 datasetTypes.add(datasetType) 

780 if freeze: 

781 datasetTypes.freeze() 

782 return datasetTypes 

783 

784 # optionally add initOutput dataset for config 

785 initOutputs = makeDatasetTypesSet("initOutputs", freeze=False) 

786 if include_configs: 

787 initOutputs.add( 

788 DatasetType( 

789 taskDef.configDatasetName, 

790 registry.dimensions.empty, 

791 storageClass="Config", 

792 ) 

793 ) 

794 initOutputs.freeze() 

795 

796 # optionally add output dataset for metadata 

797 outputs = makeDatasetTypesSet("outputs", freeze=False) 

798 if taskDef.metadataDatasetName is not None: 

799 # Metadata is supposed to be of the PropertySet type, its 

800 # dimensions correspond to a task quantum 

801 dimensions = registry.dimensions.extract(taskDef.connections.dimensions) 

802 outputs |= {DatasetType(taskDef.metadataDatasetName, dimensions, "PropertySet")} 

803 if taskDef.logOutputDatasetName is not None: 

804 # Log output dimensions correspond to a task quantum. 

805 dimensions = registry.dimensions.extract(taskDef.connections.dimensions) 

806 outputs |= {DatasetType(taskDef.logOutputDatasetName, dimensions, "ButlerLogRecords")} 

807 

808 outputs.freeze() 

809 

810 return cls( 

811 initInputs=makeDatasetTypesSet("initInputs"), 

812 initOutputs=initOutputs, 

813 inputs=makeDatasetTypesSet("inputs"), 

814 prerequisites=makeDatasetTypesSet("prerequisiteInputs"), 

815 outputs=outputs, 

816 ) 

817 

818 
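# Editor's note: an illustrative sketch (not part of the original module);
# `taskDef` is an existing TaskDef and `registry` an existing
# lsst.daf.butler.Registry instance.
#
#     types = TaskDatasetTypes.fromTaskDef(taskDef, registry=registry)
#     types.inputs.names          # regular input dataset type names
#     types.prerequisites.names   # prerequisite input dataset type names
#     types.initOutputs.names     # includes "<label>_config" by default
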

819@dataclass(frozen=True) 

820class PipelineDatasetTypes: 

821 """An immutable struct that classifies the dataset types used in a 

822 `Pipeline`. 

823 """ 

824 

825 packagesDatasetName: ClassVar[str] = "packages" 

826 """Name of a dataset type used to save package versions. 

827 """ 

828 

829 initInputs: NamedValueSet[DatasetType] 

830 """Dataset types that are needed as inputs in order to construct the Tasks 

831 in this Pipeline. 

832 

833 This does not include dataset types that are produced when constructing 

834 other Tasks in the Pipeline (these are classified as `initIntermediates`). 

835 """ 

836 

837 initOutputs: NamedValueSet[DatasetType] 

838 """Dataset types that may be written after constructing the Tasks in this 

839 Pipeline. 

840 

841 This does not include dataset types that are also used as inputs when 

842 constructing other Tasks in the Pipeline (these are classified as 

843 `initIntermediates`). 

844 """ 

845 

846 initIntermediates: NamedValueSet[DatasetType] 

847 """Dataset types that are both used when constructing one or more Tasks 

848 in the Pipeline and produced as a side-effect of constructing another 

849 Task in the Pipeline. 

850 """ 

851 

852 inputs: NamedValueSet[DatasetType] 

853 """Dataset types that are regular inputs for the full pipeline. 

854 

855 If an input dataset needed for a Quantum cannot be found in the input 

856 collection(s), that Quantum (and all dependent Quanta) will not be 

857 produced. 

858 """ 

859 

860 prerequisites: NamedValueSet[DatasetType] 

861 """Dataset types that are prerequisite inputs for the full Pipeline. 

862 

863 Prerequisite inputs must exist in the input collection(s) before the 

864 pipeline is run, but do not constrain the graph - if a prerequisite is 

865 missing for a Quantum, `PrerequisiteMissingError` is raised. 

866 

867 Prerequisite inputs are not resolved until the second stage of 

868 QuantumGraph generation. 

869 """ 

870 

871 intermediates: NamedValueSet[DatasetType] 

872 """Dataset types that are output by one Task in the Pipeline and consumed 

873 as inputs by one or more other Tasks in the Pipeline. 

874 """ 

875 

876 outputs: NamedValueSet[DatasetType] 

877 """Dataset types that are output by a Task in the Pipeline and not consumed 

878 by any other Task in the Pipeline. 

879 """ 

880 

881 byTask: Mapping[str, TaskDatasetTypes] 

882 """Per-Task dataset types, keyed by label in the `Pipeline`. 

883 

884 This is guaranteed to be zip-iterable with the `Pipeline` itself (assuming 

885 neither has been modified since the dataset types were extracted, of 

886 course). 

887 """ 

888 

889 @classmethod 

890 def fromPipeline( 

891 cls, 

892 pipeline: Union[Pipeline, Iterable[TaskDef]], 

893 *, 

894 registry: Registry, 

895 include_configs: bool = True, 

896 include_packages: bool = True, 

897 ) -> PipelineDatasetTypes: 

898 """Extract and classify the dataset types from all tasks in a 

899 `Pipeline`. 

900 

901 Parameters 

902 ---------- 

903 pipeline: `Pipeline` or `Iterable` [ `TaskDef` ] 

904 A dependency-ordered collection of tasks that can be run 

905 together. 

906 registry: `Registry` 

907 Registry used to construct normalized `DatasetType` objects and 

908 retrieve those that are incomplete. 

909 include_configs : `bool`, optional 

910 If `True` (default) include config dataset types as 

911 ``initOutputs``. 

912 include_packages : `bool`, optional 

913 If `True` (default) include the dataset type for software package 

914 versions in ``initOutputs``. 

915 

916 Returns 

917 ------- 

918 types: `PipelineDatasetTypes` 

919 The dataset types used by this `Pipeline`. 

920 

921 Raises 

922 ------ 

923 ValueError 

924 Raised if Tasks are inconsistent about which datasets are marked 

925 prerequisite. This indicates that the Tasks cannot be run as part 

926 of the same `Pipeline`. 

927 """ 

928 allInputs = NamedValueSet() 

929 allOutputs = NamedValueSet() 

930 allInitInputs = NamedValueSet() 

931 allInitOutputs = NamedValueSet() 

932 prerequisites = NamedValueSet() 

933 byTask = dict() 

934 if include_packages: 

935 allInitOutputs.add( 

936 DatasetType( 

937 cls.packagesDatasetName, 

938 registry.dimensions.empty, 

939 storageClass="Packages", 

940 ) 

941 ) 

942 if isinstance(pipeline, Pipeline): 

943 pipeline = pipeline.toExpandedPipeline() 

944 for taskDef in pipeline: 

945 thisTask = TaskDatasetTypes.fromTaskDef( 

946 taskDef, 

947 registry=registry, 

948 include_configs=include_configs, 

949 ) 

950 allInitInputs |= thisTask.initInputs 

951 allInitOutputs |= thisTask.initOutputs 

952 allInputs |= thisTask.inputs 

953 prerequisites |= thisTask.prerequisites 

954 allOutputs |= thisTask.outputs 

955 byTask[taskDef.label] = thisTask 

956 if not prerequisites.isdisjoint(allInputs): 

957 raise ValueError("{} marked as both prerequisites and regular inputs".format( 

958 {dt.name for dt in allInputs & prerequisites} 

959 )) 

960 if not prerequisites.isdisjoint(allOutputs): 

961 raise ValueError("{} marked as both prerequisites and outputs".format( 

962 {dt.name for dt in allOutputs & prerequisites} 

963 )) 

964 # Make sure that components which are marked as inputs get treated as 

965 # intermediates if there is an output which produces the composite 

966 # containing the component 

967 intermediateComponents = NamedValueSet() 

968 intermediateComposites = NamedValueSet() 

969 outputNameMapping = {dsType.name: dsType for dsType in allOutputs} 

970 for dsType in allInputs: 

971 # get the name of a possible component 

972 name, component = dsType.nameAndComponent() 

973 # if there is a component name, that means this is a component 

974 # DatasetType, if there is an output which produces the parent of 

975 # this component, treat this input as an intermediate 

976 if component is not None: 

977 if name in outputNameMapping: 

978 if outputNameMapping[name].dimensions != dsType.dimensions: 

979 raise ValueError(f"Component dataset type {dsType.name} has different " 

980 f"dimensions ({dsType.dimensions}) than its parent " 

981 f"({outputNameMapping[name].dimensions}).") 

982 composite = DatasetType(name, dsType.dimensions, outputNameMapping[name].storageClass, 

983 universe=registry.dimensions) 

984 intermediateComponents.add(dsType) 

985 intermediateComposites.add(composite) 

986 

987 def checkConsistency(a: NamedValueSet, b: NamedValueSet): 

988 common = a.names & b.names 

989 for name in common: 

990 if a[name] != b[name]: 

991 raise ValueError(f"Conflicting definitions for dataset type: {a[name]} != {b[name]}.") 

992 

993 checkConsistency(allInitInputs, allInitOutputs) 

994 checkConsistency(allInputs, allOutputs) 

995 checkConsistency(allInputs, intermediateComposites) 

996 checkConsistency(allOutputs, intermediateComposites) 

997 

998 def frozen(s: NamedValueSet) -> NamedValueSet: 

999 s.freeze() 

1000 return s 

1001 

1002 return cls( 

1003 initInputs=frozen(allInitInputs - allInitOutputs), 

1004 initIntermediates=frozen(allInitInputs & allInitOutputs), 

1005 initOutputs=frozen(allInitOutputs - allInitInputs), 

1006 inputs=frozen(allInputs - allOutputs - intermediateComponents), 

1007 intermediates=frozen(allInputs & allOutputs | intermediateComponents), 

1008 outputs=frozen(allOutputs - allInputs - intermediateComposites), 

1009 prerequisites=frozen(prerequisites), 

1010 byTask=MappingProxyType(byTask), # MappingProxyType -> frozen view of dict for immutability 

1011 ) 

1012 
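# Editor's note: an illustrative sketch (not part of the original module);
# `pipeline` is an existing Pipeline and `registry` an existing Registry.
#
#     dsTypes = PipelineDatasetTypes.fromPipeline(pipeline, registry=registry)
#     dsTypes.inputs          # overall (pure) inputs to the pipeline
#     dsTypes.intermediates   # produced by one task and consumed by another
#     dsTypes.outputs         # produced but never consumed within the pipeline
#     dsTypes.byTask["isr"]   # per-task classification; the label is illustrative
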

1013 @classmethod 

1014 def initOutputNames(cls, pipeline: Union[Pipeline, Iterable[TaskDef]], *, 

1015 include_configs: bool = True, include_packages: bool = True) -> Iterator[str]: 

1016 """Return the names of dataset types ot task initOutputs, Configs, 

1017 and package versions for a pipeline. 

1018 

1019 Parameters 

1020 ---------- 

1021 pipeline: `Pipeline` or `Iterable` [ `TaskDef` ] 

1022 A `Pipeline` instance or collection of `TaskDef` instances. 

1023 include_configs : `bool`, optional 

1024 If `True` (default) include config dataset types. 

1025 include_packages : `bool`, optional 

1026 If `True` (default) include the dataset type for package versions. 

1027 

1028 Yields 

1029 ------ 

1030 datasetTypeName : `str` 

1031 Name of the dataset type. 

1032 """ 

1033 if include_packages: 

1034 # Package versions dataset type 

1035 yield cls.packagesDatasetName 

1036 

1037 if isinstance(pipeline, Pipeline): 

1038 pipeline = pipeline.toExpandedPipeline() 

1039 

1040 for taskDef in pipeline: 

1041 

1042 # all task InitOutputs 

1043 for name in taskDef.connections.initOutputs: 

1044 attribute = getattr(taskDef.connections, name) 

1045 yield attribute.name 

1046 

1047 # config dataset name 

1048 if include_configs: 

1049 yield taskDef.configDatasetName
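
# Editor's note: an illustrative sketch (not part of the original module);
# `pipeline` is an existing Pipeline.
#
#     names = set(PipelineDatasetTypes.initOutputNames(pipeline))
#     # Contains "packages", each task's connection-declared initOutputs,
#     # and each "<label>_config" dataset type name by default.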