Coverage for python/lsst/pipe/base/pipeline.py: 19%


1# This file is part of pipe_base. 

2# 

3# Developed for the LSST Data Management System. 

4# This product includes software developed by the LSST Project 

5# (http://www.lsst.org). 

6# See the COPYRIGHT file at the top-level directory of this distribution 

7# for details of code ownership. 

8# 

9# This program is free software: you can redistribute it and/or modify 

10# it under the terms of the GNU General Public License as published by 

11# the Free Software Foundation, either version 3 of the License, or 

12# (at your option) any later version. 

13# 

14# This program is distributed in the hope that it will be useful, 

15# but WITHOUT ANY WARRANTY; without even the implied warranty of 

16# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the 

17# GNU General Public License for more details. 

18# 

19# You should have received a copy of the GNU General Public License 

20# along with this program. If not, see <http://www.gnu.org/licenses/>. 

21from __future__ import annotations 

22 

23"""Module defining Pipeline class and related methods. 

24""" 

25 

26__all__ = ["Pipeline", "TaskDef", "TaskDatasetTypes", "PipelineDatasetTypes", "LabelSpecifier"] 

27 

28# ------------------------------- 

29# Imports of standard modules -- 

30# ------------------------------- 

31from dataclasses import dataclass 

32import logging 

33from types import MappingProxyType 

34from typing import (ClassVar, Dict, Iterable, Iterator, Mapping, Set, Union, 

35 Generator, TYPE_CHECKING, Optional, Tuple) 

36 

37import copy 

38import re 

39import os 

40import urllib.parse 

41import warnings 

42 

43# ----------------------------- 

44# Imports for other modules -- 

45from lsst.daf.butler import DatasetType, NamedValueSet, Registry, SkyPixDimension, ButlerURI 

46from lsst.utils import doImport 

47from .configOverrides import ConfigOverrides 

48from .connections import iterConnections 

49from .pipelineTask import PipelineTask 

50 

51from . import pipelineIR 

52from . import pipeTools 

53 

54 if TYPE_CHECKING: # Imports needed only for type annotations; may be circular. 

55 from lsst.obs.base import Instrument 

56 

57# ---------------------------------- 

58# Local non-exported definitions -- 

59# ---------------------------------- 

60 

61_LOG = logging.getLogger(__name__) 

62 

63# ------------------------ 

64# Exported definitions -- 

65# ------------------------ 

66 

67 

68@dataclass 

69class LabelSpecifier: 

70 """A structure to specify a subset of labels to load 

71 

72 This structure may contain a set of labels to be used in subsetting a 

73 pipeline, or a beginning and end point. Beginning or end may be empty, 

74 in which case the range will be a half open interval. Unlike python 

75 iteration bounds, end bounds are *INCLUDED*. Note that range based 

76 selection is not well defined for pipelines that are not linear in nature, 

77 and correct behavior is not guaranteed, or may vary from run to run. 
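
Examples
--------
A minimal sketch of the two supported modes; the label names used here are
hypothetical::

    # Select an explicit set of labels.
    spec = LabelSpecifier(labels={"isr", "characterizeImage"})

    # Select a range of labels; the end bound is included.
    spec = LabelSpecifier(begin="isr", end="calibrate")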

78 """ 

79 labels: Optional[Set[str]] = None 

80 begin: Optional[str] = None 

81 end: Optional[str] = None 

82 

83 def __post_init__(self): 

84 if self.labels is not None and (self.begin or self.end): 

85 raise ValueError("This struct can only be initialized with a labels set or " 

86 "a begin (and/or) end specifier") 

87 

88 

89class TaskDef: 

90 """TaskDef is a collection of information about task needed by Pipeline. 

91 

92 The information includes task name, configuration object and optional 

93 task class. This class is just a collection of attributes and it exposes 

94 all of them so that attributes could potentially be modified in place 

95 (e.g. if configuration needs extra overrides). 

96 

97 Attributes 

98 ---------- 

99 taskName : `str`, optional 

100 `PipelineTask` class name, currently it is not specified whether this 

101 is a fully-qualified name or partial name (e.g. ``module.TaskClass``). 

102 Framework should be prepared to handle all cases. If not provided, 

103 ``taskClass`` must be, and ``taskClass.__name__`` is used. 

104 config : `lsst.pex.config.Config`, optional 

105 Instance of the configuration class corresponding to this task class, 

106 usually with all overrides applied. This config will be frozen. If 

107 not provided, ``taskClass`` must be provided and 

108 ``taskClass.ConfigClass()`` will be used. 

109 taskClass : `type`, optional 

110 `PipelineTask` class object, can be ``None``. If ``None`` then 

111 framework will have to locate and load class. 

112 label : `str`, optional 

113 Task label, usually a short string unique in a pipeline. If not 

114 provided, ``taskClass`` must be, and ``taskClass._DefaultName`` will 

115 be used. 
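
Examples
--------
A minimal sketch, assuming ``MyTask`` is some `PipelineTask` subclass (a
hypothetical name)::

    # taskName, config and label are derived from the class when omitted.
    taskDef = TaskDef(taskClass=MyTask, label="myTask")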

116 """ 

117 def __init__(self, taskName=None, config=None, taskClass=None, label=None): 

118 if taskName is None: 

119 if taskClass is None: 

120 raise ValueError("At least one of `taskName` and `taskClass` must be provided.") 

121 taskName = taskClass.__name__ 

122 if config is None: 

123 if taskClass is None: 

124 raise ValueError("`taskClass` must be provided if `config` is not.") 

125 config = taskClass.ConfigClass() 

126 if label is None: 

127 if taskClass is None: 

128 raise ValueError("`taskClass` must be provided if `label` is not.") 

129 label = taskClass._DefaultName 

130 self.taskName = taskName 

131 config.freeze() 

132 self.config = config 

133 self.taskClass = taskClass 

134 self.label = label 

135 self.connections = config.connections.ConnectionsClass(config=config) 

136 

137 @property 

138 def configDatasetName(self) -> str: 

139 """Name of a dataset type for configuration of this task (`str`) 

140 """ 

141 return self.label + "_config" 

142 

143 @property 

144 def metadataDatasetName(self) -> Optional[str]: 

145 """Name of a dataset type for metadata of this task, `None` if 

146 metadata is not to be saved (`str`) 

147 """ 

148 if self.config.saveMetadata: 

149 return self.label + "_metadata" 

150 else: 

151 return None 

152 

153 @property 

154 def logOutputDatasetName(self) -> Optional[str]: 

155 """Name of a dataset type for log output from this task, `None` if 

156 logs are not to be saved (`str`) 

157 """ 

158 if self.config.saveLogOutput: 

159 return self.label + "_log" 

160 else: 

161 return None 

162 

163 def __str__(self): 

164 rep = "TaskDef(" + self.taskName 

165 if self.label: 

166 rep += ", label=" + self.label 

167 rep += ")" 

168 return rep 

169 

170 def __eq__(self, other: object) -> bool: 

171 if not isinstance(other, TaskDef): 

172 return False 

173 # This does not consider equality of configs when determining equality 

174 # as config equality is a difficult thing to define. Should be updated 

175 # after DM-27847 

176 return self.taskClass == other.taskClass and self.label == other.label 

177 

178 def __hash__(self): 

179 return hash((self.taskClass, self.label)) 

180 

181 

182class Pipeline: 

183 """A `Pipeline` is a representation of a series of tasks to run, and the 

184 configuration for those tasks. 

185 

186 Parameters 

187 ---------- 

188 description : `str` 

189 A description of what this pipeline does. 
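
Examples
--------
A minimal sketch of building a pipeline programmatically; the task path,
label, and config field are illustrative, not prescribed by this class::

    pipeline = Pipeline("An example pipeline")
    pipeline.addTask("lsst.ip.isr.IsrTask", "isr")
    pipeline.addConfigOverride("isr", "doWrite", False)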

190 """ 

191 def __init__(self, description: str): 

192 pipeline_dict = {"description": description, "tasks": {}} 

193 self._pipelineIR = pipelineIR.PipelineIR(pipeline_dict) 

194 

195 @classmethod 

196 def fromFile(cls, filename: str) -> Pipeline: 

197 """Load a pipeline defined in a pipeline yaml file. 

198 

199 Parameters 

200 ---------- 

201 filename: `str` 

202 A path that points to a pipeline defined in yaml format. This 

203 filename may also supply additional labels to be used in 

204 subsetting the loaded Pipeline. These labels are separated from 

205 the path by a \\#, and may be specified as a comma separated 

206 list, or a range denoted as beginning..end. Beginning or end may 

207 be empty, in which case the range will be a half open interval. 

208 Unlike python iteration bounds, end bounds are *INCLUDED*. Note 

209 that range based selection is not well defined for pipelines that 

210 are not linear in nature, and correct behavior is not guaranteed, 

211 or may vary from run to run. 

212 

213 Returns 

214 ------- 

215 pipeline: `Pipeline` 

216 The pipeline loaded from the specified location with appropriate (if 

217 any) subsetting. 

218 

219 Notes 

220 ----- 

221 This method attempts to prune any contracts that contain labels which 

222 are not in the declared subset of labels. This pruning is done using a 

223 string based matching due to the nature of contracts and may prune more 

224 than it should. 
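
Examples
--------
Illustrative sketches; the file name and labels are hypothetical::

    # Load a whole pipeline.
    pipeline = Pipeline.fromFile("DRP.yaml")

    # Load only a subset of labels.
    pipeline = Pipeline.fromFile("DRP.yaml#isr,characterizeImage")

    # Load a range of labels (the end bound is included).
    pipeline = Pipeline.fromFile("DRP.yaml#isr..calibrate")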

225 """ 

226 return cls.from_uri(filename) 

227 

228 @classmethod 

229 def from_uri(cls, uri: Union[str, ButlerURI]) -> Pipeline: 

230 """Load a pipeline defined in a pipeline yaml file at a location 

231 specified by a URI. 

232 

233 Parameters 

234 ---------- 

235 uri: `str` or `ButlerURI` 

236 If a string is supplied this should be a URI path that points to a 

237 pipeline defined in yaml format. This uri may also supply 

238 additional labels to be used in subsetting the loaded Pipeline. 

239 These labels are separated from the path by a \\#, and may be 

240 specified as a comma separated list, or a range denoted as 

241 beginning..end. Beginning or end may be empty, in which case the 

242 range will be a half open interval. Unlike python iteration 

243 bounds, end bounds are *INCLUDED*. Note that range based selection 

244 is not well defined for pipelines that are not linear in nature, 

245 and correct behavior is not guaranteed, or may vary from run to 

246 run. The same specifiers can be used with a ButlerURI object, by 

247 being the sole contents in the fragments attribute. 

248 

249 Returns 

250 ------- 

251 pipeline: `Pipeline` 

252 The pipeline loaded from the specified location with appropriate (if 

253 any) subsetting. 

254 

255 Notes 

256 ----- 

257 This method attempts to prune any contracts that contain labels which 

258 are not in the declared subset of labels. This pruning is done using a 

259 string based matching due to the nature of contracts and may prune more 

260 than it should. 

261 """ 

262 # Split up the uri and any labels that were supplied 

263 uri, label_specifier = cls._parse_file_specifier(uri) 

264 pipeline: Pipeline = cls.fromIR(pipelineIR.PipelineIR.from_uri(uri)) 

265 

266 # If there are labels supplied, only keep those 

267 if label_specifier is not None: 

268 pipeline = pipeline.subsetFromLabels(label_specifier) 

269 return pipeline 

270 

271 def subsetFromLabels(self, labelSpecifier: LabelSpecifier) -> Pipeline: 

272 """Subset a pipeline to contain only labels specified in labelSpecifier 

273 

274 Parameters 

275 ---------- 

276 labelSpecifier : `LabelSpecifier` 

277 Object containing labels that describes how to subset a pipeline. 

278 

279 Returns 

280 ------- 

281 pipeline : `Pipeline` 

282 A new pipeline object that is a subset of the old pipeline 

283 

284 Raises 

285 ------ 

286 ValueError 

287 Raised if there is an issue with specified labels 

288 

289 Notes 

290 ----- 

291 This method attempts to prune any contracts that contain labels which 

292 are not in the declared subset of labels. This pruning is done using a 

293 string based matching due to the nature of contracts and may prune more 

294 than it should. 
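
Examples
--------
A minimal sketch, assuming ``pipeline`` is an existing `Pipeline`; the label
names are hypothetical::

    subset = pipeline.subsetFromLabels(LabelSpecifier(labels={"isr"}))
    subset = pipeline.subsetFromLabels(LabelSpecifier(begin="isr", end="calibrate"))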

295 """ 

296 # Labels supplied as a set 

297 if labelSpecifier.labels: 

298 labelSet = labelSpecifier.labels 

299 # Labels supplied as a range, first create a list of all the labels 

300 # in the pipeline sorted according to task dependency. Then only 

301 # keep labels that lie between the supplied bounds 

302 else: 

303 # Create a copy of the pipeline to use when assessing the label 

304 # ordering. Use a dict for fast searching while preserving order. 

305 # Remove contracts so they do not fail in the expansion step. This 

306 # is needed because a user may only configure the tasks they intend 

307 # to run, which may cause some contracts to fail if they will later 

308 # be dropped 

309 pipeline = copy.deepcopy(self) 

310 pipeline._pipelineIR.contracts = [] 

311 labels = {taskdef.label: True for taskdef in pipeline.toExpandedPipeline()} 

312 

313 # Verify the bounds are in the labels 

314 if labelSpecifier.begin is not None: 

315 if labelSpecifier.begin not in labels: 

316 raise ValueError(f"Beginning of range subset, {labelSpecifier.begin}, not found in " 

317 "pipeline definition") 

318 if labelSpecifier.end is not None: 

319 if labelSpecifier.end not in labels: 

320 raise ValueError(f"End of range subset, {labelSpecifier.end}, not found in pipeline " 

321 "definition") 

322 

323 labelSet = set() 

324 for label in labels: 

325 if labelSpecifier.begin is not None: 

326 if label != labelSpecifier.begin: 

327 continue 

328 else: 

329 labelSpecifier.begin = None 

330 labelSet.add(label) 

331 if labelSpecifier.end is not None and label == labelSpecifier.end: 

332 break 

333 return Pipeline.fromIR(self._pipelineIR.subset_from_labels(labelSet)) 

334 

335 @staticmethod 

336 def _parse_file_specifier(uri: Union[str, ButlerURI] 

337 ) -> Tuple[ButlerURI, Optional[LabelSpecifier]]: 

338 """Split appart a uri and any possible label subsets 

339 """ 

340 if isinstance(uri, str): 

341 # This is to support legacy pipelines during transition 

342 uri, num_replace = re.subn("[:](?!\\/\\/)", "#", uri) 

343 if num_replace: 

344 warnings.warn(f"The pipeline file {uri} seems to use the legacy : to separate " 

345 "labels, this is deprecated and will be removed after June 2021, please use " 

346 "# instead.", 

347 category=FutureWarning) 

348 if uri.count("#") > 1: 

349 raise ValueError("Only one set of labels is allowed when specifying a pipeline to load") 

350 uri = ButlerURI(uri) 

351 label_subset = uri.fragment or None 

352 

353 specifier: Optional[LabelSpecifier] 

354 if label_subset is not None: 

355 label_subset = urllib.parse.unquote(label_subset) 

356 args: Dict[str, Union[Set[str], str, None]] 

357 # labels supplied as a list 

358 if ',' in label_subset: 

359 if '..' in label_subset: 

360 raise ValueError("Can only specify a list of labels or a range" 

361 "when loading a Pipline not both") 

362 args = {"labels": set(label_subset.split(","))} 

363 # labels supplied as a range 

364 elif '..' in label_subset: 

365 # Try to de-structure the labelSubset, this will fail if more 

366 # than one range is specified 

367 begin, end, *rest = label_subset.split("..") 

368 if rest: 

369 raise ValueError("Only one range can be specified when loading a pipeline") 

370 args = {"begin": begin if begin else None, "end": end if end else None} 

371 # Assume anything else is a single label 

372 else: 

373 args = {"labels": {label_subset}} 

374 

375 specifier = LabelSpecifier(**args) 

376 else: 

377 specifier = None 

378 

379 return uri, specifier 

380 

381 @classmethod 

382 def fromString(cls, pipeline_string: str) -> Pipeline: 

383 """Create a pipeline from string formatted as a pipeline document. 

384 

385 Parameters 

386 ---------- 

387 pipeline_string : `str` 

388 A string that is formatted like a pipeline document 

389 

390 Returns 

391 ------- 

392 pipeline: `Pipeline` 
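
Examples
--------
A minimal sketch with a trivial pipeline document; the task class path is
illustrative::

    pipeline_str = (
        "description: A trivial example pipeline\n"
        "tasks:\n"
        "  isr: lsst.ip.isr.IsrTask\n"
    )
    pipeline = Pipeline.fromString(pipeline_str)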

393 """ 

394 pipeline = cls.fromIR(pipelineIR.PipelineIR.from_string(pipeline_string)) 

395 return pipeline 

396 

397 @classmethod 

398 def fromIR(cls, deserialized_pipeline: pipelineIR.PipelineIR) -> Pipeline: 

399 """Create a pipeline from an already created `PipelineIR` object. 

400 

401 Parameters 

402 ---------- 

403 deserialized_pipeline: `PipelineIR` 

404 An already created pipeline intermediate representation object 

405 

406 Returns 

407 ------- 

408 pipeline: `Pipeline` 

409 """ 

410 pipeline = cls.__new__(cls) 

411 pipeline._pipelineIR = deserialized_pipeline 

412 return pipeline 

413 

414 @classmethod 

415 def fromPipeline(cls, pipeline: Pipeline) -> Pipeline: 

416 """Create a new pipeline by copying an already existing `Pipeline`. 

417 

418 Parameters 

419 ---------- 

420 pipeline: `Pipeline` 

421 An existing `Pipeline` object to copy. 

422 

423 Returns 

424 ------- 

425 pipeline: `Pipeline` 

426 """ 

427 return cls.fromIR(copy.deepcopy(pipeline._pipelineIR)) 

428 

429 def __str__(self) -> str: 

430 return str(self._pipelineIR) 

431 

432 def addInstrument(self, instrument: Union[Instrument, str]) -> None: 

433 """Add an instrument to the pipeline, or replace an instrument that is 

434 already defined. 

435 

436 Parameters 

437 ---------- 

438 instrument : `~lsst.obs.base.Instrument` or `str` 

439 Either a derived class object of `lsst.obs.base.Instrument` or 

440 a string corresponding to a fully qualified 

441 `lsst.obs.base.Instrument` name. 
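
Examples
--------
A minimal sketch; the instrument class path shown is illustrative::

    pipeline.addInstrument("lsst.obs.subaru.HyperSuprimeCam")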

442 """ 

443 if isinstance(instrument, str): 

444 pass 

445 else: 

446 # TODO: assume that this is a subclass of Instrument, no type 

447 # checking 

448 instrument = f"{instrument.__module__}.{instrument.__qualname__}" 

449 self._pipelineIR.instrument = instrument 

450 

451 def getInstrument(self) -> Instrument: 

452 """Get the instrument from the pipeline. 

453 

454 Returns 

455 ------- 

456 instrument : `~lsst.obs.base.Instrument`, `str`, or `None` 

457 A derived class object of `lsst.obs.base.Instrument`, a string 

458 corresponding to a fully qualified `lsst.obs.base.Instrument` 

459 name, or `None` if the pipeline does not have an instrument. 

460 """ 

461 return self._pipelineIR.instrument 

462 

463 def addTask(self, task: Union[PipelineTask, str], label: str) -> None: 

464 """Add a new task to the pipeline, or replace a task that is already 

465 associated with the supplied label. 

466 

467 Parameters 

468 ---------- 

469 task: `PipelineTask` or `str` 

470 Either a derived class object of a `PipelineTask` or a string 

471 corresponding to a fully qualified `PipelineTask` name. 

472 label: `str` 

473 A label that is used to identify the `PipelineTask` being added 
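
Examples
--------
A minimal sketch; the task name and label are illustrative::

    pipeline.addTask("lsst.ip.isr.IsrTask", "isr")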

474 """ 

475 if isinstance(task, str): 

476 taskName = task 

477 elif issubclass(task, PipelineTask): 

478 taskName = f"{task.__module__}.{task.__qualname__}" 

479 else: 

480 raise ValueError("task must be either a child class of PipelineTask or a string containing" 

481 " a fully qualified name to one") 

482 if not label: 

483 # In some cases (e.g. with a command line-generated pipeline) tasks 

484 # can be defined without a label, which is not acceptable; use the 

485 # task's _DefaultName in that case. 

486 if isinstance(task, str): 

487 task = doImport(task) 

488 label = task._DefaultName 

489 self._pipelineIR.tasks[label] = pipelineIR.TaskIR(label, taskName) 

490 

491 def removeTask(self, label: str) -> None: 

492 """Remove a task from the pipeline. 

493 

494 Parameters 

495 ---------- 

496 label : `str` 

497 The label used to identify the task that is to be removed 

498 

499 Raises 

500 ------ 

501 KeyError 

502 If no task with that label exists in the pipeline 

503 

504 """ 

505 self._pipelineIR.tasks.pop(label) 

506 

507 def addConfigOverride(self, label: str, key: str, value: object) -> None: 

508 """Apply single config override. 

509 

510 Parameters 

511 ---------- 

512 label : `str` 

513 Label of the task. 

514 key: `str` 

515 Fully-qualified field name. 

516 value : object 

517 Value to be given to a field. 
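
Examples
--------
A minimal sketch; the label and the config field name are illustrative::

    pipeline.addConfigOverride("isr", "doWrite", False)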

518 """ 

519 self._addConfigImpl(label, pipelineIR.ConfigIR(rest={key: value})) 

520 

521 def addConfigFile(self, label: str, filename: str) -> None: 

522 """Add overrides from a specified file. 

523 

524 Parameters 

525 ---------- 

526 label : `str` 

527 The label used to identify the task associated with config to 

528 modify 

529 filename : `str` 

530 Path to the override file. 

531 """ 

532 self._addConfigImpl(label, pipelineIR.ConfigIR(file=[filename])) 

533 

534 def addConfigPython(self, label: str, pythonString: str) -> None: 

535 """Add Overrides by running a snippet of python code against a config. 

536 

537 Parameters 

538 ---------- 

539 label : `str` 

540 The label used to identify the task associated with config to 

541 modify. 

542 pythonString: `str` 

543 A string which is valid python code to be executed. This is done 

544 with config as the only local accessible value. 

545 """ 

546 self._addConfigImpl(label, pipelineIR.ConfigIR(python=pythonString)) 

547 

548 def _addConfigImpl(self, label: str, newConfig: pipelineIR.ConfigIR) -> None: 

549 if label == "parameters": 

550 if newConfig.rest.keys() - self._pipelineIR.parameters.mapping.keys(): 

551 raise ValueError("Cannot override parameters that are not defined in pipeline") 

552 self._pipelineIR.parameters.mapping.update(newConfig.rest) 

553 if newConfig.file: 

554 raise ValueError("Setting parameters section with config file is not supported") 

555 if newConfig.python: 

556 raise ValueError("Setting parameters section using python block in unsupported") 

557 return 

558 if label not in self._pipelineIR.tasks: 

559 raise LookupError(f"There are no tasks labeled '{label}' in the pipeline") 

560 self._pipelineIR.tasks[label].add_or_update_config(newConfig) 

561 

562 def toFile(self, filename: str) -> None: 

563 self._pipelineIR.to_file(filename) 

564 

565 def write_to_uri(self, uri: Union[str, ButlerURI]) -> None: 

566 self._pipelineIR.write_to_uri(uri) 

567 

568 def toExpandedPipeline(self) -> Generator[TaskDef, None, None]: 

569 """Returns a generator of TaskDefs which can be used to create quantum 

570 graphs. 

571 

572 Returns 

573 ------- 

574 generator : generator of `TaskDef` 

575 The generator returned will be the sorted iterator of tasks which 

576 are to be used in constructing a quantum graph. 

577 

578 Raises 

579 ------ 

580 NotImplementedError 

581 If a dataId is supplied in a config block. This is in place for 

582 future use 
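
Examples
--------
A minimal sketch, assuming ``pipeline`` is a fully configured `Pipeline`
whose task classes are importable::

    for taskDef in pipeline.toExpandedPipeline():
        print(taskDef.label, taskDef.taskName)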

583 """ 

584 taskDefs = [] 

585 for label in self._pipelineIR.tasks: 

586 taskDefs.append(self._buildTaskDef(label)) 

587 

588 # Let's evaluate the contracts 

589 if self._pipelineIR.contracts is not None: 

590 label_to_config = {x.label: x.config for x in taskDefs} 

591 for contract in self._pipelineIR.contracts: 

592 # Execute this in its own line so it can raise a good error 

593 # message if there were problems with the eval. 

594 success = eval(contract.contract, None, label_to_config) 

595 if not success: 

596 extra_info = f": {contract.msg}" if contract.msg is not None else "" 

597 raise pipelineIR.ContractError(f"Contract(s) '{contract.contract}' were not " 

598 f"satisfied{extra_info}") 

599 

600 yield from pipeTools.orderPipeline(taskDefs) 

601 

602 def _buildTaskDef(self, label: str) -> TaskDef: 

603 if (taskIR := self._pipelineIR.tasks.get(label)) is None: 

604 raise NameError(f"Label {label} does not appear in this pipeline") 

605 taskClass = doImport(taskIR.klass) 

606 taskName = taskClass.__qualname__ 

607 config = taskClass.ConfigClass() 

608 overrides = ConfigOverrides() 

609 if self._pipelineIR.instrument is not None: 

610 overrides.addInstrumentOverride(self._pipelineIR.instrument, taskClass._DefaultName) 

611 if taskIR.config is not None: 

612 for configIR in (configIr.formatted(self._pipelineIR.parameters) 

613 for configIr in taskIR.config): 

614 if configIR.dataId is not None: 

615 raise NotImplementedError("Specializing a config on a partial data id is not yet " 

616 "supported in Pipeline definition") 

617 # only apply override if it applies to everything 

618 if configIR.dataId is None: 

619 if configIR.file: 

620 for configFile in configIR.file: 

621 overrides.addFileOverride(os.path.expandvars(configFile)) 

622 if configIR.python is not None: 

623 overrides.addPythonOverride(configIR.python) 

624 for key, value in configIR.rest.items(): 

625 overrides.addValueOverride(key, value) 

626 overrides.applyTo(config) 

627 # This may need to be revisited 

628 try: 

629 config.validate() 

630 except Exception: 

631 _LOG.error("Configuration validation failed for task %s (%s)", label, taskName) 

632 raise 

633 return TaskDef(taskName=taskName, config=config, taskClass=taskClass, label=label) 

634 

635 def __iter__(self) -> Generator[TaskDef, None, None]: 

636 return self.toExpandedPipeline() 

637 

638 def __getitem__(self, item: str) -> TaskDef: 

639 return self._buildTaskDef(item) 

640 

641 def __len__(self): 

642 return len(self._pipelineIR.tasks) 

643 

644 def __eq__(self, other: object): 

645 if not isinstance(other, Pipeline): 

646 return False 

647 return self._pipelineIR == other._pipelineIR 

648 

649 

650@dataclass(frozen=True) 

651class TaskDatasetTypes: 

652 """An immutable struct that extracts and classifies the dataset types used 

653 by a `PipelineTask` 

654 """ 

655 

656 initInputs: NamedValueSet[DatasetType] 

657 """Dataset types that are needed as inputs in order to construct this Task. 

658 

659 Task-level `initInputs` may be classified as either 

660 `~PipelineDatasetTypes.initInputs` or 

661 `~PipelineDatasetTypes.initIntermediates` at the Pipeline level. 

662 """ 

663 

664 initOutputs: NamedValueSet[DatasetType] 

665 """Dataset types that may be written after constructing this Task. 

666 

667 Task-level `initOutputs` may be classified as either 

668 `~PipelineDatasetTypes.initOutputs` or 

669 `~PipelineDatasetTypes.initIntermediates` at the Pipeline level. 

670 """ 

671 

672 inputs: NamedValueSet[DatasetType] 

673 """Dataset types that are regular inputs to this Task. 

674 

675 If an input dataset needed for a Quantum cannot be found in the input 

676 collection(s) or produced by another Task in the Pipeline, that Quantum 

677 (and all dependent Quanta) will not be produced. 

678 

679 Task-level `inputs` may be classified as either 

680 `~PipelineDatasetTypes.inputs` or `~PipelineDatasetTypes.intermediates` 

681 at the Pipeline level. 

682 """ 

683 

684 prerequisites: NamedValueSet[DatasetType] 

685 """Dataset types that are prerequisite inputs to this Task. 

686 

687 Prerequisite inputs must exist in the input collection(s) before the 

688 pipeline is run, but do not constrain the graph - if a prerequisite is 

689 missing for a Quantum, `PrerequisiteMissingError` is raised. 

690 

691 Prerequisite inputs are not resolved until the second stage of 

692 QuantumGraph generation. 

693 """ 

694 

695 outputs: NamedValueSet[DatasetType] 

696 """Dataset types that are produced by this Task. 

697 

698 Task-level `outputs` may be classified as either 

699 `~PipelineDatasetTypes.outputs` or `~PipelineDatasetTypes.intermediates` 

700 at the Pipeline level. 

701 """ 

702 

703 @classmethod 

704 def fromTaskDef( 

705 cls, 

706 taskDef: TaskDef, 

707 *, 

708 registry: Registry, 

709 include_configs: bool = True, 

710 storage_class_mapping: Optional[Mapping[str, str]] = None 

711 ) -> TaskDatasetTypes: 

712 """Extract and classify the dataset types from a single `PipelineTask`. 

713 

714 Parameters 

715 ---------- 

716 taskDef: `TaskDef` 

717 An instance of a `TaskDef` class for a particular `PipelineTask`. 

718 registry: `Registry` 

719 Registry used to construct normalized `DatasetType` objects and 

720 retrieve those that are incomplete. 

721 include_configs : `bool`, optional 

722 If `True` (default) include config dataset types as 

723 ``initOutputs``. 

724 storage_class_mapping : `Mapping` of `str` to `str`, optional 

725 If a taskdef contains a component dataset type that is unknown 

726 to the registry, its parent StorageClass will be looked up in this 

727 mapping if it is supplied. If the mapping does not contain the 

728 composite dataset type, or the mapping is not supplied, an exception 

729 will be raised. 

730 

731 Returns 

732 ------- 

733 types: `TaskDatasetTypes` 

734 The dataset types used by this task. 

735 

736 Raises 

737 ------ 

738 ValueError 

739 Raised if dataset type connection definition differs from 

740 registry definition. 

741 LookupError 

742 Raised if component parent StorageClass could not be determined 

743 and storage_class_mapping does not contain the composite type, or 

744 is set to None. 
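
Examples
--------
A minimal sketch, assuming ``taskDef`` is an existing `TaskDef` and
``registry`` is an existing `~lsst.daf.butler.Registry`::

    types = TaskDatasetTypes.fromTaskDef(taskDef, registry=registry)
    print(types.inputs.names, types.outputs.names)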

745 """ 

746 def makeDatasetTypesSet(connectionType: str, freeze: bool = True) -> NamedValueSet[DatasetType]: 

747 """Constructs a set of true `DatasetType` objects 

748 

749 Parameters 

750 ---------- 

751 connectionType : `str` 

752 Name of the connection type to produce a set for, corresponds 

753 to an attribute of type `list` on the connection class instance 

754 freeze : `bool`, optional 

755 If `True`, call `NamedValueSet.freeze` on the object returned. 

756 

757 Returns 

758 ------- 

759 datasetTypes : `NamedValueSet` 

760 A set of all datasetTypes which correspond to the input 

761 connection type specified in the connection class of this 

762 `PipelineTask` 

763 

764 Raises 

765 ------ 

766 ValueError 

767 Raised if dataset type connection definition differs from 

768 registry definition. 

769 LookupError 

770 Raised if component parent StorageClass could not be determined 

771 and storage_class_mapping does not contain the composite type, 

772 or is set to None. 

773 

774 Notes 

775 ----- 

776 This function is a closure over the variables ``registry``, 

777 ``taskDef``, and ``storage_class_mapping``. 

778 """ 

779 datasetTypes = NamedValueSet() 

780 for c in iterConnections(taskDef.connections, connectionType): 

781 dimensions = set(getattr(c, 'dimensions', set())) 

782 if "skypix" in dimensions: 

783 try: 

784 datasetType = registry.getDatasetType(c.name) 

785 except LookupError as err: 

786 raise LookupError( 

787 f"DatasetType '{c.name}' referenced by " 

788 f"{type(taskDef.connections).__name__} uses 'skypix' as a dimension " 

789 f"placeholder, but does not already exist in the registry. " 

790 f"Note that reference catalog names are now used as the dataset " 

791 f"type name instead of 'ref_cat'." 

792 ) from err 

793 rest1 = set(registry.dimensions.extract(dimensions - set(["skypix"])).names) 

794 rest2 = set(dim.name for dim in datasetType.dimensions 

795 if not isinstance(dim, SkyPixDimension)) 

796 if rest1 != rest2: 

797 raise ValueError(f"Non-skypix dimensions for dataset type {c.name} declared in " 

798 f"connections ({rest1}) are inconsistent with those in " 

799 f"registry's version of this dataset ({rest2}).") 

800 else: 

801 # Component dataset types are not explicitly in the 

802 # registry. This complicates consistency checks with 

803 # registry and requires we work out the composite storage 

804 # class. 

805 registryDatasetType = None 

806 try: 

807 registryDatasetType = registry.getDatasetType(c.name) 

808 except KeyError: 

809 compositeName, componentName = DatasetType.splitDatasetTypeName(c.name) 

810 if componentName: 

811 if storage_class_mapping is None or compositeName not in storage_class_mapping: 

812 raise LookupError("Component parent class cannot be determined, and " 

813 "composite name was not in storage class mapping, or no " 

814 "storage_class_mapping was supplied") 

815 else: 

816 parentStorageClass = storage_class_mapping[compositeName] 

817 else: 

818 parentStorageClass = None 

819 datasetType = c.makeDatasetType( 

820 registry.dimensions, 

821 parentStorageClass=parentStorageClass 

822 ) 

823 registryDatasetType = datasetType 

824 else: 

825 datasetType = c.makeDatasetType( 

826 registry.dimensions, 

827 parentStorageClass=registryDatasetType.parentStorageClass 

828 ) 

829 

830 if registryDatasetType and datasetType != registryDatasetType: 

831 try: 

832 # Explicitly check for storage class just to make 

833 # more specific message. 

834 _ = datasetType.storageClass 

835 except KeyError: 

836 raise ValueError("Storage class does not exist for supplied dataset type " 

837 f"{datasetType} for {taskDef.label}.") from None 

838 raise ValueError(f"Supplied dataset type ({datasetType}) inconsistent with " 

839 f"registry definition ({registryDatasetType}) " 

840 f"for {taskDef.label}.") 

841 datasetTypes.add(datasetType) 

842 if freeze: 

843 datasetTypes.freeze() 

844 return datasetTypes 

845 

846 # optionally add initOutput dataset for config 

847 initOutputs = makeDatasetTypesSet("initOutputs", freeze=False) 

848 if include_configs: 

849 initOutputs.add( 

850 DatasetType( 

851 taskDef.configDatasetName, 

852 registry.dimensions.empty, 

853 storageClass="Config", 

854 ) 

855 ) 

856 initOutputs.freeze() 

857 

858 # optionally add output dataset for metadata 

859 outputs = makeDatasetTypesSet("outputs", freeze=False) 

860 if taskDef.metadataDatasetName is not None: 

861 # Metadata is supposed to be of the PropertySet type, its 

862 # dimensions correspond to a task quantum 

863 dimensions = registry.dimensions.extract(taskDef.connections.dimensions) 

864 outputs |= {DatasetType(taskDef.metadataDatasetName, dimensions, "PropertySet")} 

865 if taskDef.logOutputDatasetName is not None: 

866 # Log output dimensions correspond to a task quantum. 

867 dimensions = registry.dimensions.extract(taskDef.connections.dimensions) 

868 outputs |= {DatasetType(taskDef.logOutputDatasetName, dimensions, "ButlerLogRecords")} 

869 

870 outputs.freeze() 

871 

872 return cls( 

873 initInputs=makeDatasetTypesSet("initInputs"), 

874 initOutputs=initOutputs, 

875 inputs=makeDatasetTypesSet("inputs"), 

876 prerequisites=makeDatasetTypesSet("prerequisiteInputs"), 

877 outputs=outputs, 

878 ) 

879 

880 

881@dataclass(frozen=True) 

882class PipelineDatasetTypes: 

883 """An immutable struct that classifies the dataset types used in a 

884 `Pipeline`. 

885 """ 

886 

887 packagesDatasetName: ClassVar[str] = "packages" 

888 """Name of a dataset type used to save package versions. 

889 """ 

890 

891 initInputs: NamedValueSet[DatasetType] 

892 """Dataset types that are needed as inputs in order to construct the Tasks 

893 in this Pipeline. 

894 

895 This does not include dataset types that are produced when constructing 

896 other Tasks in the Pipeline (these are classified as `initIntermediates`). 

897 """ 

898 

899 initOutputs: NamedValueSet[DatasetType] 

900 """Dataset types that may be written after constructing the Tasks in this 

901 Pipeline. 

902 

903 This does not include dataset types that are also used as inputs when 

904 constructing other Tasks in the Pipeline (these are classified as 

905 `initIntermediates`). 

906 """ 

907 

908 initIntermediates: NamedValueSet[DatasetType] 

909 """Dataset types that are both used when constructing one or more Tasks 

910 in the Pipeline and produced as a side-effect of constructing another 

911 Task in the Pipeline. 

912 """ 

913 

914 inputs: NamedValueSet[DatasetType] 

915 """Dataset types that are regular inputs for the full pipeline. 

916 

917 If an input dataset needed for a Quantum cannot be found in the input 

918 collection(s), that Quantum (and all dependent Quanta) will not be 

919 produced. 

920 """ 

921 

922 prerequisites: NamedValueSet[DatasetType] 

923 """Dataset types that are prerequisite inputs for the full Pipeline. 

924 

925 Prerequisite inputs must exist in the input collection(s) before the 

926 pipeline is run, but do not constrain the graph - if a prerequisite is 

927 missing for a Quantum, `PrerequisiteMissingError` is raised. 

928 

929 Prerequisite inputs are not resolved until the second stage of 

930 QuantumGraph generation. 

931 """ 

932 

933 intermediates: NamedValueSet[DatasetType] 

934 """Dataset types that are output by one Task in the Pipeline and consumed 

935 as inputs by one or more other Tasks in the Pipeline. 

936 """ 

937 

938 outputs: NamedValueSet[DatasetType] 

939 """Dataset types that are output by a Task in the Pipeline and not consumed 

940 by any other Task in the Pipeline. 

941 """ 

942 

943 byTask: Mapping[str, TaskDatasetTypes] 

944 """Per-Task dataset types, keyed by label in the `Pipeline`. 

945 

946 This is guaranteed to be zip-iterable with the `Pipeline` itself (assuming 

947 neither has been modified since the dataset types were extracted, of 

948 course). 

949 """ 

950 

951 @classmethod 

952 def fromPipeline( 

953 cls, 

954 pipeline: Union[Pipeline, Iterable[TaskDef]], 

955 *, 

956 registry: Registry, 

957 include_configs: bool = True, 

958 include_packages: bool = True, 

959 ) -> PipelineDatasetTypes: 

960 """Extract and classify the dataset types from all tasks in a 

961 `Pipeline`. 

962 

963 Parameters 

964 ---------- 

965 pipeline: `Pipeline` or `Iterable` [ `TaskDef` ] 

966 A collection of tasks that can be run together. 

967 registry: `Registry` 

968 Registry used to construct normalized `DatasetType` objects and 

969 retrieve those that are incomplete. 

970 include_configs : `bool`, optional 

971 If `True` (default) include config dataset types as 

972 ``initOutputs``. 

973 include_packages : `bool`, optional 

974 If `True` (default) include the dataset type for software package 

975 versions in ``initOutputs``. 

976 

977 Returns 

978 ------- 

979 types: `PipelineDatasetTypes` 

980 The dataset types used by this `Pipeline`. 

981 

982 Raises 

983 ------ 

984 ValueError 

985 Raised if Tasks are inconsistent about which datasets are marked 

986 prerequisite. This indicates that the Tasks cannot be run as part 

987 of the same `Pipeline`. 
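
Examples
--------
A minimal sketch, assuming ``pipeline`` and ``registry`` already exist; the
``"isr"`` label is hypothetical::

    datasetTypes = PipelineDatasetTypes.fromPipeline(pipeline, registry=registry)
    print(datasetTypes.inputs.names)
    print(datasetTypes.byTask["isr"].outputs.names)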

988 """ 

989 allInputs = NamedValueSet() 

990 allOutputs = NamedValueSet() 

991 allInitInputs = NamedValueSet() 

992 allInitOutputs = NamedValueSet() 

993 prerequisites = NamedValueSet() 

994 byTask = dict() 

995 if include_packages: 

996 allInitOutputs.add( 

997 DatasetType( 

998 cls.packagesDatasetName, 

999 registry.dimensions.empty, 

1000 storageClass="Packages", 

1001 ) 

1002 ) 

1003 # create a list of TaskDefs in case the input is a generator 

1004 pipeline = list(pipeline) 

1005 

1006 # collect all the output dataset types 

1007 typeStorageclassMap: Dict[str, str] = {} 

1008 for taskDef in pipeline: 

1009 for outConnection in iterConnections(taskDef.connections, 'outputs'): 

1010 typeStorageclassMap[outConnection.name] = outConnection.storageClass 

1011 

1012 for taskDef in pipeline: 

1013 thisTask = TaskDatasetTypes.fromTaskDef( 

1014 taskDef, 

1015 registry=registry, 

1016 include_configs=include_configs, 

1017 storage_class_mapping=typeStorageclassMap 

1018 ) 

1019 allInitInputs |= thisTask.initInputs 

1020 allInitOutputs |= thisTask.initOutputs 

1021 allInputs |= thisTask.inputs 

1022 prerequisites |= thisTask.prerequisites 

1023 allOutputs |= thisTask.outputs 

1024 byTask[taskDef.label] = thisTask 

1025 if not prerequisites.isdisjoint(allInputs): 

1026 raise ValueError("{} marked as both prerequisites and regular inputs".format( 

1027 {dt.name for dt in allInputs & prerequisites} 

1028 )) 

1029 if not prerequisites.isdisjoint(allOutputs): 

1030 raise ValueError("{} marked as both prerequisites and outputs".format( 

1031 {dt.name for dt in allOutputs & prerequisites} 

1032 )) 

1033 # Make sure that components which are marked as inputs get treated as 

1034 # intermediates if there is an output which produces the composite 

1035 # containing the component 

1036 intermediateComponents = NamedValueSet() 

1037 intermediateComposites = NamedValueSet() 

1038 outputNameMapping = {dsType.name: dsType for dsType in allOutputs} 

1039 for dsType in allInputs: 

1040 # get the name of a possible component 

1041 name, component = dsType.nameAndComponent() 

1042 # if there is a component name, that means this is a component 

1043 # DatasetType, if there is an output which produces the parent of 

1044 # this component, treat this input as an intermediate 

1045 if component is not None: 

1046 # This needs to be in this if block, because someone might have 

1047 # a composite that is a pure input from existing data 

1048 if name in outputNameMapping: 

1049 intermediateComponents.add(dsType) 

1050 intermediateComposites.add(outputNameMapping[name]) 

1051 

1052 def checkConsistency(a: NamedValueSet, b: NamedValueSet): 

1053 common = a.names & b.names 

1054 for name in common: 

1055 if a[name] != b[name]: 

1056 raise ValueError(f"Conflicting definitions for dataset type: {a[name]} != {b[name]}.") 

1057 

1058 checkConsistency(allInitInputs, allInitOutputs) 

1059 checkConsistency(allInputs, allOutputs) 

1060 checkConsistency(allInputs, intermediateComposites) 

1061 checkConsistency(allOutputs, intermediateComposites) 

1062 

1063 def frozen(s: NamedValueSet) -> NamedValueSet: 

1064 s.freeze() 

1065 return s 

1066 

1067 return cls( 

1068 initInputs=frozen(allInitInputs - allInitOutputs), 

1069 initIntermediates=frozen(allInitInputs & allInitOutputs), 

1070 initOutputs=frozen(allInitOutputs - allInitInputs), 

1071 inputs=frozen(allInputs - allOutputs - intermediateComponents), 

1072 intermediates=frozen(allInputs & allOutputs | intermediateComponents), 

1073 outputs=frozen(allOutputs - allInputs - intermediateComposites), 

1074 prerequisites=frozen(prerequisites), 

1075 byTask=MappingProxyType(byTask), # MappingProxyType -> frozen view of dict for immutability 

1076 ) 

1077 

1078 @classmethod 

1079 def initOutputNames(cls, pipeline: Union[Pipeline, Iterable[TaskDef]], *, 

1080 include_configs: bool = True, include_packages: bool = True) -> Iterator[str]: 

1081 """Return the names of dataset types ot task initOutputs, Configs, 

1082 and package versions for a pipeline. 

1083 

1084 Parameters 

1085 ---------- 

1086 pipeline: `Pipeline` or `Iterable` [ `TaskDef` ] 

1087 A `Pipeline` instance or collection of `TaskDef` instances. 

1088 include_configs : `bool`, optional 

1089 If `True` (default) include config dataset types. 

1090 include_packages : `bool`, optional 

1091 If `True` (default) include the dataset type for package versions. 

1092 

1093 Yields 

1094 ------ 

1095 datasetTypeName : `str` 

1096 Name of the dataset type. 
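
Examples
--------
A minimal sketch, assuming ``pipeline`` is an existing `Pipeline`::

    names = set(PipelineDatasetTypes.initOutputNames(pipeline))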

1097 """ 

1098 if include_packages: 

1099 # Package versions dataset type 

1100 yield cls.packagesDatasetName 

1101 

1102 if isinstance(pipeline, Pipeline): 

1103 pipeline = pipeline.toExpandedPipeline() 

1104 

1105 for taskDef in pipeline: 

1106 

1107 # all task InitOutputs 

1108 for name in taskDef.connections.initOutputs: 

1109 attribute = getattr(taskDef.connections, name) 

1110 yield attribute.name 

1111 

1112 # config dataset name 

1113 if include_configs: 

1114 yield taskDef.configDatasetName