Coverage for python/lsst/pipe/base/pipeline.py: 19%

386 statements  

1# This file is part of pipe_base. 

2# 

3# Developed for the LSST Data Management System. 

4# This product includes software developed by the LSST Project 

5# (http://www.lsst.org). 

6# See the COPYRIGHT file at the top-level directory of this distribution 

7# for details of code ownership. 

8# 

9# This program is free software: you can redistribute it and/or modify 

10# it under the terms of the GNU General Public License as published by 

11# the Free Software Foundation, either version 3 of the License, or 

12# (at your option) any later version. 

13# 

14# This program is distributed in the hope that it will be useful, 

15# but WITHOUT ANY WARRANTY; without even the implied warranty of 

16# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the 

17# GNU General Public License for more details. 

18# 

19# You should have received a copy of the GNU General Public License 

20# along with this program. If not, see <http://www.gnu.org/licenses/>. 

21from __future__ import annotations 

22 

23"""Module defining Pipeline class and related methods. 

24""" 

25 

26__all__ = ["Pipeline", "TaskDef", "TaskDatasetTypes", "PipelineDatasetTypes", "LabelSpecifier"] 

27 

28import copy 

29import logging 

30import os 

31import re 

32import urllib.parse 

33import warnings 

34 

35# ------------------------------- 

36# Imports of standard modules -- 

37# ------------------------------- 

38from dataclasses import dataclass 

39from types import MappingProxyType 

40from typing import ( 

41 TYPE_CHECKING, 

42 ClassVar, 

43 Dict, 

44 Generator, 

45 Iterable, 

46 Iterator, 

47 Mapping, 

48 Optional, 

49 Set, 

50 Tuple, 

51 Union, 

52) 

53 

54# ----------------------------- 

55# Imports for other modules -- 

56from lsst.daf.butler import DatasetType, NamedValueSet, Registry, SkyPixDimension 

57from lsst.resources import ResourcePath, ResourcePathExpression 

58from lsst.utils import doImport 

59 

60from . import pipelineIR, pipeTools 

61from ._task_metadata import TaskMetadata 

62from .configOverrides import ConfigOverrides 

63from .connections import iterConnections 

64from .pipelineTask import PipelineTask 

65from .task import _TASK_METADATA_TYPE 

66 

67if TYPE_CHECKING: # Imports needed only for type annotations; may be circular. 

68 from lsst.obs.base import Instrument 

69 

70# ---------------------------------- 

71# Local non-exported definitions -- 

72# ---------------------------------- 

73 

74_LOG = logging.getLogger(__name__) 

75 

76# ------------------------ 

77# Exported definitions -- 

78# ------------------------ 

79 

80 

81@dataclass 

82class LabelSpecifier: 

83 """A structure to specify a subset of labels to load 

84 

85 This structure may contain a set of labels to be used in subsetting a 

86 pipeline, or a beginning and end point. Beginning or end may be empty, 

87 in which case the range will be a half open interval. Unlike python 

88 iteration bounds, end bounds are *INCLUDED*. Note that range based 

89 selection is not well defined for pipelines that are not linear in nature, 

90 and correct behavior is not guaranteed, or may vary from run to run. 
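
Examples
--------
An illustrative sketch; the task labels used here are hypothetical:

>>> spec = LabelSpecifier(labels={"isr", "calibrate"})
>>> spec = LabelSpecifier(begin="isr", end="calibrate")
>>> (spec.begin, spec.end)
('isr', 'calibrate')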

91 """ 

92 

93 labels: Optional[Set[str]] = None 

94 begin: Optional[str] = None 

95 end: Optional[str] = None 

96 

97 def __post_init__(self): 

98 if self.labels is not None and (self.begin or self.end): 

99 raise ValueError( 

100 "This struct can only be initialized with a labels set or a begin (and/or) end specifier" 

101 ) 

102 

103 

104class TaskDef: 

105 """TaskDef is a collection of information about task needed by Pipeline. 

106 

107 The information includes task name, configuration object and optional 

108 task class. This class is just a collection of attributes and it exposes 

109 all of them so that attributes could potentially be modified in place 

110 (e.g. if configuration needs extra overrides). 

111 

112 Attributes 

113 ---------- 

114 taskName : `str`, optional 

115 `PipelineTask` class name, currently it is not specified whether this 

116 is a fully-qualified name or partial name (e.g. ``module.TaskClass``). 

117 Framework should be prepared to handle all cases. If not provided, 

118 ``taskClass`` must be, and ``taskClass.__name__`` is used. 

119 config : `lsst.pex.config.Config`, optional 

120 Instance of the configuration class corresponding to this task class, 

121 usually with all overrides applied. This config will be frozen. If 

122 not provided, ``taskClass`` must be provided and 

123 ``taskClass.ConfigClass()`` will be used. 

124 taskClass : `type`, optional 

125 `PipelineTask` class object, can be ``None``. If ``None`` then 

126 framework will have to locate and load class. 

127 label : `str`, optional 

128 Task label, usually a short string unique in a pipeline. If not 

129 provided, ``taskClass`` must be, and ``taskClass._DefaultName`` will 

130 be used. 
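
Examples
--------
A sketch, assuming ``ExampleTask`` is a `PipelineTask` subclass defined
elsewhere:

>>> task_def = TaskDef(taskClass=ExampleTask, label="example")  # doctest: +SKIP
>>> task_def.configDatasetName  # doctest: +SKIP
'example_config'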

131 """ 

132 

133 def __init__(self, taskName=None, config=None, taskClass=None, label=None): 

134 if taskName is None: 

135 if taskClass is None: 

136 raise ValueError("At least one of `taskName` and `taskClass` must be provided.") 

137 taskName = taskClass.__name__ 

138 if config is None: 

139 if taskClass is None: 

140 raise ValueError("`taskClass` must be provided if `config` is not.") 

141 config = taskClass.ConfigClass() 

142 if label is None: 

143 if taskClass is None: 

144 raise ValueError("`taskClass` must be provided if `label` is not.") 

145 label = taskClass._DefaultName 

146 self.taskName = taskName 

147 try: 

148 config.validate() 

149 except Exception: 

150 _LOG.error("Configuration validation failed for task %s (%s)", label, taskName) 

151 raise 

152 config.freeze() 

153 self.config = config 

154 self.taskClass = taskClass 

155 self.label = label 

156 self.connections = config.connections.ConnectionsClass(config=config) 

157 

158 @property 

159 def configDatasetName(self) -> str: 

160 """Name of a dataset type for configuration of this task (`str`)""" 

161 return self.label + "_config" 

162 

163 @property 

164 def metadataDatasetName(self) -> Optional[str]: 

165 """Name of a dataset type for metadata of this task, `None` if 

166 metadata is not to be saved (`str`) 

167 """ 

168 if self.config.saveMetadata: 

169 return self.label + "_metadata" 

170 else: 

171 return None 

172 

173 @property 

174 def logOutputDatasetName(self) -> Optional[str]: 

175 """Name of a dataset type for log output from this task, `None` if 

176 logs are not to be saved (`str`) 

177 """ 

178 if self.config.saveLogOutput: 

179 return self.label + "_log" 

180 else: 

181 return None 

182 

183 def __str__(self): 

184 rep = "TaskDef(" + self.taskName 

185 if self.label: 

186 rep += ", label=" + self.label 

187 rep += ")" 

188 return rep 

189 

190 def __eq__(self, other: object) -> bool: 

191 if not isinstance(other, TaskDef): 

192 return False 

193 # This does not consider equality of configs when determining equality 

194 # as config equality is a difficult thing to define. Should be updated 

195 # after DM-27847 

196 return self.taskClass == other.taskClass and self.label == other.label 

197 

198 def __hash__(self): 

199 return hash((self.taskClass, self.label)) 

200 

201 

202class Pipeline: 

203 """A `Pipeline` is a representation of a series of tasks to run, and the 

204 configuration for those tasks. 

205 

206 Parameters 

207 ---------- 

208 description : `str` 

209 A description of what this pipeline does. 

210 """ 

211 

212 def __init__(self, description: str): 

213 pipeline_dict = {"description": description, "tasks": {}} 

214 self._pipelineIR = pipelineIR.PipelineIR(pipeline_dict) 

215 

216 @classmethod 

217 def fromFile(cls, filename: str) -> Pipeline: 

218 """Load a pipeline defined in a pipeline yaml file. 

219 

220 Parameters 

221 ---------- 

222 filename: `str` 

223 A path that points to a pipeline defined in yaml format. This 

224 filename may also supply additional labels to be used in 

225 subsetting the loaded Pipeline. These labels are separated from 

226 the path by a \\#, and may be specified as a comma separated 

227 list, or a range denoted as beginning..end. Beginning or end may 

228 be empty, in which case the range will be a half open interval. 

229 Unlike python iteration bounds, end bounds are *INCLUDED*. Note 

230 that range based selection is not well defined for pipelines that 

231 are not linear in nature, and correct behavior is not guaranteed, 

232 or may vary from run to run. 

233 

234 Returns 

235 ------- 

236 pipeline: `Pipeline` 

237 The pipeline loaded from specified location with appropriate (if 

238 any) subsetting 

239 

240 Notes 

241 ----- 

242 This method attempts to prune any contracts that contain labels which 

243 are not in the declared subset of labels. This pruning is done using a 

244 string based matching due to the nature of contracts and may prune more 

245 than it should. 

246 """ 

247 return cls.from_uri(filename) 

248 

249 @classmethod 

250 def from_uri(cls, uri: ResourcePathExpression) -> Pipeline: 

251 """Load a pipeline defined in a pipeline yaml file at a location 

252 specified by a URI. 

253 

254 Parameters 

255 ---------- 

256 uri: convertible to `ResourcePath` 

257 If a string is supplied this should be a URI path that points to a 

258 pipeline defined in yaml format, either as a direct path to the 

259 yaml file, or as a directory containing a "pipeline.yaml" file (the 

260 form used by `write_to_uri` with ``expand=True``). This uri may 

261 also supply additional labels to be used in subsetting the loaded 

262 Pipeline. These labels are separated from the path by a \\#, and 

263 may be specified as a comma separated list, or a range denoted as 

264 beginning..end. Beginning or end may be empty, in which case the 

265 range will be a half open interval. Unlike python iteration bounds, 

266 end bounds are *INCLUDED*. Note that range based selection is not 

267 well defined for pipelines that are not linear in nature, and 

268 correct behavior is not guaranteed, or may vary from run to run. 

269 The same specifiers can be used with a `ResourcePath` object, by 

270 placing them as the sole contents of its ``fragment`` attribute. 

271 

272 Returns 

273 ------- 

274 pipeline: `Pipeline` 

275 The pipeline loaded from specified location with appropriate (if 

276 any) subsetting 

277 

278 Notes 

279 ----- 

280 This method attempts to prune any contracts that contain labels which 

281 are not in the declared subset of labels. This pruning is done using a 

282 string based matching due to the nature of contracts and may prune more 

283 than it should. 
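
Examples
--------
Illustrative URI forms (paths and labels here are hypothetical); the
fragment after ``#`` selects a set of labels or an inclusive range:

>>> Pipeline.from_uri("pipelines/DRP.yaml")  # doctest: +SKIP
>>> Pipeline.from_uri("pipelines/DRP.yaml#isr,calibrate")  # doctest: +SKIP
>>> Pipeline.from_uri("pipelines/DRP.yaml#isr..calibrate")  # doctest: +SKIP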

284 """ 

285 # Split up the uri and any labels that were supplied 

286 uri, label_specifier = cls._parse_file_specifier(uri) 

287 pipeline: Pipeline = cls.fromIR(pipelineIR.PipelineIR.from_uri(uri)) 

288 

289 # If there are labels supplied, only keep those 

290 if label_specifier is not None: 

291 pipeline = pipeline.subsetFromLabels(label_specifier) 

292 return pipeline 

293 

294 def subsetFromLabels(self, labelSpecifier: LabelSpecifier) -> Pipeline: 

295 """Subset a pipeline to contain only labels specified in labelSpecifier 

296 

297 Parameters 

298 ---------- 

299 labelSpecifier : `LabelSpecifier` 

300 Object containing labels that describes how to subset a pipeline. 

301 

302 Returns 

303 ------- 

304 pipeline : `Pipeline` 

305 A new pipeline object that is a subset of the old pipeline 

306 

307 Raises 

308 ------ 

309 ValueError 

310 Raised if there is an issue with specified labels 

311 

312 Notes 

313 ----- 

314 This method attempts to prune any contracts that contain labels which 

315 are not in the declared subset of labels. This pruning is done using a 

316 string based matching due to the nature of contracts and may prune more 

317 than it should. 
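
Examples
--------
A sketch using hypothetical labels; either an explicit label set or an
inclusive range may be requested:

>>> subset = pipeline.subsetFromLabels(LabelSpecifier(labels={"isr"}))  # doctest: +SKIP
>>> subset = pipeline.subsetFromLabels(LabelSpecifier(begin="isr", end="calibrate"))  # doctest: +SKIP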

318 """ 

319 # Labels supplied as a set 

320 if labelSpecifier.labels: 

321 labelSet = labelSpecifier.labels 

322 # Labels supplied as a range, first create a list of all the labels 

323 # in the pipeline sorted according to task dependency. Then only 

324 # keep labels that lie between the supplied bounds 

325 else: 

326 # Create a copy of the pipeline to use when assessing the label 

327 # ordering. Use a dict for fast searching while preserving order. 

328 # Remove contracts so they do not fail in the expansion step. This 

329 # is needed because a user may only configure the tasks they intend 

330 # to run, which may cause some contracts to fail if they will later 

331 # be dropped 

332 pipeline = copy.deepcopy(self) 

333 pipeline._pipelineIR.contracts = [] 

334 labels = {taskdef.label: True for taskdef in pipeline.toExpandedPipeline()} 

335 

336 # Verify the bounds are in the labels 

337 if labelSpecifier.begin is not None: 

338 if labelSpecifier.begin not in labels: 

339 raise ValueError( 

340 f"Beginning of range subset, {labelSpecifier.begin}, not found in " 

341 "pipeline definition" 

342 ) 

343 if labelSpecifier.end is not None: 

344 if labelSpecifier.end not in labels: 

345 raise ValueError( 

346 f"End of range subset, {labelSpecifier.end}, not found in pipeline definition" 

347 ) 

348 

349 labelSet = set() 

350 for label in labels: 

351 if labelSpecifier.begin is not None: 

352 if label != labelSpecifier.begin: 

353 continue 

354 else: 

355 labelSpecifier.begin = None 

356 labelSet.add(label) 

357 if labelSpecifier.end is not None and label == labelSpecifier.end: 

358 break 

359 return Pipeline.fromIR(self._pipelineIR.subset_from_labels(labelSet)) 

360 

361 @staticmethod 

362 def _parse_file_specifier(uri: ResourcePathExpression) -> Tuple[ResourcePath, Optional[LabelSpecifier]]: 

363 """Split appart a uri and any possible label subsets""" 

364 if isinstance(uri, str): 

365 # This is to support legacy pipelines during transition 

366 uri, num_replace = re.subn("[:](?!\\/\\/)", "#", uri) 

367 if num_replace: 

368 warnings.warn( 

369 f"The pipeline file {uri} seems to use the legacy : to separate " 

370 "labels, this is deprecated and will be removed after June 2021, please use " 

371 "# instead.", 

372 category=FutureWarning, 

373 ) 

374 if uri.count("#") > 1: 

375 raise ValueError("Only one set of labels is allowed when specifying a pipeline to load") 

376 # Everything else can be converted directly to ResourcePath. 

377 uri = ResourcePath(uri) 

378 label_subset = uri.fragment or None 

379 

380 specifier: Optional[LabelSpecifier] 

381 if label_subset is not None: 

382 label_subset = urllib.parse.unquote(label_subset) 

383 args: Dict[str, Union[Set[str], str, None]] 

384 # labels supplied as a list 

385 if "," in label_subset: 

386 if ".." in label_subset: 

387 raise ValueError( 

388 "Can only specify a list of labels or a rangewhen loading a Pipline not both" 

389 ) 

390 args = {"labels": set(label_subset.split(","))} 

391 # labels supplied as a range 

392 elif ".." in label_subset: 

393 # Try to de-structure the labelSubset, this will fail if more 

394 # than one range is specified 

395 begin, end, *rest = label_subset.split("..") 

396 if rest: 

397 raise ValueError("Only one range can be specified when loading a pipeline") 

398 args = {"begin": begin if begin else None, "end": end if end else None} 

399 # Assume anything else is a single label 

400 else: 

401 args = {"labels": {label_subset}} 

402 

403 specifier = LabelSpecifier(**args) 

404 else: 

405 specifier = None 

406 

407 return uri, specifier 

408 

409 @classmethod 

410 def fromString(cls, pipeline_string: str) -> Pipeline: 

411 """Create a pipeline from string formatted as a pipeline document. 

412 

413 Parameters 

414 ---------- 

415 pipeline_string : `str` 

416 A string formatted like a pipeline document. 

417 

418 Returns 

419 ------- 

420 pipeline: `Pipeline` 
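
Examples
--------
A minimal sketch; the task class path below is hypothetical and the
``\\n`` escapes stand for newlines in the YAML document:

>>> yaml_str = "description: An example pipeline\\ntasks:\\n  isr: lsst.example.IsrTask"
>>> p = Pipeline.fromString(yaml_str)  # doctest: +SKIP
>>> len(p)  # doctest: +SKIP
1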

421 """ 

422 pipeline = cls.fromIR(pipelineIR.PipelineIR.from_string(pipeline_string)) 

423 return pipeline 

424 

425 @classmethod 

426 def fromIR(cls, deserialized_pipeline: pipelineIR.PipelineIR) -> Pipeline: 

427 """Create a pipeline from an already created `PipelineIR` object. 

428 

429 Parameters 

430 ---------- 

431 deserialized_pipeline: `PipelineIR` 

432 An already created pipeline intermediate representation object 

433 

434 Returns 

435 ------- 

436 pipeline: `Pipeline` 

437 """ 

438 pipeline = cls.__new__(cls) 

439 pipeline._pipelineIR = deserialized_pipeline 

440 return pipeline 

441 

442 @classmethod 

443 def fromPipeline(cls, pipeline: Pipeline) -> Pipeline: 

444 """Create a new pipeline by copying an already existing `Pipeline`. 

445 

446 Parameters 

447 ---------- 

448 pipeline: `Pipeline` 

449 The existing `Pipeline` to copy. 

450 

451 Returns 

452 ------- 

453 pipeline: `Pipeline` 

454 """ 

455 return cls.fromIR(copy.deepcopy(pipeline._pipelineIR)) 

456 

457 def __str__(self) -> str: 

458 # tasks need to be sorted on each call because someone might have added 

459 # or removed a task, and caching does not seem worth the small 

460 # overhead 

461 labels = [td.label for td in self._toExpandedPipelineImpl(checkContracts=False)] 

462 self._pipelineIR.reorder_tasks(labels) 

463 return str(self._pipelineIR) 

464 

465 def addInstrument(self, instrument: Union[Instrument, str]) -> None: 

466 """Add an instrument to the pipeline, or replace an instrument that is 

467 already defined. 

468 

469 Parameters 

470 ---------- 

471 instrument : `~lsst.obs.base.Instrument` or `str` 

472 Either a derived class object of a `lsst.obs.base.Instrument` or 

473 a string corresponding to a fully qualified 

474 `lsst.obs.base.Instrument` name. 
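
Examples
--------
A sketch; the instrument class path is hypothetical, and either form is
accepted:

>>> pipeline.addInstrument("lsst.obs.example.ExampleInstrument")  # doctest: +SKIP
>>> pipeline.addInstrument(ExampleInstrument)  # doctest: +SKIP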

475 """ 

476 if isinstance(instrument, str): 

477 pass 

478 else: 

479 # TODO: assume that this is a subclass of Instrument, no type 

480 # checking 

481 instrument = f"{instrument.__module__}.{instrument.__qualname__}" 

482 self._pipelineIR.instrument = instrument 

483 

484 def getInstrument(self) -> Instrument: 

485 """Get the instrument from the pipeline. 

486 

487 Returns 

488 ------- 

489 instrument : `~lsst.obs.base.Instrument`, `str`, or None 

490 A derived class object of a `lsst.obs.base.Instrument`, a string 

491 corresponding to a fully qualified `lsst.obs.base.Instrument` 

492 name, or None if the pipeline does not have an instrument. 

493 """ 

494 return self._pipelineIR.instrument 

495 

496 def addTask(self, task: Union[PipelineTask, str], label: str) -> None: 

497 """Add a new task to the pipeline, or replace a task that is already 

498 associated with the supplied label. 

499 

500 Parameters 

501 ---------- 

502 task: `PipelineTask` or `str` 

503 Either a derived class object of a `PipelineTask` or a string 

504 corresponding to a fully qualified `PipelineTask` name. 

505 label: `str` 

506 A label that is used to identify the `PipelineTask` being added 
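
Examples
--------
A minimal sketch; the task class path and label are hypothetical. Passing
a string defers importing the task class until the pipeline is expanded:

>>> p = Pipeline("An example pipeline")
>>> p.addTask("lsst.example.ExampleTask", "example")
>>> len(p)
1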

507 """ 

508 if isinstance(task, str): 

509 taskName = task 

510 elif issubclass(task, PipelineTask): 

511 taskName = f"{task.__module__}.{task.__qualname__}" 

512 else: 

513 raise ValueError( 

514 "task must be either a child class of PipelineTask or a string containing" 

515 " a fully qualified name to one" 

516 ) 

517 if not label: 

518 # in some cases (with a command line-generated pipeline) tasks can 

519 # be defined without a label, which is not acceptable; use the task's 

520 # _DefaultName in that case 

521 if isinstance(task, str): 

522 task = doImport(task) 

523 label = task._DefaultName 

524 self._pipelineIR.tasks[label] = pipelineIR.TaskIR(label, taskName) 

525 

526 def removeTask(self, label: str) -> None: 

527 """Remove a task from the pipeline. 

528 

529 Parameters 

530 ---------- 

531 label : `str` 

532 The label used to identify the task that is to be removed 

533 

534 Raises 

535 ------ 

536 KeyError 

537 If no task with that label exists in the pipeline 

538 

539 """ 

540 self._pipelineIR.tasks.pop(label) 

541 

542 def addConfigOverride(self, label: str, key: str, value: object) -> None: 

543 """Apply single config override. 

544 

545 Parameters 

546 ---------- 

547 label : `str` 

548 Label of the task. 

549 key: `str` 

550 Fully-qualified field name. 

551 value : object 

552 Value to be given to a field. 
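
Examples
--------
A sketch; the label and config field below are hypothetical. The override
is recorded on the pipeline and applied when the pipeline is expanded:

>>> p = Pipeline("An example pipeline")
>>> p.addTask("lsst.example.ExampleTask", "example")
>>> p.addConfigOverride("example", "doWrite", False)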

553 """ 

554 self._addConfigImpl(label, pipelineIR.ConfigIR(rest={key: value})) 

555 

556 def addConfigFile(self, label: str, filename: str) -> None: 

557 """Add overrides from a specified file. 

558 

559 Parameters 

560 ---------- 

561 label : `str` 

562 The label used to identify the task associated with config to 

563 modify 

564 filename : `str` 

565 Path to the override file. 

566 """ 

567 self._addConfigImpl(label, pipelineIR.ConfigIR(file=[filename])) 

568 

569 def addConfigPython(self, label: str, pythonString: str) -> None: 

570 """Add Overrides by running a snippet of python code against a config. 

571 

572 Parameters 

573 ---------- 

574 label : `str` 

575 The label used to identify the task associated with config to 

576 modify. 

577 pythonString: `str` 

578 A string which is valid python code to be executed. This is done 

579 with config as the only local accessible value. 

580 """ 

581 self._addConfigImpl(label, pipelineIR.ConfigIR(python=pythonString)) 

582 

583 def _addConfigImpl(self, label: str, newConfig: pipelineIR.ConfigIR) -> None: 

584 if label == "parameters": 

585 if newConfig.rest.keys() - self._pipelineIR.parameters.mapping.keys(): 

586 raise ValueError("Cannot override parameters that are not defined in pipeline") 

587 self._pipelineIR.parameters.mapping.update(newConfig.rest) 

588 if newConfig.file: 

589 raise ValueError("Setting parameters section with config file is not supported") 

590 if newConfig.python: 

591 raise ValueError("Setting parameters section using python block is unsupported") 

592 return 

593 if label not in self._pipelineIR.tasks: 

594 raise LookupError(f"There are no tasks labeled '{label}' in the pipeline") 

595 self._pipelineIR.tasks[label].add_or_update_config(newConfig) 

596 

597 def toFile(self, filename: str) -> None: 

598 self._pipelineIR.to_file(filename) 

599 

600 def write_to_uri(self, uri: ResourcePathExpression) -> None: 

601 """Write the pipeline to a file or directory. 

602 

603 Parameters 

604 ---------- 

605 uri : convertible to `ResourcePath` 

606 URI to write to; may have any scheme with `ResourcePath` write 

607 support or no scheme for a local file/directory. Should have a 

608 ``.yaml`` extension. 
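
Examples
--------
A sketch with a hypothetical local path:

>>> pipeline.write_to_uri("my_pipeline.yaml")  # doctest: +SKIP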

609 """ 

610 labels = [td.label for td in self._toExpandedPipelineImpl(checkContracts=False)] 

611 self._pipelineIR.reorder_tasks(labels) 

612 self._pipelineIR.write_to_uri(uri) 

613 

614 def toExpandedPipeline(self) -> Generator[TaskDef, None, None]: 

615 """Returns a generator of TaskDefs which can be used to create quantum 

616 graphs. 

617 

618 Returns 

619 ------- 

620 generator : generator of `TaskDef` 

621 The generator returned will be the sorted iterator of tasks which 

622 are to be used in constructing a quantum graph. 

623 

624 Raises 

625 ------ 

626 NotImplementedError 

627 If a dataId is supplied in a config block. This is in place for 

628 future use 
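
Examples
--------
A sketch; iterating the expanded pipeline yields `TaskDef` objects in
dependency-sorted order (the labels shown are hypothetical):

>>> [task_def.label for task_def in pipeline.toExpandedPipeline()]  # doctest: +SKIP
['isr', 'calibrate']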

629 """ 

630 yield from self._toExpandedPipelineImpl() 

631 

632 def _toExpandedPipelineImpl(self, checkContracts=True) -> Iterable[TaskDef]: 

633 taskDefs = [] 

634 for label in self._pipelineIR.tasks: 

635 taskDefs.append(self._buildTaskDef(label)) 

636 

637 # let's evaluate the contracts 

638 if self._pipelineIR.contracts is not None: 

639 label_to_config = {x.label: x.config for x in taskDefs} 

640 for contract in self._pipelineIR.contracts: 

641 # execute this in its own line so it can raise a good error 

642 # message if there were problems with the eval 

643 success = eval(contract.contract, None, label_to_config) 

644 if not success: 

645 extra_info = f": {contract.msg}" if contract.msg is not None else "" 

646 raise pipelineIR.ContractError( 

647 f"Contract(s) '{contract.contract}' were not satisfied{extra_info}" 

648 ) 

649 

650 taskDefs = sorted(taskDefs, key=lambda x: x.label) 

651 yield from pipeTools.orderPipeline(taskDefs) 

652 

653 def _buildTaskDef(self, label: str) -> TaskDef: 

654 if (taskIR := self._pipelineIR.tasks.get(label)) is None: 

655 raise NameError(f"Label {label} does not appear in this pipeline") 

656 taskClass = doImport(taskIR.klass) 

657 taskName = taskClass.__qualname__ 

658 config = taskClass.ConfigClass() 

659 overrides = ConfigOverrides() 

660 if self._pipelineIR.instrument is not None: 

661 overrides.addInstrumentOverride(self._pipelineIR.instrument, taskClass._DefaultName) 

662 if taskIR.config is not None: 

663 for configIR in (configIr.formatted(self._pipelineIR.parameters) for configIr in taskIR.config): 

664 if configIR.dataId is not None: 

665 raise NotImplementedError( 

666 "Specializing a config on a partial data id is not yet " 

667 "supported in Pipeline definition" 

668 ) 

669 # only apply override if it applies to everything 

670 if configIR.dataId is None: 

671 if configIR.file: 

672 for configFile in configIR.file: 

673 overrides.addFileOverride(os.path.expandvars(configFile)) 

674 if configIR.python is not None: 

675 overrides.addPythonOverride(configIR.python) 

676 for key, value in configIR.rest.items(): 

677 overrides.addValueOverride(key, value) 

678 overrides.applyTo(config) 

679 return TaskDef(taskName=taskName, config=config, taskClass=taskClass, label=label) 

680 

681 def __iter__(self) -> Generator[TaskDef, None, None]: 

682 return self.toExpandedPipeline() 

683 

684 def __getitem__(self, item: str) -> TaskDef: 

685 return self._buildTaskDef(item) 

686 

687 def __len__(self): 

688 return len(self._pipelineIR.tasks) 

689 

690 def __eq__(self, other: object): 

691 if not isinstance(other, Pipeline): 

692 return False 

693 return self._pipelineIR == other._pipelineIR 

694 

695 

696@dataclass(frozen=True) 

697class TaskDatasetTypes: 

698 """An immutable struct that extracts and classifies the dataset types used 

699 by a `PipelineTask` 

700 """ 

701 

702 initInputs: NamedValueSet[DatasetType] 

703 """Dataset types that are needed as inputs in order to construct this Task. 

704 

705 Task-level `initInputs` may be classified as either 

706 `~PipelineDatasetTypes.initInputs` or 

707 `~PipelineDatasetTypes.initIntermediates` at the Pipeline level. 

708 """ 

709 

710 initOutputs: NamedValueSet[DatasetType] 

711 """Dataset types that may be written after constructing this Task. 

712 

713 Task-level `initOutputs` may be classified as either 

714 `~PipelineDatasetTypes.initOutputs` or 

715 `~PipelineDatasetTypes.initIntermediates` at the Pipeline level. 

716 """ 

717 

718 inputs: NamedValueSet[DatasetType] 

719 """Dataset types that are regular inputs to this Task. 

720 

721 If an input dataset needed for a Quantum cannot be found in the input 

722 collection(s) or produced by another Task in the Pipeline, that Quantum 

723 (and all dependent Quanta) will not be produced. 

724 

725 Task-level `inputs` may be classified as either 

726 `~PipelineDatasetTypes.inputs` or `~PipelineDatasetTypes.intermediates` 

727 at the Pipeline level. 

728 """ 

729 

730 prerequisites: NamedValueSet[DatasetType] 

731 """Dataset types that are prerequisite inputs to this Task. 

732 

733 Prerequisite inputs must exist in the input collection(s) before the 

734 pipeline is run, but do not constrain the graph - if a prerequisite is 

735 missing for a Quantum, `PrerequisiteMissingError` is raised. 

736 

737 Prerequisite inputs are not resolved until the second stage of 

738 QuantumGraph generation. 

739 """ 

740 

741 outputs: NamedValueSet[DatasetType] 

742 """Dataset types that are produced by this Task. 

743 

744 Task-level `outputs` may be classified as either 

745 `~PipelineDatasetTypes.outputs` or `~PipelineDatasetTypes.intermediates` 

746 at the Pipeline level. 

747 """ 

748 

749 @classmethod 

750 def fromTaskDef( 

751 cls, 

752 taskDef: TaskDef, 

753 *, 

754 registry: Registry, 

755 include_configs: bool = True, 

756 storage_class_mapping: Optional[Mapping[str, str]] = None, 

757 ) -> TaskDatasetTypes: 

758 """Extract and classify the dataset types from a single `PipelineTask`. 

759 

760 Parameters 

761 ---------- 

762 taskDef: `TaskDef` 

763 An instance of a `TaskDef` class for a particular `PipelineTask`. 

764 registry: `Registry` 

765 Registry used to construct normalized `DatasetType` objects and 

766 retrieve those that are incomplete. 

767 include_configs : `bool`, optional 

768 If `True` (default) include config dataset types as 

769 ``initOutputs``. 

770 storage_class_mapping : `Mapping` of `str` to `StorageClass`, optional 

771 If a taskdef contains a component dataset type that is unknown 

772 to the registry, its parent StorageClass will be looked up in this 

773 mapping if it is supplied. If the mapping does not contain the 

774 composite dataset type, or the mapping is not supplied, an exception 

775 will be raised. 

776 

777 Returns 

778 ------- 

779 types: `TaskDatasetTypes` 

780 The dataset types used by this task. 

781 

782 Raises 

783 ------ 

784 ValueError 

785 Raised if dataset type connection definition differs from 

786 registry definition. 

787 LookupError 

788 Raised if component parent StorageClass could not be determined 

789 and storage_class_mapping does not contain the composite type, or 

790 is set to None. 
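
Examples
--------
A sketch, assuming ``task_def`` is a `TaskDef` and ``butler`` is a
`~lsst.daf.butler.Butler` with the relevant dataset types registered:

>>> types = TaskDatasetTypes.fromTaskDef(task_def, registry=butler.registry)  # doctest: +SKIP
>>> types.inputs.names  # doctest: +SKIP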

791 """ 

792 

793 def makeDatasetTypesSet(connectionType: str, freeze: bool = True) -> NamedValueSet[DatasetType]: 

794 """Constructs a set of true `DatasetType` objects 

795 

796 Parameters 

797 ---------- 

798 connectionType : `str` 

799 Name of the connection type to produce a set for, corresponds 

800 to an attribute of type `list` on the connection class instance 

801 freeze : `bool`, optional 

802 If `True`, call `NamedValueSet.freeze` on the object returned. 

803 

804 Returns 

805 ------- 

806 datasetTypes : `NamedValueSet` 

807 A set of all datasetTypes which correspond to the input 

808 connection type specified in the connection class of this 

809 `PipelineTask` 

810 

811 Raises 

812 ------ 

813 ValueError 

814 Raised if dataset type connection definition differs from 

815 registry definition. 

816 LookupError 

817 Raised if component parent StorageClass could not be determined 

818 and storage_class_mapping does not contain the composite type, 

819 or is set to None. 

820 

821 Notes 

822 ----- 

823 This function is a closure over the variables ``registry``, 

824 ``taskDef``, and ``storage_class_mapping``. 

825 """ 

826 datasetTypes = NamedValueSet() 

827 for c in iterConnections(taskDef.connections, connectionType): 

828 dimensions = set(getattr(c, "dimensions", set())) 

829 if "skypix" in dimensions: 

830 try: 

831 datasetType = registry.getDatasetType(c.name) 

832 except LookupError as err: 

833 raise LookupError( 

834 f"DatasetType '{c.name}' referenced by " 

835 f"{type(taskDef.connections).__name__} uses 'skypix' as a dimension " 

836 f"placeholder, but does not already exist in the registry. " 

837 f"Note that reference catalog names are now used as the dataset " 

838 f"type name instead of 'ref_cat'." 

839 ) from err 

840 rest1 = set(registry.dimensions.extract(dimensions - set(["skypix"])).names) 

841 rest2 = set( 

842 dim.name for dim in datasetType.dimensions if not isinstance(dim, SkyPixDimension) 

843 ) 

844 if rest1 != rest2: 

845 raise ValueError( 

846 f"Non-skypix dimensions for dataset type {c.name} declared in " 

847 f"connections ({rest1}) are inconsistent with those in " 

848 f"registry's version of this dataset ({rest2})." 

849 ) 

850 else: 

851 # Component dataset types are not explicitly in the 

852 # registry. This complicates consistency checks with 

853 # registry and requires we work out the composite storage 

854 # class. 

855 registryDatasetType = None 

856 try: 

857 registryDatasetType = registry.getDatasetType(c.name) 

858 except KeyError: 

859 compositeName, componentName = DatasetType.splitDatasetTypeName(c.name) 

860 if componentName: 

861 if storage_class_mapping is None or compositeName not in storage_class_mapping: 

862 raise LookupError( 

863 "Component parent class cannot be determined, and " 

864 "composite name was not in storage class mapping, or no " 

865 "storage_class_mapping was supplied" 

866 ) 

867 else: 

868 parentStorageClass = storage_class_mapping[compositeName] 

869 else: 

870 parentStorageClass = None 

871 datasetType = c.makeDatasetType( 

872 registry.dimensions, parentStorageClass=parentStorageClass 

873 ) 

874 registryDatasetType = datasetType 

875 else: 

876 datasetType = c.makeDatasetType( 

877 registry.dimensions, parentStorageClass=registryDatasetType.parentStorageClass 

878 ) 

879 

880 if registryDatasetType and datasetType != registryDatasetType: 

881 try: 

882 # Explicitly check for storage class just to make 

883 # more specific message. 

884 _ = datasetType.storageClass 

885 except KeyError: 

886 raise ValueError( 

887 "Storage class does not exist for supplied dataset type " 

888 f"{datasetType} for {taskDef.label}." 

889 ) from None 

890 raise ValueError( 

891 f"Supplied dataset type ({datasetType}) inconsistent with " 

892 f"registry definition ({registryDatasetType}) " 

893 f"for {taskDef.label}." 

894 ) 

895 datasetTypes.add(datasetType) 

896 if freeze: 

897 datasetTypes.freeze() 

898 return datasetTypes 

899 

900 # optionally add initOutput dataset for config 

901 initOutputs = makeDatasetTypesSet("initOutputs", freeze=False) 

902 if include_configs: 

903 initOutputs.add( 

904 DatasetType( 

905 taskDef.configDatasetName, 

906 registry.dimensions.empty, 

907 storageClass="Config", 

908 ) 

909 ) 

910 initOutputs.freeze() 

911 

912 # optionally add output dataset for metadata 

913 outputs = makeDatasetTypesSet("outputs", freeze=False) 

914 if taskDef.metadataDatasetName is not None: 

915 # Metadata is supposed to be of the TaskMetadata type; its 

916 # dimensions correspond to a task quantum. 

917 dimensions = registry.dimensions.extract(taskDef.connections.dimensions) 

918 

919 # Allow the storage class definition to be read from the existing 

920 # dataset type definition if present. 

921 try: 

922 current = registry.getDatasetType(taskDef.metadataDatasetName) 

923 except KeyError: 

924 # No previous definition so use the default. 

925 storageClass = "TaskMetadata" if _TASK_METADATA_TYPE is TaskMetadata else "PropertySet" 

926 else: 

927 storageClass = current.storageClass.name 

928 

929 outputs |= {DatasetType(taskDef.metadataDatasetName, dimensions, storageClass)} 

930 if taskDef.logOutputDatasetName is not None: 

931 # Log output dimensions correspond to a task quantum. 

932 dimensions = registry.dimensions.extract(taskDef.connections.dimensions) 

933 outputs |= {DatasetType(taskDef.logOutputDatasetName, dimensions, "ButlerLogRecords")} 

934 

935 outputs.freeze() 

936 

937 return cls( 

938 initInputs=makeDatasetTypesSet("initInputs"), 

939 initOutputs=initOutputs, 

940 inputs=makeDatasetTypesSet("inputs"), 

941 prerequisites=makeDatasetTypesSet("prerequisiteInputs"), 

942 outputs=outputs, 

943 ) 

944 

945 

946@dataclass(frozen=True) 

947class PipelineDatasetTypes: 

948 """An immutable struct that classifies the dataset types used in a 

949 `Pipeline`. 

950 """ 

951 

952 packagesDatasetName: ClassVar[str] = "packages" 

953 """Name of a dataset type used to save package versions. 

954 """ 

955 

956 initInputs: NamedValueSet[DatasetType] 

957 """Dataset types that are needed as inputs in order to construct the Tasks 

958 in this Pipeline. 

959 

960 This does not include dataset types that are produced when constructing 

961 other Tasks in the Pipeline (these are classified as `initIntermediates`). 

962 """ 

963 

964 initOutputs: NamedValueSet[DatasetType] 

965 """Dataset types that may be written after constructing the Tasks in this 

966 Pipeline. 

967 

968 This does not include dataset types that are also used as inputs when 

969 constructing other Tasks in the Pipeline (these are classified as 

970 `initIntermediates`). 

971 """ 

972 

973 initIntermediates: NamedValueSet[DatasetType] 

974 """Dataset types that are both used when constructing one or more Tasks 

975 in the Pipeline and produced as a side-effect of constructing another 

976 Task in the Pipeline. 

977 """ 

978 

979 inputs: NamedValueSet[DatasetType] 

980 """Dataset types that are regular inputs for the full pipeline. 

981 

982 If an input dataset needed for a Quantum cannot be found in the input 

983 collection(s), that Quantum (and all dependent Quanta) will not be 

984 produced. 

985 """ 

986 

987 prerequisites: NamedValueSet[DatasetType] 

988 """Dataset types that are prerequisite inputs for the full Pipeline. 

989 

990 Prerequisite inputs must exist in the input collection(s) before the 

991 pipeline is run, but do not constrain the graph - if a prerequisite is 

992 missing for a Quantum, `PrerequisiteMissingError` is raised. 

993 

994 Prerequisite inputs are not resolved until the second stage of 

995 QuantumGraph generation. 

996 """ 

997 

998 intermediates: NamedValueSet[DatasetType] 

999 """Dataset types that are output by one Task in the Pipeline and consumed 

1000 as inputs by one or more other Tasks in the Pipeline. 

1001 """ 

1002 

1003 outputs: NamedValueSet[DatasetType] 

1004 """Dataset types that are output by a Task in the Pipeline and not consumed 

1005 by any other Task in the Pipeline. 

1006 """ 

1007 

1008 byTask: Mapping[str, TaskDatasetTypes] 

1009 """Per-Task dataset types, keyed by label in the `Pipeline`. 

1010 

1011 This is guaranteed to be zip-iterable with the `Pipeline` itself (assuming 

1012 neither has been modified since the dataset types were extracted, of 

1013 course). 

1014 """ 

1015 

1016 @classmethod 

1017 def fromPipeline( 

1018 cls, 

1019 pipeline: Union[Pipeline, Iterable[TaskDef]], 

1020 *, 

1021 registry: Registry, 

1022 include_configs: bool = True, 

1023 include_packages: bool = True, 

1024 ) -> PipelineDatasetTypes: 

1025 """Extract and classify the dataset types from all tasks in a 

1026 `Pipeline`. 

1027 

1028 Parameters 

1029 ---------- 

1030 pipeline: `Pipeline` or `Iterable` [ `TaskDef` ] 

1031 A collection of tasks that can be run together. 

1032 registry: `Registry` 

1033 Registry used to construct normalized `DatasetType` objects and 

1034 retrieve those that are incomplete. 

1035 include_configs : `bool`, optional 

1036 If `True` (default) include config dataset types as 

1037 ``initOutputs``. 

1038 include_packages : `bool`, optional 

1039 If `True` (default) include the dataset type for software package 

1040 versions in ``initOutputs``. 

1041 

1042 Returns 

1043 ------- 

1044 types: `PipelineDatasetTypes` 

1045 The dataset types used by this `Pipeline`. 

1046 

1047 Raises 

1048 ------ 

1049 ValueError 

1050 Raised if Tasks are inconsistent about which datasets are marked 

1051 prerequisite. This indicates that the Tasks cannot be run as part 

1052 of the same `Pipeline`. 
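
Examples
--------
A sketch, assuming ``pipeline`` is a `Pipeline`, ``butler`` is a
`~lsst.daf.butler.Butler`, and "isr" is a hypothetical task label:

>>> dataset_types = PipelineDatasetTypes.fromPipeline(pipeline, registry=butler.registry)  # doctest: +SKIP
>>> dataset_types.inputs.names  # doctest: +SKIP
>>> dataset_types.byTask["isr"].outputs  # doctest: +SKIP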

1053 """ 

1054 allInputs = NamedValueSet() 

1055 allOutputs = NamedValueSet() 

1056 allInitInputs = NamedValueSet() 

1057 allInitOutputs = NamedValueSet() 

1058 prerequisites = NamedValueSet() 

1059 byTask = dict() 

1060 if include_packages: 

1061 allInitOutputs.add( 

1062 DatasetType( 

1063 cls.packagesDatasetName, 

1064 registry.dimensions.empty, 

1065 storageClass="Packages", 

1066 ) 

1067 ) 

1068 # create a list of TaskDefs in case the input is a generator 

1069 pipeline = list(pipeline) 

1070 

1071 # collect all the output dataset types 

1072 typeStorageclassMap: Dict[str, str] = {} 

1073 for taskDef in pipeline: 

1074 for outConnection in iterConnections(taskDef.connections, "outputs"): 

1075 typeStorageclassMap[outConnection.name] = outConnection.storageClass 

1076 

1077 for taskDef in pipeline: 

1078 thisTask = TaskDatasetTypes.fromTaskDef( 

1079 taskDef, 

1080 registry=registry, 

1081 include_configs=include_configs, 

1082 storage_class_mapping=typeStorageclassMap, 

1083 ) 

1084 allInitInputs |= thisTask.initInputs 

1085 allInitOutputs |= thisTask.initOutputs 

1086 allInputs |= thisTask.inputs 

1087 prerequisites |= thisTask.prerequisites 

1088 allOutputs |= thisTask.outputs 

1089 byTask[taskDef.label] = thisTask 

1090 if not prerequisites.isdisjoint(allInputs): 

1091 raise ValueError( 

1092 "{} marked as both prerequisites and regular inputs".format( 

1093 {dt.name for dt in allInputs & prerequisites} 

1094 ) 

1095 ) 

1096 if not prerequisites.isdisjoint(allOutputs): 

1097 raise ValueError( 

1098 "{} marked as both prerequisites and outputs".format( 

1099 {dt.name for dt in allOutputs & prerequisites} 

1100 ) 

1101 ) 

1102 # Make sure that components which are marked as inputs get treated as 

1103 # intermediates if there is an output which produces the composite 

1104 # containing the component 

1105 intermediateComponents = NamedValueSet() 

1106 intermediateComposites = NamedValueSet() 

1107 outputNameMapping = {dsType.name: dsType for dsType in allOutputs} 

1108 for dsType in allInputs: 

1109 # get the name of a possible component 

1110 name, component = dsType.nameAndComponent() 

1111 # if there is a component name, that means this is a component 

1112 # DatasetType, if there is an output which produces the parent of 

1113 # this component, treat this input as an intermediate 

1114 if component is not None: 

1115 # This needs to be in this if block, because someone might have 

1116 # a composite that is a pure input from existing data 

1117 if name in outputNameMapping: 

1118 intermediateComponents.add(dsType) 

1119 intermediateComposites.add(outputNameMapping[name]) 

1120 

1121 def checkConsistency(a: NamedValueSet, b: NamedValueSet): 

1122 common = a.names & b.names 

1123 for name in common: 

1124 if a[name] != b[name]: 

1125 raise ValueError(f"Conflicting definitions for dataset type: {a[name]} != {b[name]}.") 

1126 

1127 checkConsistency(allInitInputs, allInitOutputs) 

1128 checkConsistency(allInputs, allOutputs) 

1129 checkConsistency(allInputs, intermediateComposites) 

1130 checkConsistency(allOutputs, intermediateComposites) 

1131 

1132 def frozen(s: NamedValueSet) -> NamedValueSet: 

1133 s.freeze() 

1134 return s 

1135 

1136 return cls( 

1137 initInputs=frozen(allInitInputs - allInitOutputs), 

1138 initIntermediates=frozen(allInitInputs & allInitOutputs), 

1139 initOutputs=frozen(allInitOutputs - allInitInputs), 

1140 inputs=frozen(allInputs - allOutputs - intermediateComponents), 

1141 intermediates=frozen(allInputs & allOutputs | intermediateComponents), 

1142 outputs=frozen(allOutputs - allInputs - intermediateComposites), 

1143 prerequisites=frozen(prerequisites), 

1144 byTask=MappingProxyType(byTask), # MappingProxyType -> frozen view of dict for immutability 

1145 ) 

1146 

1147 @classmethod 

1148 def initOutputNames( 

1149 cls, 

1150 pipeline: Union[Pipeline, Iterable[TaskDef]], 

1151 *, 

1152 include_configs: bool = True, 

1153 include_packages: bool = True, 

1154 ) -> Iterator[str]: 

1155 """Return the names of dataset types ot task initOutputs, Configs, 

1156 and package versions for a pipeline. 

1157 

1158 Parameters 

1159 ---------- 

1160 pipeline: `Pipeline` or `Iterable` [ `TaskDef` ] 

1161 A `Pipeline` instance or collection of `TaskDef` instances. 

1162 include_configs : `bool`, optional 

1163 If `True` (default) include config dataset types. 

1164 include_packages : `bool`, optional 

1165 If `True` (default) include the dataset type for package versions. 

1166 

1167 Yields 

1168 ------ 

1169 datasetTypeName : `str` 

1170 Name of the dataset type. 
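
Examples
--------
A sketch, assuming ``pipeline`` is a `Pipeline`:

>>> names = set(PipelineDatasetTypes.initOutputNames(pipeline))  # doctest: +SKIP
>>> "packages" in names  # doctest: +SKIP
True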

1171 """ 

1172 if include_packages: 

1173 # Package versions dataset type 

1174 yield cls.packagesDatasetName 

1175 

1176 if isinstance(pipeline, Pipeline): 

1177 pipeline = pipeline.toExpandedPipeline() 

1178 

1179 for taskDef in pipeline: 

1180 

1181 # all task InitOutputs 

1182 for name in taskDef.connections.initOutputs: 

1183 attribute = getattr(taskDef.connections, name) 

1184 yield attribute.name 

1185 

1186 # config dataset name 

1187 if include_configs: 

1188 yield taskDef.configDatasetName