Coverage for python/lsst/pipe/base/pipeline.py: 19%


384 statements  

1# This file is part of pipe_base. 

2# 

3# Developed for the LSST Data Management System. 

4# This product includes software developed by the LSST Project 

5# (http://www.lsst.org). 

6# See the COPYRIGHT file at the top-level directory of this distribution 

7# for details of code ownership. 

8# 

9# This program is free software: you can redistribute it and/or modify 

10# it under the terms of the GNU General Public License as published by 

11# the Free Software Foundation, either version 3 of the License, or 

12# (at your option) any later version. 

13# 

14# This program is distributed in the hope that it will be useful, 

15# but WITHOUT ANY WARRANTY; without even the implied warranty of 

16# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the 

17# GNU General Public License for more details. 

18# 

19# You should have received a copy of the GNU General Public License 

20# along with this program. If not, see <http://www.gnu.org/licenses/>. 

21from __future__ import annotations 

22 

23"""Module defining Pipeline class and related methods. 

24""" 

25 

26__all__ = ["Pipeline", "TaskDef", "TaskDatasetTypes", "PipelineDatasetTypes", "LabelSpecifier"] 

27 

28# ------------------------------- 

29# Imports of standard modules -- 

30# ------------------------------- 

31from dataclasses import dataclass 

32import logging 

33from types import MappingProxyType 

34from typing import (ClassVar, Dict, Iterable, Iterator, Mapping, Set, Union, 

35 Generator, TYPE_CHECKING, Optional, Tuple) 

36 

37import copy 

38import re 

39import os 

40import urllib.parse 

41import warnings 

42 

43# ----------------------------- 

44# Imports for other modules -- 

45from lsst.daf.butler import DatasetType, NamedValueSet, Registry, SkyPixDimension, ButlerURI 

46from lsst.utils import doImport 

47from .configOverrides import ConfigOverrides 

48from .connections import iterConnections 

49from .pipelineTask import PipelineTask 

50from .task import _TASK_METADATA_TYPE 

51from ._task_metadata import TaskMetadata 

52 

53from . import pipelineIR 

54from . import pipeTools 

55 

56if TYPE_CHECKING: # Imports needed only for type annotations; may be circular. 

57 from lsst.obs.base import Instrument 

58 

59# ---------------------------------- 

60# Local non-exported definitions -- 

61# ---------------------------------- 

62 

63_LOG = logging.getLogger(__name__) 

64 

65# ------------------------ 

66# Exported definitions -- 

67# ------------------------ 

68 

69 

70@dataclass 

71class LabelSpecifier: 

72 """A structure to specify a subset of labels to load 

73 

74 This structure may contain a set of labels to be used in subsetting a 

75 pipeline, or a beginning and end point. Beginning or end may be empty, 

76 in which case the range will be a half open interval. Unlike python 

77 iteration bounds, end bounds are *INCLUDED*. Note that range based 

78 selection is not well defined for pipelines that are not linear in nature, 

79 and correct behavior is not guaranteed, or may vary from run to run. 

80 """ 

81 labels: Optional[Set[str]] = None 

82 begin: Optional[str] = None 

83 end: Optional[str] = None 

84 

85 def __post_init__(self): 

86 if self.labels is not None and (self.begin or self.end): 

87 raise ValueError("This struct can only be initialized with a labels set or " 

88 "a begin (and/or) end specifier") 

89 
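# A minimal usage sketch for `LabelSpecifier` (the label names below are purely
# illustrative): it holds either an explicit set of labels or an inclusive
# begin/end range, never both.
#
#     spec = LabelSpecifier(labels={"isr", "calibrate"})
#     spec = LabelSpecifier(begin="isr", end="calibrate")
#     LabelSpecifier(labels={"isr"}, begin="isr")  # raises ValueError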

90 

91class TaskDef: 

92 """TaskDef is a collection of information about task needed by Pipeline. 

93 

94 The information includes task name, configuration object and optional 

95 task class. This class is just a collection of attributes and it exposes 

96 all of them so that attributes could potentially be modified in place 

97 (e.g. if configuration needs extra overrides). 

98 

99 Attributes 

100 ---------- 

101 taskName : `str`, optional 

102 `PipelineTask` class name, currently it is not specified whether this 

103 is a fully-qualified name or partial name (e.g. ``module.TaskClass``). 

104 Framework should be prepared to handle all cases. If not provided, 

105 ``taskClass`` must be, and ``taskClass.__name__`` is used. 

106 config : `lsst.pex.config.Config`, optional 

107 Instance of the configuration class corresponding to this task class, 

108 usually with all overrides applied. This config will be frozen. If 

109 not provided, ``taskClass`` must be provided and 

110 ``taskClass.ConfigClass()`` will be used. 

111 taskClass : `type`, optional 

112 `PipelineTask` class object, can be ``None``. If ``None`` then 

113 framework will have to locate and load class. 

114 label : `str`, optional 

115 Task label, usually a short string unique in a pipeline. If not 

116 provided, ``taskClass`` must be, and ``taskClass._DefaultName`` will 

117 be used. 

118 """ 

119 def __init__(self, taskName=None, config=None, taskClass=None, label=None): 

120 if taskName is None: 

121 if taskClass is None: 

122 raise ValueError("At least one of `taskName` and `taskClass` must be provided.") 

123 taskName = taskClass.__name__ 

124 if config is None: 

125 if taskClass is None: 

126 raise ValueError("`taskClass` must be provided if `config` is not.") 

127 config = taskClass.ConfigClass() 

128 if label is None: 

129 if taskClass is None: 

130 raise ValueError("`taskClass` must be provided if `label` is not.") 

131 label = taskClass._DefaultName 

132 self.taskName = taskName 

133 try: 

134 config.validate() 

135 except Exception: 

136 _LOG.error("Configuration validation failed for task %s (%s)", label, taskName) 

137 raise 

138 config.freeze() 

139 self.config = config 

140 self.taskClass = taskClass 

141 self.label = label 

142 self.connections = config.connections.ConnectionsClass(config=config) 

143 

144 @property 

145 def configDatasetName(self) -> str: 

146 """Name of a dataset type for configuration of this task (`str`) 

147 """ 

148 return self.label + "_config" 

149 

150 @property 

151 def metadataDatasetName(self) -> Optional[str]: 

152 """Name of a dataset type for metadata of this task, `None` if 

153 metadata is not to be saved (`str`) 

154 """ 

155 if self.config.saveMetadata: 

156 return self.label + "_metadata" 

157 else: 

158 return None 

159 

160 @property 

161 def logOutputDatasetName(self) -> Optional[str]: 

162 """Name of a dataset type for log output from this task, `None` if 

163 logs are not to be saved (`str`) 

164 """ 

165 if self.config.saveLogOutput: 

166 return self.label + "_log" 

167 else: 

168 return None 

169 

170 def __str__(self): 

171 rep = "TaskDef(" + self.taskName 

172 if self.label: 

173 rep += ", label=" + self.label 

174 rep += ")" 

175 return rep 

176 

177 def __eq__(self, other: object) -> bool: 

178 if not isinstance(other, TaskDef): 

179 return False 

180 # This does not consider equality of configs when determining equality 

181 # as config equality is a difficult thing to define. Should be updated 

182 # after DM-27847 

183 return self.taskClass == other.taskClass and self.label == other.label 

184 

185 def __hash__(self): 

186 return hash((self.taskClass, self.label)) 

187 
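# A hedged sketch of constructing a TaskDef directly; ``MyTask`` stands in for
# any concrete `PipelineTask` subclass and is hypothetical.
#
#     taskDef = TaskDef(taskClass=MyTask, label="myTask")
#     taskDef.configDatasetName    # -> "myTask_config"
#     taskDef.metadataDatasetName  # -> "myTask_metadata" (or None)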

188 

189class Pipeline: 

190 """A `Pipeline` is a representation of a series of tasks to run, and the 

191 configuration for those tasks. 

192 

193 Parameters 

194 ---------- 

195 description : `str` 

196 A description of what this pipeline does. 

197 """ 

198 def __init__(self, description: str): 

199 pipeline_dict = {"description": description, "tasks": {}} 

200 self._pipelineIR = pipelineIR.PipelineIR(pipeline_dict) 

201 

202 @classmethod 

203 def fromFile(cls, filename: str) -> Pipeline: 

204 """Load a pipeline defined in a pipeline yaml file. 

205 

206 Parameters 

207 ---------- 

208 filename: `str` 

209 A path that points to a pipeline defined in yaml format. This 

210 filename may also supply additional labels to be used in 

211 subsetting the loaded Pipeline. These labels are separated from 

212 the path by a \\#, and may be specified as a comma separated 

213 list, or a range denoted as beginning..end. Beginning or end may 

214 be empty, in which case the range will be a half open interval. 

215 Unlike python iteration bounds, end bounds are *INCLUDED*. Note 

216 that range based selection is not well defined for pipelines that 

217 are not linear in nature, and correct behavior is not guaranteed, 

218 or may vary from run to run. 

219 

220 Returns 

221 ------- 

222 pipeline: `Pipeline` 

223 The pipeline loaded from specified location with appropriate (if 

224 any) subsetting 

225 

226 Notes 

227 ----- 

228 This method attempts to prune any contracts that contain labels which 

229 are not in the declared subset of labels. This pruning is done using a 

230 string based matching due to the nature of contracts and may prune more 

231 than it should. 

232 """ 

233 return cls.from_uri(filename) 

234 

235 @classmethod 

236 def from_uri(cls, uri: Union[str, ButlerURI]) -> Pipeline: 

237 """Load a pipeline defined in a pipeline yaml file at a location 

238 specified by a URI. 

239 

240 Parameters 

241 ---------- 

242 uri: `str` or `ButlerURI` 

243 If a string is supplied this should be a URI path that points to a 

244 pipeline defined in yaml format. This uri may also supply 

245 additional labels to be used in subsetting the loaded Pipeline. 

246 These labels are separated from the path by a \\#, and may be 

247 specified as a comma separated list, or a range denoted as 

248 beginning..end. Beginning or end may be empty, in which case the 

249 range will be a half open interval. Unlike python iteration 

250 bounds, end bounds are *INCLUDED*. Note that range based selection 

251 is not well defined for pipelines that are not linear in nature, 

252 and correct behavior is not guaranteed, or may vary from run to 

253 run. The same specifiers can be used with a ButlerURI object, by 

254 being the sole contents of the fragment attribute. 

255 

256 Returns 

257 ------- 

258 pipeline: `Pipeline` 

259 The pipeline loaded from specified location with appropriate (if 

260 any) subsetting 

261 

262 Notes 

263 ----- 

264 This method attempts to prune any contracts that contain labels which 

265 are not in the declared subset of labels. This pruning is done using a 

266 string based matching due to the nature of contracts and may prune more 

267 than it should. 

268 """ 

269 # Split up the uri and any labels that were supplied 

270 uri, label_specifier = cls._parse_file_specifier(uri) 

271 pipeline: Pipeline = cls.fromIR(pipelineIR.PipelineIR.from_uri(uri)) 

272 

273 # If there are labels supplied, only keep those 

274 if label_specifier is not None: 

275 pipeline = pipeline.subsetFromLabels(label_specifier) 

276 return pipeline 
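        # Illustrative sketch (the file path and labels are hypothetical): the
        # URI fragment selects a subset of task labels, either as a
        # comma-separated list or an inclusive begin..end range.
        #
        #     p = Pipeline.from_uri("pipelines/example.yaml")
        #     p = Pipeline.from_uri("pipelines/example.yaml#isr,calibrate")
        #     p = Pipeline.from_uri("pipelines/example.yaml#isr..calibrate")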

277 

278 def subsetFromLabels(self, labelSpecifier: LabelSpecifier) -> Pipeline: 

279 """Subset a pipeline to contain only labels specified in labelSpecifier 

280 

281 Parameters 

282 ---------- 

283 labelSpecifier : `labelSpecifier` 

284 Object containing labels that describes how to subset a pipeline. 

285 

286 Returns 

287 ------- 

288 pipeline : `Pipeline` 

289 A new pipeline object that is a subset of the old pipeline 

290 

291 Raises 

292 ------ 

293 ValueError 

294 Raised if there is an issue with specified labels 

295 

296 Notes 

297 ----- 

298 This method attempts to prune any contracts that contain labels which 

299 are not in the declared subset of labels. This pruning is done using a 

300 string based matching due to the nature of contracts and may prune more 

301 than it should. 

302 """ 

303 # Labels supplied as a set 

304 if labelSpecifier.labels: 

305 labelSet = labelSpecifier.labels 

306 # Labels supplied as a range, first create a list of all the labels 

307 # in the pipeline sorted according to task dependency. Then only 

308 # keep labels that lie between the supplied bounds 

309 else: 

310 # Create a copy of the pipeline to use when assessing the label 

311 # ordering. Use a dict for fast searching while preserving order. 

312 # Remove contracts so they do not fail in the expansion step. This 

313 # is needed because a user may only configure the tasks they intend 

314 # to run, which may cause some contracts to fail if they will later 

315 # be dropped 

316 pipeline = copy.deepcopy(self) 

317 pipeline._pipelineIR.contracts = [] 

318 labels = {taskdef.label: True for taskdef in pipeline.toExpandedPipeline()} 

319 

320 # Verify the bounds are in the labels 

321 if labelSpecifier.begin is not None: 

322 if labelSpecifier.begin not in labels: 

323 raise ValueError(f"Beginning of range subset, {labelSpecifier.begin}, not found in " 

324 "pipeline definition") 

325 if labelSpecifier.end is not None: 

326 if labelSpecifier.end not in labels: 

327 raise ValueError(f"End of range subset, {labelSpecifier.end}, not found in pipeline " 

328 "definition") 

329 

330 labelSet = set() 

331 for label in labels: 

332 if labelSpecifier.begin is not None: 

333 if label != labelSpecifier.begin: 

334 continue 

335 else: 

336 labelSpecifier.begin = None 

337 labelSet.add(label) 

338 if labelSpecifier.end is not None and label == labelSpecifier.end: 

339 break 

340 return Pipeline.fromIR(self._pipelineIR.subset_from_labels(labelSet)) 
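        # Hedged sketch (labels are illustrative): keep only part of an
        # already-loaded pipeline.
        #
        #     sub = pipeline.subsetFromLabels(LabelSpecifier(labels={"isr"}))
        #     sub = pipeline.subsetFromLabels(LabelSpecifier(begin="isr"))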

341 

342 @staticmethod 

343 def _parse_file_specifier(uri: Union[str, ButlerURI] 

344 ) -> Tuple[ButlerURI, Optional[LabelSpecifier]]: 

345 """Split appart a uri and any possible label subsets 

346 """ 

347 if isinstance(uri, str): 

348 # This is to support legacy pipelines during transition 

349 uri, num_replace = re.subn("[:](?!\\/\\/)", "#", uri) 

350 if num_replace: 

351 warnings.warn(f"The pipeline file {uri} seems to use the legacy : to separate " 

352 "labels, this is deprecated and will be removed after June 2021, please use " 

353 "# instead.", 

354 category=FutureWarning) 

355 if uri.count("#") > 1: 

356 raise ValueError("Only one set of labels is allowed when specifying a pipeline to load") 

357 uri = ButlerURI(uri) 

358 label_subset = uri.fragment or None 

359 

360 specifier: Optional[LabelSpecifier] 

361 if label_subset is not None: 

362 label_subset = urllib.parse.unquote(label_subset) 

363 args: Dict[str, Union[Set[str], str, None]] 

364 # labels supplied as a list 

365 if ',' in label_subset: 

366 if '..' in label_subset: 

367 raise ValueError("Can only specify a list of labels or a range" 

368 "when loading a Pipline not both") 

369 args = {"labels": set(label_subset.split(","))} 

370 # labels supplied as a range 

371 elif '..' in label_subset: 

372 # Try to de-structure the labelSubset, this will fail if more 

373 # than one range is specified 

374 begin, end, *rest = label_subset.split("..") 

375 if rest: 

376 raise ValueError("Only one range can be specified when loading a pipeline") 

377 args = {"begin": begin if begin else None, "end": end if end else None} 

378 # Assume anything else is a single label 

379 else: 

380 args = {"labels": {label_subset}} 

381 

382 specifier = LabelSpecifier(**args) 

383 else: 

384 specifier = None 

385 

386 return uri, specifier 

387 

388 @classmethod 

389 def fromString(cls, pipeline_string: str) -> Pipeline: 

390 """Create a pipeline from string formatted as a pipeline document. 

391 

392 Parameters 

393 ---------- 

394 pipeline_string : `str` 

395 A string formatted like a pipeline document 

396 

397 Returns 

398 ------- 

399 pipeline: `Pipeline` 

400 """ 

401 pipeline = cls.fromIR(pipelineIR.PipelineIR.from_string(pipeline_string)) 

402 return pipeline 

403 

404 @classmethod 

405 def fromIR(cls, deserialized_pipeline: pipelineIR.PipelineIR) -> Pipeline: 

406 """Create a pipeline from an already created `PipelineIR` object. 

407 

408 Parameters 

409 ---------- 

410 deserialized_pipeline: `PipelineIR` 

411 An already created pipeline intermediate representation object 

412 

413 Returns 

414 ------- 

415 pipeline: `Pipeline` 

416 """ 

417 pipeline = cls.__new__(cls) 

418 pipeline._pipelineIR = deserialized_pipeline 

419 return pipeline 

420 

421 @classmethod 

422 def fromPipeline(cls, pipeline: Pipeline) -> Pipeline: 

423 """Create a new pipeline by copying an already existing `Pipeline`. 

424 

425 Parameters 

426 ---------- 

427 pipeline: `Pipeline` 

428 The existing `Pipeline` object to copy 

429 

430 Returns 

431 ------- 

432 pipeline: `Pipeline` 

433 """ 

434 return cls.fromIR(copy.deepcopy(pipeline._pipelineIR)) 

435 

436 def __str__(self) -> str: 

437 # tasks need to be sorted on each call because someone might have added 

438 # or removed a task, and caching does not seem worth the small 

439 # overhead 

440 labels = [td.label for td in self._toExpandedPipelineImpl(checkContracts=False)] 

441 self._pipelineIR.reorder_tasks(labels) 

442 return str(self._pipelineIR) 

443 

444 def addInstrument(self, instrument: Union[Instrument, str]) -> None: 

445 """Add an instrument to the pipeline, or replace an instrument that is 

446 already defined. 

447 

448 Parameters 

449 ---------- 

450 instrument : `~lsst.obs.base.Instrument` or `str` 

451 Either a derived class object of `lsst.obs.base.Instrument` or 

452 a string corresponding to a fully qualified 

453 `lsst.obs.base.Instrument` name. 

454 """ 

455 if isinstance(instrument, str): 

456 pass 

457 else: 

458 # TODO: assume that this is a subclass of Instrument, no type 

459 # checking 

460 instrument = f"{instrument.__module__}.{instrument.__qualname__}" 

461 self._pipelineIR.instrument = instrument 

462 

463 def getInstrument(self) -> Instrument: 

464 """Get the instrument from the pipeline. 

465 

466 Returns 

467 ------- 

468 instrument : `~lsst.obs.base.Instrument`, `str`, or None 

469 A derived class object of `lsst.obs.base.Instrument`, a string 

470 corresponding to a fully qualified `lsst.obs.base.Instrument` 

471 name, or None if the pipeline does not have an instrument. 

472 """ 

473 return self._pipelineIR.instrument 

474 

475 def addTask(self, task: Union[PipelineTask, str], label: str) -> None: 

476 """Add a new task to the pipeline, or replace a task that is already 

477 associated with the supplied label. 

478 

479 Parameters 

480 ---------- 

481 task: `PipelineTask` or `str` 

482 Either a derived class object of a `PipelineTask` or a string 

483 corresponding to a fully qualified `PipelineTask` name. 

484 label: `str` 

485 A label that is used to identify the `PipelineTask` being added 

486 """ 

487 if isinstance(task, str): 

488 taskName = task 

489 elif issubclass(task, PipelineTask): 

490 taskName = f"{task.__module__}.{task.__qualname__}" 

491 else: 

492 raise ValueError("task must be either a child class of PipelineTask or a string containing" 

493 " a fully qualified name to one") 

494 if not label: 

495 # in some cases (with command line-generated pipeline) tasks can 

496 # be defined without a label, which is not acceptable; use the task 

497 # _DefaultName in that case 

498 if isinstance(task, str): 

499 task = doImport(task) 

500 label = task._DefaultName 

501 self._pipelineIR.tasks[label] = pipelineIR.TaskIR(label, taskName) 

502 

503 def removeTask(self, label: str) -> None: 

504 """Remove a task from the pipeline. 

505 

506 Parameters 

507 ---------- 

508 label : `str` 

509 The label used to identify the task that is to be removed 

510 

511 Raises 

512 ------ 

513 KeyError 

514 If no task with that label exists in the pipeline 

515 

516 """ 

517 self._pipelineIR.tasks.pop(label) 

518 

519 def addConfigOverride(self, label: str, key: str, value: object) -> None: 

520 """Apply single config override. 

521 

522 Parameters 

523 ---------- 

524 label : `str` 

525 Label of the task. 

526 key: `str` 

527 Fully-qualified field name. 

528 value : object 

529 Value to be given to a field. 

530 """ 

531 self._addConfigImpl(label, pipelineIR.ConfigIR(rest={key: value})) 

532 

533 def addConfigFile(self, label: str, filename: str) -> None: 

534 """Add overrides from a specified file. 

535 

536 Parameters 

537 ---------- 

538 label : `str` 

539 The label used to identify the task associated with config to 

540 modify 

541 filename : `str` 

542 Path to the override file. 

543 """ 

544 self._addConfigImpl(label, pipelineIR.ConfigIR(file=[filename])) 

545 

546 def addConfigPython(self, label: str, pythonString: str) -> None: 

547 """Add Overrides by running a snippet of python code against a config. 

548 

549 Parameters 

550 ---------- 

551 label : `str` 

552 The label used to identify the task associated with config to 

553 modify. 

554 pythonString: `str` 

555 A string which is valid python code to be executed. This is done 

556 with config as the only local accessible value. 

557 """ 

558 self._addConfigImpl(label, pipelineIR.ConfigIR(python=pythonString)) 
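    # A hedged sketch of assembling and configuring a pipeline in code; the
    # task path, label, and config field names are hypothetical.
    #
    #     p = Pipeline("demo pipeline")
    #     p.addTask("mypackage.tasks.MyTask", "myTask")
    #     p.addConfigOverride("myTask", "doFoo", True)
    #     p.addConfigFile("myTask", "overrides/myTask.py")
    #     p.addConfigPython("myTask", "config.doBar = False")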

559 

560 def _addConfigImpl(self, label: str, newConfig: pipelineIR.ConfigIR) -> None: 

561 if label == "parameters": 

562 if newConfig.rest.keys() - self._pipelineIR.parameters.mapping.keys(): 

563 raise ValueError("Cannot override parameters that are not defined in pipeline") 

564 self._pipelineIR.parameters.mapping.update(newConfig.rest) 

565 if newConfig.file: 

566 raise ValueError("Setting parameters section with config file is not supported") 

567 if newConfig.python: 

568 raise ValueError("Setting parameters section using python block in unsupported") 

569 return 

570 if label not in self._pipelineIR.tasks: 

571 raise LookupError(f"There are no tasks labeled '{label}' in the pipeline") 

572 self._pipelineIR.tasks[label].add_or_update_config(newConfig) 

573 

574 def toFile(self, filename: str) -> None: 

575 self._pipelineIR.to_file(filename) 

576 

577 def write_to_uri(self, uri: Union[str, ButlerURI]) -> None: 

578 # tasks need to be sorted on each call because someone might have added 

579 # or removed a task, and caching does not seem worth the small 

580 # overhead 

581 labels = [td.label for td in self._toExpandedPipelineImpl(checkContracts=False)] 

582 self._pipelineIR.reorder_tasks(labels) 

583 self._pipelineIR.write_to_uri(uri) 

584 

585 def toExpandedPipeline(self) -> Generator[TaskDef, None, None]: 

586 """Returns a generator of TaskDefs which can be used to create quantum 

587 graphs. 

588 

589 Returns 

590 ------- 

591 generator : generator of `TaskDef` 

592 The generator returned will be the sorted iterator of tasks which 

593 are to be used in constructing a quantum graph. 

594 

595 Raises 

596 ------ 

597 NotImplementedError 

598 If a dataId is supplied in a config block. This is in place for 

599 future use 

600 """ 

601 yield from self._toExpandedPipelineImpl() 

602 

603 def _toExpandedPipelineImpl(self, checkContracts=True) -> Iterable[TaskDef]: 

604 taskDefs = [] 

605 for label in self._pipelineIR.tasks: 

606 taskDefs.append(self._buildTaskDef(label)) 

607 

608 # let's evaluate the contracts 

609 if self._pipelineIR.contracts is not None: 

610 label_to_config = {x.label: x.config for x in taskDefs} 

611 for contract in self._pipelineIR.contracts: 

612 # execute this in its own line so it can raise a good error 

613 # message if there were problems with the eval 

614 success = eval(contract.contract, None, label_to_config) 

615 if not success: 

616 extra_info = f": {contract.msg}" if contract.msg is not None else "" 

617 raise pipelineIR.ContractError(f"Contract(s) '{contract.contract}' were not " 

618 f"satisfied{extra_info}") 

619 

620 taskDefs = sorted(taskDefs, key=lambda x: x.label) 

621 yield from pipeTools.orderPipeline(taskDefs) 

622 

623 def _buildTaskDef(self, label: str) -> TaskDef: 

624 if (taskIR := self._pipelineIR.tasks.get(label)) is None: 

625 raise NameError(f"Label {label} does not appear in this pipeline") 

626 taskClass = doImport(taskIR.klass) 

627 taskName = taskClass.__qualname__ 

628 config = taskClass.ConfigClass() 

629 overrides = ConfigOverrides() 

630 if self._pipelineIR.instrument is not None: 

631 overrides.addInstrumentOverride(self._pipelineIR.instrument, taskClass._DefaultName) 

632 if taskIR.config is not None: 

633 for configIR in (configIr.formatted(self._pipelineIR.parameters) 

634 for configIr in taskIR.config): 

635 if configIR.dataId is not None: 

636 raise NotImplementedError("Specializing a config on a partial data id is not yet " 

637 "supported in Pipeline definition") 

638 # only apply override if it applies to everything 

639 if configIR.dataId is None: 

640 if configIR.file: 

641 for configFile in configIR.file: 

642 overrides.addFileOverride(os.path.expandvars(configFile)) 

643 if configIR.python is not None: 

644 overrides.addPythonOverride(configIR.python) 

645 for key, value in configIR.rest.items(): 

646 overrides.addValueOverride(key, value) 

647 overrides.applyTo(config) 

648 return TaskDef(taskName=taskName, config=config, taskClass=taskClass, label=label) 

649 

650 def __iter__(self) -> Generator[TaskDef, None, None]: 

651 return self.toExpandedPipeline() 

652 

653 def __getitem__(self, item: str) -> TaskDef: 

654 return self._buildTaskDef(item) 

655 

656 def __len__(self): 

657 return len(self._pipelineIR.tasks) 

658 

659 def __eq__(self, other: object): 

660 if not isinstance(other, Pipeline): 

661 return False 

662 return self._pipelineIR == other._pipelineIR 
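    # Hedged sketch of the container-like behavior defined above; the file
    # path and the label "isr" are illustrative.
    #
    #     pipeline = Pipeline.from_uri("example.yaml")
    #     len(pipeline)                   # number of task labels
    #     pipeline["isr"]                 # TaskDef for the "isr" label
    #     [td.label for td in pipeline]   # dependency-ordered labels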

663 

664 

665@dataclass(frozen=True) 

666class TaskDatasetTypes: 

667 """An immutable struct that extracts and classifies the dataset types used 

668 by a `PipelineTask` 

669 """ 

670 

671 initInputs: NamedValueSet[DatasetType] 

672 """Dataset types that are needed as inputs in order to construct this Task. 

673 

674 Task-level `initInputs` may be classified as either 

675 `~PipelineDatasetTypes.initInputs` or 

676 `~PipelineDatasetTypes.initIntermediates` at the Pipeline level. 

677 """ 

678 

679 initOutputs: NamedValueSet[DatasetType] 

680 """Dataset types that may be written after constructing this Task. 

681 

682 Task-level `initOutputs` may be classified as either 

683 `~PipelineDatasetTypes.initOutputs` or 

684 `~PipelineDatasetTypes.initIntermediates` at the Pipeline level. 

685 """ 

686 

687 inputs: NamedValueSet[DatasetType] 

688 """Dataset types that are regular inputs to this Task. 

689 

690 If an input dataset needed for a Quantum cannot be found in the input 

691 collection(s) or produced by another Task in the Pipeline, that Quantum 

692 (and all dependent Quanta) will not be produced. 

693 

694 Task-level `inputs` may be classified as either 

695 `~PipelineDatasetTypes.inputs` or `~PipelineDatasetTypes.intermediates` 

696 at the Pipeline level. 

697 """ 

698 

699 prerequisites: NamedValueSet[DatasetType] 

700 """Dataset types that are prerequisite inputs to this Task. 

701 

702 Prerequisite inputs must exist in the input collection(s) before the 

703 pipeline is run, but do not constrain the graph - if a prerequisite is 

704 missing for a Quantum, `PrerequisiteMissingError` is raised. 

705 

706 Prerequisite inputs are not resolved until the second stage of 

707 QuantumGraph generation. 

708 """ 

709 

710 outputs: NamedValueSet[DatasetType] 

711 """Dataset types that are produced by this Task. 

712 

713 Task-level `outputs` may be classified as either 

714 `~PipelineDatasetTypes.outputs` or `~PipelineDatasetTypes.intermediates` 

715 at the Pipeline level. 

716 """ 

717 

718 @classmethod 

719 def fromTaskDef( 

720 cls, 

721 taskDef: TaskDef, 

722 *, 

723 registry: Registry, 

724 include_configs: bool = True, 

725 storage_class_mapping: Optional[Mapping[str, str]] = None 

726 ) -> TaskDatasetTypes: 

727 """Extract and classify the dataset types from a single `PipelineTask`. 

728 

729 Parameters 

730 ---------- 

731 taskDef: `TaskDef` 

732 An instance of a `TaskDef` class for a particular `PipelineTask`. 

733 registry: `Registry` 

734 Registry used to construct normalized `DatasetType` objects and 

735 retrieve those that are incomplete. 

736 include_configs : `bool`, optional 

737 If `True` (default) include config dataset types as 

738 ``initOutputs``. 

739 storage_class_mapping : `Mapping` of `str` to `StorageClass`, optional 

740 If a taskdef contains a component dataset type that is unknown 

741 to the registry, its parent StorageClass will be looked up in this 

742 mapping if it is supplied. If the mapping does not contain the 

743 composite dataset type, or the mapping is not supplied, an exception 

744 will be raised. 

745 

746 Returns 

747 ------- 

748 types: `TaskDatasetTypes` 

749 The dataset types used by this task. 

750 

751 Raises 

752 ------ 

753 ValueError 

754 Raised if dataset type connection definition differs from 

755 registry definition. 

756 LookupError 

757 Raised if component parent StorageClass could not be determined 

758 and storage_class_mapping does not contain the composite type, or 

759 is set to None. 

760 """ 

761 def makeDatasetTypesSet(connectionType: str, freeze: bool = True) -> NamedValueSet[DatasetType]: 

762 """Constructs a set of true `DatasetType` objects 

763 

764 Parameters 

765 ---------- 

766 connectionType : `str` 

767 Name of the connection type to produce a set for, corresponds 

768 to an attribute of type `list` on the connection class instance 

769 freeze : `bool`, optional 

770 If `True`, call `NamedValueSet.freeze` on the object returned. 

771 

772 Returns 

773 ------- 

774 datasetTypes : `NamedValueSet` 

775 A set of all datasetTypes which correspond to the input 

776 connection type specified in the connection class of this 

777 `PipelineTask` 

778 

779 Raises 

780 ------ 

781 ValueError 

782 Raised if dataset type connection definition differs from 

783 registry definition. 

784 LookupError 

785 Raised if component parent StorageClass could not be determined 

786 and storage_class_mapping does not contain the composite type, 

787 or is set to None. 

788 

789 Notes 

790 ----- 

791 This function is a closure over the variables ``registry``, 

792 ``taskDef``, and ``storage_class_mapping``. 

793 """ 

794 datasetTypes = NamedValueSet() 

795 for c in iterConnections(taskDef.connections, connectionType): 

796 dimensions = set(getattr(c, 'dimensions', set())) 

797 if "skypix" in dimensions: 

798 try: 

799 datasetType = registry.getDatasetType(c.name) 

800 except LookupError as err: 

801 raise LookupError( 

802 f"DatasetType '{c.name}' referenced by " 

803 f"{type(taskDef.connections).__name__} uses 'skypix' as a dimension " 

804 f"placeholder, but does not already exist in the registry. " 

805 f"Note that reference catalog names are now used as the dataset " 

806 f"type name instead of 'ref_cat'." 

807 ) from err 

808 rest1 = set(registry.dimensions.extract(dimensions - set(["skypix"])).names) 

809 rest2 = set(dim.name for dim in datasetType.dimensions 

810 if not isinstance(dim, SkyPixDimension)) 

811 if rest1 != rest2: 

812 raise ValueError(f"Non-skypix dimensions for dataset type {c.name} declared in " 

813 f"connections ({rest1}) are inconsistent with those in " 

814 f"registry's version of this dataset ({rest2}).") 

815 else: 

816 # Component dataset types are not explicitly in the 

817 # registry. This complicates consistency checks with 

818 # registry and requires we work out the composite storage 

819 # class. 

820 registryDatasetType = None 

821 try: 

822 registryDatasetType = registry.getDatasetType(c.name) 

823 except KeyError: 

824 compositeName, componentName = DatasetType.splitDatasetTypeName(c.name) 

825 if componentName: 

826 if storage_class_mapping is None or compositeName not in storage_class_mapping: 

827 raise LookupError("Component parent class cannot be determined, and " 

828 "composite name was not in storage class mapping, or no " 

829 "storage_class_mapping was supplied") 

830 else: 

831 parentStorageClass = storage_class_mapping[compositeName] 

832 else: 

833 parentStorageClass = None 

834 datasetType = c.makeDatasetType( 

835 registry.dimensions, 

836 parentStorageClass=parentStorageClass 

837 ) 

838 registryDatasetType = datasetType 

839 else: 

840 datasetType = c.makeDatasetType( 

841 registry.dimensions, 

842 parentStorageClass=registryDatasetType.parentStorageClass 

843 ) 

844 

845 if registryDatasetType and datasetType != registryDatasetType: 

846 try: 

847 # Explicitly check for storage class just to make 

848 # a more specific message. 

849 _ = datasetType.storageClass 

850 except KeyError: 

851 raise ValueError("Storage class does not exist for supplied dataset type " 

852 f"{datasetType} for {taskDef.label}.") from None 

853 raise ValueError(f"Supplied dataset type ({datasetType}) inconsistent with " 

854 f"registry definition ({registryDatasetType}) " 

855 f"for {taskDef.label}.") 

856 datasetTypes.add(datasetType) 

857 if freeze: 

858 datasetTypes.freeze() 

859 return datasetTypes 

860 

861 # optionally add initOutput dataset for config 

862 initOutputs = makeDatasetTypesSet("initOutputs", freeze=False) 

863 if include_configs: 

864 initOutputs.add( 

865 DatasetType( 

866 taskDef.configDatasetName, 

867 registry.dimensions.empty, 

868 storageClass="Config", 

869 ) 

870 ) 

871 initOutputs.freeze() 

872 

873 # optionally add output dataset for metadata 

874 outputs = makeDatasetTypesSet("outputs", freeze=False) 

875 if taskDef.metadataDatasetName is not None: 

876 # Metadata is supposed to be of the PropertySet type; its 

877 # dimensions correspond to a task quantum 

878 dimensions = registry.dimensions.extract(taskDef.connections.dimensions) 

879 if _TASK_METADATA_TYPE is TaskMetadata: 

880 storageClass = "TaskMetadata" 

881 else: 

882 storageClass = "PropertySet" 

883 outputs |= {DatasetType(taskDef.metadataDatasetName, dimensions, storageClass)} 

884 if taskDef.logOutputDatasetName is not None: 

885 # Log output dimensions correspond to a task quantum. 

886 dimensions = registry.dimensions.extract(taskDef.connections.dimensions) 

887 outputs |= {DatasetType(taskDef.logOutputDatasetName, dimensions, "ButlerLogRecords")} 

888 

889 outputs.freeze() 

890 

891 return cls( 

892 initInputs=makeDatasetTypesSet("initInputs"), 

893 initOutputs=initOutputs, 

894 inputs=makeDatasetTypesSet("inputs"), 

895 prerequisites=makeDatasetTypesSet("prerequisiteInputs"), 

896 outputs=outputs, 

897 ) 
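    # Minimal usage sketch of ``fromTaskDef`` above (assumes a configured data
    # butler; ``butler`` and the pipeline file are hypothetical):
    #
    #     taskDef = next(iter(Pipeline.from_uri("example.yaml")))
    #     types = TaskDatasetTypes.fromTaskDef(taskDef, registry=butler.registry)
    #     print(types.inputs.names, types.outputs.names)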

898 

899 

900@dataclass(frozen=True) 

901class PipelineDatasetTypes: 

902 """An immutable struct that classifies the dataset types used in a 

903 `Pipeline`. 

904 """ 

905 

906 packagesDatasetName: ClassVar[str] = "packages" 

907 """Name of a dataset type used to save package versions. 

908 """ 

909 

910 initInputs: NamedValueSet[DatasetType] 

911 """Dataset types that are needed as inputs in order to construct the Tasks 

912 in this Pipeline. 

913 

914 This does not include dataset types that are produced when constructing 

915 other Tasks in the Pipeline (these are classified as `initIntermediates`). 

916 """ 

917 

918 initOutputs: NamedValueSet[DatasetType] 

919 """Dataset types that may be written after constructing the Tasks in this 

920 Pipeline. 

921 

922 This does not include dataset types that are also used as inputs when 

923 constructing other Tasks in the Pipeline (these are classified as 

924 `initIntermediates`). 

925 """ 

926 

927 initIntermediates: NamedValueSet[DatasetType] 

928 """Dataset types that are both used when constructing one or more Tasks 

929 in the Pipeline and produced as a side-effect of constructing another 

930 Task in the Pipeline. 

931 """ 

932 

933 inputs: NamedValueSet[DatasetType] 

934 """Dataset types that are regular inputs for the full pipeline. 

935 

936 If an input dataset needed for a Quantum cannot be found in the input 

937 collection(s), that Quantum (and all dependent Quanta) will not be 

938 produced. 

939 """ 

940 

941 prerequisites: NamedValueSet[DatasetType] 

942 """Dataset types that are prerequisite inputs for the full Pipeline. 

943 

944 Prerequisite inputs must exist in the input collection(s) before the 

945 pipeline is run, but do not constrain the graph - if a prerequisite is 

946 missing for a Quantum, `PrerequisiteMissingError` is raised. 

947 

948 Prerequisite inputs are not resolved until the second stage of 

949 QuantumGraph generation. 

950 """ 

951 

952 intermediates: NamedValueSet[DatasetType] 

953 """Dataset types that are output by one Task in the Pipeline and consumed 

954 as inputs by one or more other Tasks in the Pipeline. 

955 """ 

956 

957 outputs: NamedValueSet[DatasetType] 

958 """Dataset types that are output by a Task in the Pipeline and not consumed 

959 by any other Task in the Pipeline. 

960 """ 

961 

962 byTask: Mapping[str, TaskDatasetTypes] 

963 """Per-Task dataset types, keyed by label in the `Pipeline`. 

964 

965 This is guaranteed to be zip-iterable with the `Pipeline` itself (assuming 

966 neither has been modified since the dataset types were extracted, of 

967 course). 

968 """ 

969 

970 @classmethod 

971 def fromPipeline( 

972 cls, 

973 pipeline: Union[Pipeline, Iterable[TaskDef]], 

974 *, 

975 registry: Registry, 

976 include_configs: bool = True, 

977 include_packages: bool = True, 

978 ) -> PipelineDatasetTypes: 

979 """Extract and classify the dataset types from all tasks in a 

980 `Pipeline`. 

981 

982 Parameters 

983 ---------- 

984 pipeline: `Pipeline` or `Iterable` [ `TaskDef` ] 

985 A collection of tasks that can be run together. 

986 registry: `Registry` 

987 Registry used to construct normalized `DatasetType` objects and 

988 retrieve those that are incomplete. 

989 include_configs : `bool`, optional 

990 If `True` (default) include config dataset types as 

991 ``initOutputs``. 

992 include_packages : `bool`, optional 

993 If `True` (default) include the dataset type for software package 

994 versions in ``initOutputs``. 

995 

996 Returns 

997 ------- 

998 types: `PipelineDatasetTypes` 

999 The dataset types used by this `Pipeline`. 

1000 

1001 Raises 

1002 ------ 

1003 ValueError 

1004 Raised if Tasks are inconsistent about which datasets are marked 

1005 prerequisite. This indicates that the Tasks cannot be run as part 

1006 of the same `Pipeline`. 

1007 """ 

1008 allInputs = NamedValueSet() 

1009 allOutputs = NamedValueSet() 

1010 allInitInputs = NamedValueSet() 

1011 allInitOutputs = NamedValueSet() 

1012 prerequisites = NamedValueSet() 

1013 byTask = dict() 

1014 if include_packages: 

1015 allInitOutputs.add( 

1016 DatasetType( 

1017 cls.packagesDatasetName, 

1018 registry.dimensions.empty, 

1019 storageClass="Packages", 

1020 ) 

1021 ) 

1022 # create a list of TaskDefs in case the input is a generator 

1023 pipeline = list(pipeline) 

1024 

1025 # collect all the output dataset types 

1026 typeStorageclassMap: Dict[str, str] = {} 

1027 for taskDef in pipeline: 

1028 for outConnection in iterConnections(taskDef.connections, 'outputs'): 

1029 typeStorageclassMap[outConnection.name] = outConnection.storageClass 

1030 

1031 for taskDef in pipeline: 

1032 thisTask = TaskDatasetTypes.fromTaskDef( 

1033 taskDef, 

1034 registry=registry, 

1035 include_configs=include_configs, 

1036 storage_class_mapping=typeStorageclassMap 

1037 ) 

1038 allInitInputs |= thisTask.initInputs 

1039 allInitOutputs |= thisTask.initOutputs 

1040 allInputs |= thisTask.inputs 

1041 prerequisites |= thisTask.prerequisites 

1042 allOutputs |= thisTask.outputs 

1043 byTask[taskDef.label] = thisTask 

1044 if not prerequisites.isdisjoint(allInputs): 

1045 raise ValueError("{} marked as both prerequisites and regular inputs".format( 

1046 {dt.name for dt in allInputs & prerequisites} 

1047 )) 

1048 if not prerequisites.isdisjoint(allOutputs): 

1049 raise ValueError("{} marked as both prerequisites and outputs".format( 

1050 {dt.name for dt in allOutputs & prerequisites} 

1051 )) 

1052 # Make sure that components which are marked as inputs get treated as 

1053 # intermediates if there is an output which produces the composite 

1054 # containing the component 

1055 intermediateComponents = NamedValueSet() 

1056 intermediateComposites = NamedValueSet() 

1057 outputNameMapping = {dsType.name: dsType for dsType in allOutputs} 

1058 for dsType in allInputs: 

1059 # get the name of a possible component 

1060 name, component = dsType.nameAndComponent() 

1061 # if there is a component name, that means this is a component 

1062 # DatasetType, if there is an output which produces the parent of 

1063 # this component, treat this input as an intermediate 

1064 if component is not None: 

1065 # This needs to be in this if block, because someone might have 

1066 # a composite that is a pure input from existing data 

1067 if name in outputNameMapping: 

1068 intermediateComponents.add(dsType) 

1069 intermediateComposites.add(outputNameMapping[name]) 

1070 

1071 def checkConsistency(a: NamedValueSet, b: NamedValueSet): 

1072 common = a.names & b.names 

1073 for name in common: 

1074 if a[name] != b[name]: 

1075 raise ValueError(f"Conflicting definitions for dataset type: {a[name]} != {b[name]}.") 

1076 

1077 checkConsistency(allInitInputs, allInitOutputs) 

1078 checkConsistency(allInputs, allOutputs) 

1079 checkConsistency(allInputs, intermediateComposites) 

1080 checkConsistency(allOutputs, intermediateComposites) 

1081 

1082 def frozen(s: NamedValueSet) -> NamedValueSet: 

1083 s.freeze() 

1084 return s 

1085 

1086 return cls( 

1087 initInputs=frozen(allInitInputs - allInitOutputs), 

1088 initIntermediates=frozen(allInitInputs & allInitOutputs), 

1089 initOutputs=frozen(allInitOutputs - allInitInputs), 

1090 inputs=frozen(allInputs - allOutputs - intermediateComponents), 

1091 intermediates=frozen(allInputs & allOutputs | intermediateComponents), 

1092 outputs=frozen(allOutputs - allInputs - intermediateComposites), 

1093 prerequisites=frozen(prerequisites), 

1094 byTask=MappingProxyType(byTask), # MappingProxyType -> frozen view of dict for immutability 

1095 ) 
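    # Minimal usage sketch of ``fromPipeline`` above (``butler`` and the
    # pipeline file are hypothetical):
    #
    #     pipeline = Pipeline.from_uri("example.yaml")
    #     dsTypes = PipelineDatasetTypes.fromPipeline(pipeline, registry=butler.registry)
    #     print(dsTypes.inputs.names)         # overall inputs for the pipeline
    #     print(dsTypes.intermediates.names)  # produced and consumed internally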

1096 

1097 @classmethod 

1098 def initOutputNames(cls, pipeline: Union[Pipeline, Iterable[TaskDef]], *, 

1099 include_configs: bool = True, include_packages: bool = True) -> Iterator[str]: 

1100 """Return the names of dataset types ot task initOutputs, Configs, 

1101 and package versions for a pipeline. 

1102 

1103 Parameters 

1104 ---------- 

1105 pipeline: `Pipeline` or `Iterable` [ `TaskDef` ] 

1106 A `Pipeline` instance or collection of `TaskDef` instances. 

1107 include_configs : `bool`, optional 

1108 If `True` (default) include config dataset types. 

1109 include_packages : `bool`, optional 

1110 If `True` (default) include the dataset type for package versions. 

1111 

1112 Yields 

1113 ------ 

1114 datasetTypeName : `str` 

1115 Name of the dataset type. 

1116 """ 

1117 if include_packages: 

1118 # Package versions dataset type 

1119 yield cls.packagesDatasetName 

1120 

1121 if isinstance(pipeline, Pipeline): 

1122 pipeline = pipeline.toExpandedPipeline() 

1123 

1124 for taskDef in pipeline: 

1125 

1126 # all task InitOutputs 

1127 for name in taskDef.connections.initOutputs: 

1128 attribute = getattr(taskDef.connections, name) 

1129 yield attribute.name 

1130 

1131 # config dataset name 

1132 if include_configs: 

1133 yield taskDef.configDatasetName
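    # Hedged sketch of listing init-output dataset type names for a pipeline
    # (the YAML path is hypothetical):
    #
    #     pipeline = Pipeline.from_uri("example.yaml")
    #     for name in PipelineDatasetTypes.initOutputNames(pipeline):
    #         print(name)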