Coverage for python/lsst/pipe/base/pipeline.py: 18%


1# This file is part of pipe_base. 

2# 

3# Developed for the LSST Data Management System. 

4# This product includes software developed by the LSST Project 

5# (http://www.lsst.org). 

6# See the COPYRIGHT file at the top-level directory of this distribution 

7# for details of code ownership. 

8# 

9# This program is free software: you can redistribute it and/or modify 

10# it under the terms of the GNU General Public License as published by 

11# the Free Software Foundation, either version 3 of the License, or 

12# (at your option) any later version. 

13# 

14# This program is distributed in the hope that it will be useful, 

15# but WITHOUT ANY WARRANTY; without even the implied warranty of 

16# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the 

17# GNU General Public License for more details. 

18# 

19# You should have received a copy of the GNU General Public License 

20# along with this program. If not, see <http://www.gnu.org/licenses/>. 

21from __future__ import annotations 

22 

23"""Module defining Pipeline class and related methods. 

24""" 

25 

26__all__ = ["Pipeline", "TaskDef", "TaskDatasetTypes", "PipelineDatasetTypes", "LabelSpecifier"] 

27 

28import copy 

29import logging 

30import os 

31import re 

32import urllib.parse 

33import warnings 

34 

35# ------------------------------- 

36# Imports of standard modules -- 

37# ------------------------------- 

38from dataclasses import dataclass 

39from types import MappingProxyType 

40from typing import ( 

41 TYPE_CHECKING, 

42 ClassVar, 

43 Dict, 

44 Generator, 

45 Iterable, 

46 Iterator, 

47 Mapping, 

48 Optional, 

49 Set, 

50 Tuple, 

51 Union, 

52) 

53 

54# ----------------------------- 

55# Imports for other modules -- 

56from lsst.daf.butler import DatasetType, NamedValueSet, Registry, SkyPixDimension 

57from lsst.resources import ResourcePath, ResourcePathExpression 

58from lsst.utils import doImport 

59from lsst.utils.introspection import get_full_type_name 

60 

61from . import pipelineIR, pipeTools 

62from ._task_metadata import TaskMetadata 

63from .configOverrides import ConfigOverrides 

64from .connections import iterConnections 

65from .pipelineTask import PipelineTask 

66from .task import _TASK_METADATA_TYPE 

67 

68if TYPE_CHECKING: # Imports needed only for type annotations; may be circular. 

69 from lsst.obs.base import Instrument 

70 

71# ---------------------------------- 

72# Local non-exported definitions -- 

73# ---------------------------------- 

74 

75_LOG = logging.getLogger(__name__) 

76 

77# ------------------------ 

78# Exported definitions -- 

79# ------------------------ 

80 

81 

82@dataclass 

83class LabelSpecifier: 

84 """A structure to specify a subset of labels to load 

85 

86 This structure may contain a set of labels to be used in subsetting a 

87 pipeline, or a beginning and end point. Beginning or end may be empty, 

88 in which case the range will be a half open interval. Unlike python 

89 iteration bounds, end bounds are *INCLUDED*. Note that range based 

90 selection is not well defined for pipelines that are not linear in nature, 

91 and correct behavior is not guaranteed, or may vary from run to run. 

92 """ 

93 

94 labels: Optional[Set[str]] = None 

95 begin: Optional[str] = None 

96 end: Optional[str] = None 

97 

98 def __post_init__(self): 

99 if self.labels is not None and (self.begin or self.end): 

100 raise ValueError( 

101 "This struct can only be initialized with a labels set or a begin (and/or) end specifier" 

102 ) 

103 

104 
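# Usage sketch (editorial illustration, not part of pipe_base; the label
# names "isr" and "calibrate" are hypothetical):
#
#     only_these = LabelSpecifier(labels={"isr", "calibrate"})
#     a_range = LabelSpecifier(begin="isr", end="calibrate")  # "calibrate" is included
#     LabelSpecifier(labels={"isr"}, end="calibrate")  # raises ValueError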

105class TaskDef: 

106 """TaskDef is a collection of information about task needed by Pipeline. 

107 

108 The information includes task name, configuration object and optional 

109 task class. This class is just a collection of attributes and it exposes 

110 all of them so that attributes could potentially be modified in place 

111 (e.g. if configuration needs extra overrides). 

112 

113 Attributes 

114 ---------- 

115 taskName : `str`, optional 

116 `PipelineTask` class name; currently it is not specified whether this 

117 is a fully-qualified or partial name (e.g. ``module.TaskClass``). 

118 The framework should be prepared to handle all cases. If not provided, 

119 ``taskClass`` must be, and ``taskClass.__name__`` is used. 

120 config : `lsst.pex.config.Config`, optional 

121 Instance of the configuration class corresponding to this task class, 

122 usually with all overrides applied. This config will be frozen. If 

123 not provided, ``taskClass`` must be provided and 

124 ``taskClass.ConfigClass()`` will be used. 

125 taskClass : `type`, optional 

126 `PipelineTask` class object, can be ``None``. If ``None`` then 

127 framework will have to locate and load class. 

128 label : `str`, optional 

129 Task label, usually a short string unique in a pipeline. If not 

130 provided, ``taskClass`` must be, and ``taskClass._DefaultName`` will 

131 be used. 

132 """ 

133 

134 def __init__(self, taskName=None, config=None, taskClass=None, label=None): 

135 if taskName is None: 

136 if taskClass is None: 

137 raise ValueError("At least one of `taskName` and `taskClass` must be provided.") 

138 taskName = taskClass.__name__ 

139 if config is None: 

140 if taskClass is None: 

141 raise ValueError("`taskClass` must be provided if `config` is not.") 

142 config = taskClass.ConfigClass() 

143 if label is None: 

144 if taskClass is None: 

145 raise ValueError("`taskClass` must be provided if `label` is not.") 

146 label = taskClass._DefaultName 

147 self.taskName = taskName 

148 try: 

149 config.validate() 

150 except Exception: 

151 _LOG.error("Configuration validation failed for task %s (%s)", label, taskName) 

152 raise 

153 config.freeze() 

154 self.config = config 

155 self.taskClass = taskClass 

156 self.label = label 

157 self.connections = config.connections.ConnectionsClass(config=config) 

158 

159 @property 

160 def configDatasetName(self) -> str: 

161 """Name of a dataset type for configuration of this task (`str`)""" 

162 return self.label + "_config" 

163 

164 @property 

165 def metadataDatasetName(self) -> Optional[str]: 

166 """Name of a dataset type for metadata of this task, `None` if 

167 metadata is not to be saved (`str`) 

168 """ 

169 if self.config.saveMetadata: 

170 return self.label + "_metadata" 

171 else: 

172 return None 

173 

174 @property 

175 def logOutputDatasetName(self) -> Optional[str]: 

176 """Name of a dataset type for log output from this task, `None` if 

177 logs are not to be saved (`str`) 

178 """ 

179 if self.config.saveLogOutput: 

180 return self.label + "_log" 

181 else: 

182 return None 

183 

184 def __str__(self): 

185 rep = "TaskDef(" + self.taskName 

186 if self.label: 

187 rep += ", label=" + self.label 

188 rep += ")" 

189 return rep 

190 

191 def __eq__(self, other: object) -> bool: 

192 if not isinstance(other, TaskDef): 

193 return False 

194 # This does not consider equality of configs when determining equality 

195 # as config equality is a difficult thing to define. Should be updated 

196 # after DM-27847 

197 return self.taskClass == other.taskClass and self.label == other.label 

198 

199 def __hash__(self): 

200 return hash((self.taskClass, self.label)) 

201 

202 
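# Usage sketch (editorial illustration; ``MyTask`` stands in for a concrete
# `PipelineTask` subclass and is hypothetical). Only ``taskClass`` is
# required; the name, config, and label are derived from the class:
#
#     taskdef = TaskDef(taskClass=MyTask)
#     taskdef.label               # MyTask._DefaultName
#     taskdef.configDatasetName   # f"{taskdef.label}_config"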

203class Pipeline: 

204 """A `Pipeline` is a representation of a series of tasks to run, and the 

205 configuration for those tasks. 

206 

207 Parameters 

208 ---------- 

209 description : `str` 

210 A description of what this pipeline does. 

211 """ 

212 

213 def __init__(self, description: str): 

214 pipeline_dict = {"description": description, "tasks": {}} 

215 self._pipelineIR = pipelineIR.PipelineIR(pipeline_dict) 

216 

217 @classmethod 

218 def fromFile(cls, filename: str) -> Pipeline: 

219 """Load a pipeline defined in a pipeline yaml file. 

220 

221 Parameters 

222 ---------- 

223 filename: `str` 

224 A path that points to a pipeline defined in yaml format. This 

225 filename may also supply additional labels to be used in 

226 subsetting the loaded Pipeline. These labels are separated from 

227 the path by a \\#, and may be specified as a comma separated 

228 list, or a range denoted as beginning..end. Beginning or end may 

229 be empty, in which case the range will be a half open interval. 

230 Unlike python iteration bounds, end bounds are *INCLUDED*. Note 

231 that range based selection is not well defined for pipelines that 

232 are not linear in nature, and correct behavior is not guaranteed, 

233 or may vary from run to run. 

234 

235 Returns 

236 ------- 

237 pipeline: `Pipeline` 

238 The pipeline loaded from the specified location with appropriate 

239 (if any) subsetting. 

240 

241 Notes 

242 ----- 

243 This method attempts to prune any contracts that contain labels which 

244 are not in the declared subset of labels. This pruning is done using 

245 string-based matching due to the nature of contracts and may prune more 

246 than it should. 

247 """ 

248 return cls.from_uri(filename) 

249 

250 @classmethod 

251 def from_uri(cls, uri: ResourcePathExpression) -> Pipeline: 

252 """Load a pipeline defined in a pipeline yaml file at a location 

253 specified by a URI. 

254 

255 Parameters 

256 ---------- 

257 uri: convertible to `ResourcePath` 

258 If a string is supplied this should be a URI path that points to a 

259 pipeline defined in yaml format, either as a direct path to the 

260 yaml file, or as a directory containing a "pipeline.yaml" file (the 

261 form used by `write_to_uri` with ``expand=True``). This uri may 

262 also supply additional labels to be used in subsetting the loaded 

263 Pipeline. These labels are separated from the path by a \\#, and 

264 may be specified as a comma separated list, or a range denoted as 

265 beginning..end. Beginning or end may be empty, in which case the 

266 range will be a half open interval. Unlike python iteration bounds, 

267 end bounds are *INCLUDED*. Note that range based selection is not 

268 well defined for pipelines that are not linear in nature, and 

269 correct behavior is not guaranteed, or may vary from run to run. 

270 The same specifiers can be used with a `ResourcePath` object, by 

271 being the sole contents of its ``fragment`` attribute. 

272 

273 Returns 

274 ------- 

275 pipeline: `Pipeline` 

276 The pipeline loaded from the specified location with appropriate 

277 (if any) subsetting. 

278 

279 Notes 

280 ----- 

281 This method attempts to prune any contracts that contain labels which 

282 are not in the declared subset of labels. This pruning is done using 

283 string-based matching due to the nature of contracts and may prune more 

284 than it should. 

285 """ 

286 # Split up the uri and any labels that were supplied 

287 uri, label_specifier = cls._parse_file_specifier(uri) 

288 pipeline: Pipeline = cls.fromIR(pipelineIR.PipelineIR.from_uri(uri)) 

289 

290 # If there are labels supplied, only keep those 

291 if label_specifier is not None: 

292 pipeline = pipeline.subsetFromLabels(label_specifier) 

293 return pipeline 

294 
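    # Usage sketch (editorial illustration; the path and labels below are
    # hypothetical):
    #
    #     p = Pipeline.from_uri("pipelines/DRP.yaml")                 # full pipeline
    #     p = Pipeline.from_uri("pipelines/DRP.yaml#isr,calibrate")   # explicit labels
    #     p = Pipeline.from_uri("pipelines/DRP.yaml#isr..calibrate")  # inclusive range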

295 def subsetFromLabels(self, labelSpecifier: LabelSpecifier) -> Pipeline: 

296 """Subset a pipeline to contain only labels specified in labelSpecifier 

297 

298 Parameters 

299 ---------- 

300 labelSpecifier : `LabelSpecifier` 

301 Object containing labels that describe how to subset a pipeline. 

302 

303 Returns 

304 ------- 

305 pipeline : `Pipeline` 

306 A new pipeline object that is a subset of the old pipeline 

307 

308 Raises 

309 ------ 

310 ValueError 

311 Raised if there is an issue with specified labels 

312 

313 Notes 

314 ----- 

315 This method attempts to prune any contracts that contain labels which 

316 are not in the declared subset of labels. This pruning is done using 

317 string-based matching due to the nature of contracts and may prune more 

318 than it should. 

319 """ 

320 # Labels supplied as a set 

321 if labelSpecifier.labels: 

322 labelSet = labelSpecifier.labels 

323 # Labels supplied as a range, first create a list of all the labels 

324 # in the pipeline sorted according to task dependency. Then only 

325 # keep labels that lie between the supplied bounds 

326 else: 

327 # Create a copy of the pipeline to use when assessing the label 

328 # ordering. Use a dict for fast searching while preserving order. 

329 # Remove contracts so they do not fail in the expansion step. This 

330 # is needed because a user may only configure the tasks they intend 

331 # to run, which may cause some contracts to fail if they will later 

332 # be dropped 

333 pipeline = copy.deepcopy(self) 

334 pipeline._pipelineIR.contracts = [] 

335 labels = {taskdef.label: True for taskdef in pipeline.toExpandedPipeline()} 

336 

337 # Verify the bounds are in the labels 

338 if labelSpecifier.begin is not None: 

339 if labelSpecifier.begin not in labels: 

340 raise ValueError( 

341 f"Beginning of range subset, {labelSpecifier.begin}, not found in " 

342 "pipeline definition" 

343 ) 

344 if labelSpecifier.end is not None: 

345 if labelSpecifier.end not in labels: 

346 raise ValueError( 

347 f"End of range subset, {labelSpecifier.end}, not found in pipeline definition" 

348 ) 

349 

350 labelSet = set() 

351 for label in labels: 

352 if labelSpecifier.begin is not None: 

353 if label != labelSpecifier.begin: 

354 continue 

355 else: 

356 labelSpecifier.begin = None 

357 labelSet.add(label) 

358 if labelSpecifier.end is not None and label == labelSpecifier.end: 

359 break 

360 return Pipeline.fromIR(self._pipelineIR.subset_from_labels(labelSet)) 

361 
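    # Usage sketch (editorial illustration; labels are hypothetical):
    #
    #     spec = LabelSpecifier(begin="isr", end="calibrate")
    #     sub = pipeline.subsetFromLabels(spec)
    #     [td.label for td in sub]  # labels from "isr" through "calibrate", inclusive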

362 @staticmethod 

363 def _parse_file_specifier(uri: ResourcePathExpression) -> Tuple[ResourcePath, Optional[LabelSpecifier]]: 

364 """Split appart a uri and any possible label subsets""" 

365 if isinstance(uri, str): 

366 # This is to support legacy pipelines during transition 

367 uri, num_replace = re.subn("[:](?!\\/\\/)", "#", uri) 

368 if num_replace: 

369 warnings.warn( 

370 f"The pipeline file {uri} seems to use the legacy : to separate " 

371 "labels, this is deprecated and will be removed after June 2021, please use " 

372 "# instead.", 

373 category=FutureWarning, 

374 ) 

375 if uri.count("#") > 1: 

376 raise ValueError("Only one set of labels is allowed when specifying a pipeline to load") 

377 # Everything else can be converted directly to ResourcePath. 

378 uri = ResourcePath(uri) 

379 label_subset = uri.fragment or None 

380 

381 specifier: Optional[LabelSpecifier] 

382 if label_subset is not None: 

383 label_subset = urllib.parse.unquote(label_subset) 

384 args: Dict[str, Union[Set[str], str, None]] 

385 # labels supplied as a list 

386 if "," in label_subset: 

387 if ".." in label_subset: 

388 raise ValueError( 

389 "Can only specify a list of labels or a rangewhen loading a Pipline not both" 

390 ) 

391 args = {"labels": set(label_subset.split(","))} 

392 # labels supplied as a range 

393 elif ".." in label_subset: 

394 # Try to de-structure the labelSubset, this will fail if more 

395 # than one range is specified 

396 begin, end, *rest = label_subset.split("..") 

397 if rest: 

398 raise ValueError("Only one range can be specified when loading a pipeline") 

399 args = {"begin": begin if begin else None, "end": end if end else None} 

400 # Assume anything else is a single label 

401 else: 

402 args = {"labels": {label_subset}} 

403 

404 specifier = LabelSpecifier(**args) 

405 else: 

406 specifier = None 

407 

408 return uri, specifier 

409 
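    # Behaviour sketch for the fragment parsing above (editorial
    # illustration; paths and labels are hypothetical):
    #
    #     Pipeline._parse_file_specifier("p.yaml")       # (ResourcePath, None)
    #     Pipeline._parse_file_specifier("p.yaml#a,b")   # labels={"a", "b"}
    #     Pipeline._parse_file_specifier("p.yaml#a..b")  # begin="a", end="b"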

410 @classmethod 

411 def fromString(cls, pipeline_string: str) -> Pipeline: 

412 """Create a pipeline from string formatted as a pipeline document. 

413 

414 Parameters 

415 ---------- 

416 pipeline_string : `str` 

417 A string that is formatted like a pipeline document. 

418 

419 Returns 

420 ------- 

421 pipeline: `Pipeline` 

422 """ 

423 pipeline = cls.fromIR(pipelineIR.PipelineIR.from_string(pipeline_string)) 

424 return pipeline 

425 

426 @classmethod 

427 def fromIR(cls, deserialized_pipeline: pipelineIR.PipelineIR) -> Pipeline: 

428 """Create a pipeline from an already created `PipelineIR` object. 

429 

430 Parameters 

431 ---------- 

432 deserialized_pipeline: `PipelineIR` 

433 An already created pipeline intermediate representation object 

434 

435 Returns 

436 ------- 

437 pipeline: `Pipeline` 

438 """ 

439 pipeline = cls.__new__(cls) 

440 pipeline._pipelineIR = deserialized_pipeline 

441 return pipeline 

442 

443 @classmethod 

444 def fromPipeline(cls, pipeline: Pipeline) -> Pipeline: 

445 """Create a new pipeline by copying an already existing `Pipeline`. 

446 

447 Parameters 

448 ---------- 

449 pipeline: `Pipeline` 

450 The existing `Pipeline` object to copy. 

451 

452 Returns 

453 ------- 

454 pipeline: `Pipeline` 

455 """ 

456 return cls.fromIR(copy.deepcopy(pipeline._pipelineIR)) 

457 

458 def __str__(self) -> str: 

459 # Tasks need to be sorted on each call because someone might have added 

460 # or removed a task, and caching does not seem worth it given the small 

461 # overhead of sorting. 

462 labels = [td.label for td in self._toExpandedPipelineImpl(checkContracts=False)] 

463 self._pipelineIR.reorder_tasks(labels) 

464 return str(self._pipelineIR) 

465 

466 def addInstrument(self, instrument: Union[Instrument, str]) -> None: 

467 """Add an instrument to the pipeline, or replace an instrument that is 

468 already defined. 

469 

470 Parameters 

471 ---------- 

472 instrument : `~lsst.obs.base.Instrument` or `str` 

473 Either an instance of a subclass of `~lsst.obs.base.Instrument` or 

474 a string corresponding to a fully qualified 

475 `lsst.obs.base.Instrument` name. 

476 """ 

477 if isinstance(instrument, str): 

478 pass 

479 else: 

480 # TODO: assume that this is a subclass of Instrument, no type 

481 # checking 

482 instrument = get_full_type_name(instrument) 

483 self._pipelineIR.instrument = instrument 

484 

485 def getInstrument(self) -> Instrument: 

486 """Get the instrument from the pipeline. 

487 

488 Returns 

489 ------- 

490 instrument : `~lsst.obs.base.Instrument`, `str`, or `None` 

491 An instance of a subclass of `~lsst.obs.base.Instrument`, a string 

492 corresponding to a fully qualified `lsst.obs.base.Instrument` 

493 name, or `None` if the pipeline does not have an instrument. 

494 """ 

495 return self._pipelineIR.instrument 

496 

497 def addTask(self, task: Union[PipelineTask, str], label: str) -> None: 

498 """Add a new task to the pipeline, or replace a task that is already 

499 associated with the supplied label. 

500 

501 Parameters 

502 ---------- 

503 task: `PipelineTask` or `str` 

504 Either a derived class object of a `PipelineTask` or a string 

505 corresponding to a fully qualified `PipelineTask` name. 

506 label: `str` 

507 A label that is used to identify the `PipelineTask` being added 

508 """ 

509 if isinstance(task, str): 

510 taskName = task 

511 elif issubclass(task, PipelineTask): 

512 taskName = get_full_type_name(task) 

513 else: 

514 raise ValueError( 

515 "task must be either a child class of PipelineTask or a string containing" 

516 " a fully qualified name to one" 

517 ) 

518 if not label: 

519 # In some cases (with a command line-generated pipeline) tasks can 

520 # be defined without a label, which is not acceptable; use the task 

521 # _DefaultName in that case. 

522 if isinstance(task, str): 

523 task = doImport(task) 

524 label = task._DefaultName 

525 self._pipelineIR.tasks[label] = pipelineIR.TaskIR(label, taskName) 

526 

527 def removeTask(self, label: str) -> None: 

528 """Remove a task from the pipeline. 

529 

530 Parameters 

531 ---------- 

532 label : `str` 

533 The label used to identify the task that is to be removed 

534 

535 Raises 

536 ------ 

537 KeyError 

538 If no task with that label exists in the pipeline 

539 

540 """ 

541 self._pipelineIR.tasks.pop(label) 

542 

543 def addConfigOverride(self, label: str, key: str, value: object) -> None: 

544 """Apply single config override. 

545 

546 Parameters 

547 ---------- 

548 label : `str` 

549 Label of the task. 

550 key: `str` 

551 Fully-qualified field name. 

552 value : object 

553 Value to be given to a field. 

554 """ 

555 self._addConfigImpl(label, pipelineIR.ConfigIR(rest={key: value})) 

556 

557 def addConfigFile(self, label: str, filename: str) -> None: 

558 """Add overrides from a specified file. 

559 

560 Parameters 

561 ---------- 

562 label : `str` 

563 The label used to identify the task associated with config to 

564 modify 

565 filename : `str` 

566 Path to the override file. 

567 """ 

568 self._addConfigImpl(label, pipelineIR.ConfigIR(file=[filename])) 

569 

570 def addConfigPython(self, label: str, pythonString: str) -> None: 

571 """Add Overrides by running a snippet of python code against a config. 

572 

573 Parameters 

574 ---------- 

575 label : `str` 

576 The label used to identify the task associated with config to 

577 modify. 

578 pythonString: `str` 

579 A string which is valid python code to be executed. This is done 

580 with config as the only local accessible value. 

581 """ 

582 self._addConfigImpl(label, pipelineIR.ConfigIR(python=pythonString)) 

583 
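    # Usage sketch for building and configuring a pipeline programmatically
    # (editorial illustration; the task name, label, field, and file are
    # hypothetical):
    #
    #     p = Pipeline("an example pipeline")
    #     p.addTask("lsst.example.ExampleTask", "example")
    #     p.addConfigOverride("example", "someField", 42)
    #     p.addConfigFile("example", "exampleOverrides.py")
    #     p.addConfigPython("example", "config.someField = 42")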

584 def _addConfigImpl(self, label: str, newConfig: pipelineIR.ConfigIR) -> None: 

585 if label == "parameters": 

586 if newConfig.rest.keys() - self._pipelineIR.parameters.mapping.keys(): 

587 raise ValueError("Cannot override parameters that are not defined in pipeline") 

588 self._pipelineIR.parameters.mapping.update(newConfig.rest) 

589 if newConfig.file: 

590 raise ValueError("Setting parameters section with config file is not supported") 

591 if newConfig.python: 

592 raise ValueError("Setting parameters section using a python block is not supported") 

593 return 

594 if label not in self._pipelineIR.tasks: 

595 raise LookupError(f"There are no tasks labeled '{label}' in the pipeline") 

596 self._pipelineIR.tasks[label].add_or_update_config(newConfig) 

597 

598 def toFile(self, filename: str) -> None: 

599 self._pipelineIR.to_file(filename) 

600 

601 def write_to_uri(self, uri: ResourcePathExpression) -> None: 

602 """Write the pipeline to a file or directory. 

603 

604 Parameters 

605 ---------- 

606 uri : convertible to `ResourcePath` 

607 URI to write to; may have any scheme with `ResourcePath` write 

608 support or no scheme for a local file/directory. Should have a 

609 ``.yaml`` extension. 

610 """ 

611 labels = [td.label for td in self._toExpandedPipelineImpl(checkContracts=False)] 

612 self._pipelineIR.reorder_tasks(labels) 

613 self._pipelineIR.write_to_uri(uri) 

614 

615 def toExpandedPipeline(self) -> Generator[TaskDef, None, None]: 

616 """Returns a generator of TaskDefs which can be used to create quantum 

617 graphs. 

618 

619 Returns 

620 ------- 

621 generator : generator of `TaskDef` 

622 The generator returned will be the sorted iterator of tasks which 

623 are to be used in constructing a quantum graph. 

624 

625 Raises 

626 ------ 

627 NotImplementedError 

628 If a dataId is supplied in a config block. This is in place for 

629 future use 

630 """ 

631 yield from self._toExpandedPipelineImpl() 

632 

633 def _toExpandedPipelineImpl(self, checkContracts=True) -> Iterable[TaskDef]: 

634 taskDefs = [] 

635 for label in self._pipelineIR.tasks: 

636 taskDefs.append(self._buildTaskDef(label)) 

637 

638 # lets evaluate the contracts 

639 if self._pipelineIR.contracts is not None: 

640 label_to_config = {x.label: x.config for x in taskDefs} 

641 for contract in self._pipelineIR.contracts: 

642 # execute this in its own line so it can raise a good error 

643 # message if there was problems with the eval 

644 success = eval(contract.contract, None, label_to_config) 

645 if not success: 

646 extra_info = f": {contract.msg}" if contract.msg is not None else "" 

647 raise pipelineIR.ContractError( 

648 f"Contract(s) '{contract.contract}' were not satisfied{extra_info}" 

649 ) 

650 

651 taskDefs = sorted(taskDefs, key=lambda x: x.label) 

652 yield from pipeTools.orderPipeline(taskDefs) 

653 
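    # Usage sketch (editorial illustration): iterating a Pipeline yields its
    # TaskDefs in dependency-sorted order, so these two loops are equivalent
    # (``__iter__`` below delegates to ``toExpandedPipeline``):
    #
    #     for taskDef in pipeline.toExpandedPipeline():
    #         print(taskDef.label, taskDef.taskName)
    #     for taskDef in pipeline:
    #         print(taskDef.label, taskDef.taskName)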

654 def _buildTaskDef(self, label: str) -> TaskDef: 

655 if (taskIR := self._pipelineIR.tasks.get(label)) is None: 

656 raise NameError(f"Label {label} does not appear in this pipeline") 

657 taskClass = doImport(taskIR.klass) 

658 taskName = taskClass.__qualname__ 

659 config = taskClass.ConfigClass() 

660 overrides = ConfigOverrides() 

661 if self._pipelineIR.instrument is not None: 

662 overrides.addInstrumentOverride(self._pipelineIR.instrument, taskClass._DefaultName) 

663 if taskIR.config is not None: 

664 for configIR in (configIr.formatted(self._pipelineIR.parameters) for configIr in taskIR.config): 

665 if configIR.dataId is not None: 

666 raise NotImplementedError( 

667 "Specializing a config on a partial data id is not yet " 

668 "supported in Pipeline definition" 

669 ) 

670 # only apply override if it applies to everything 

671 if configIR.dataId is None: 

672 if configIR.file: 

673 for configFile in configIR.file: 

674 overrides.addFileOverride(os.path.expandvars(configFile)) 

675 if configIR.python is not None: 

676 overrides.addPythonOverride(configIR.python) 

677 for key, value in configIR.rest.items(): 

678 overrides.addValueOverride(key, value) 

679 overrides.applyTo(config) 

680 return TaskDef(taskName=taskName, config=config, taskClass=taskClass, label=label) 

681 

682 def __iter__(self) -> Generator[TaskDef, None, None]: 

683 return self.toExpandedPipeline() 

684 

685 def __getitem__(self, item: str) -> TaskDef: 

686 return self._buildTaskDef(item) 

687 

688 def __len__(self): 

689 return len(self._pipelineIR.tasks) 

690 

691 def __eq__(self, other: object): 

692 if not isinstance(other, Pipeline): 

693 return False 

694 return self._pipelineIR == other._pipelineIR 

695 

696 

697@dataclass(frozen=True) 

698class TaskDatasetTypes: 

699 """An immutable struct that extracts and classifies the dataset types used 

700 by a `PipelineTask` 

701 """ 

702 

703 initInputs: NamedValueSet[DatasetType] 

704 """Dataset types that are needed as inputs in order to construct this Task. 

705 

706 Task-level `initInputs` may be classified as either 

707 `~PipelineDatasetTypes.initInputs` or 

708 `~PipelineDatasetTypes.initIntermediates` at the Pipeline level. 

709 """ 

710 

711 initOutputs: NamedValueSet[DatasetType] 

712 """Dataset types that may be written after constructing this Task. 

713 

714 Task-level `initOutputs` may be classified as either 

715 `~PipelineDatasetTypes.initOutputs` or 

716 `~PipelineDatasetTypes.initIntermediates` at the Pipeline level. 

717 """ 

718 

719 inputs: NamedValueSet[DatasetType] 

720 """Dataset types that are regular inputs to this Task. 

721 

722 If an input dataset needed for a Quantum cannot be found in the input 

723 collection(s) or produced by another Task in the Pipeline, that Quantum 

724 (and all dependent Quanta) will not be produced. 

725 

726 Task-level `inputs` may be classified as either 

727 `~PipelineDatasetTypes.inputs` or `~PipelineDatasetTypes.intermediates` 

728 at the Pipeline level. 

729 """ 

730 

731 prerequisites: NamedValueSet[DatasetType] 

732 """Dataset types that are prerequisite inputs to this Task. 

733 

734 Prerequisite inputs must exist in the input collection(s) before the 

735 pipeline is run, but do not constrain the graph - if a prerequisite is 

736 missing for a Quantum, `PrerequisiteMissingError` is raised. 

737 

738 Prerequisite inputs are not resolved until the second stage of 

739 QuantumGraph generation. 

740 """ 

741 

742 outputs: NamedValueSet[DatasetType] 

743 """Dataset types that are produced by this Task. 

744 

745 Task-level `outputs` may be classified as either 

746 `~PipelineDatasetTypes.outputs` or `~PipelineDatasetTypes.intermediates` 

747 at the Pipeline level. 

748 """ 

749 

750 @classmethod 

751 def fromTaskDef( 

752 cls, 

753 taskDef: TaskDef, 

754 *, 

755 registry: Registry, 

756 include_configs: bool = True, 

757 storage_class_mapping: Optional[Mapping[str, str]] = None, 

758 ) -> TaskDatasetTypes: 

759 """Extract and classify the dataset types from a single `PipelineTask`. 

760 

761 Parameters 

762 ---------- 

763 taskDef: `TaskDef` 

764 An instance of a `TaskDef` class for a particular `PipelineTask`. 

765 registry: `Registry` 

766 Registry used to construct normalized `DatasetType` objects and 

767 retrieve those that are incomplete. 

768 include_configs : `bool`, optional 

769 If `True` (default) include config dataset types as 

770 ``initOutputs``. 

771 storage_class_mapping : `Mapping` of `str` to `StorageClass`, optional 

772 If a taskdef contains a component dataset type that is unknown 

773 to the registry, its parent StorageClass will be looked up in this 

774 mapping if it is supplied. If the mapping does not contain the 

775 composite dataset type, or the mapping is not supplied an exception 

776 will be raised. 

777 

778 Returns 

779 ------- 

780 types: `TaskDatasetTypes` 

781 The dataset types used by this task. 

782 

783 Raises 

784 ------ 

785 ValueError 

786 Raised if dataset type connection definition differs from 

787 registry definition. 

788 LookupError 

789 Raised if component parent StorageClass could not be determined 

790 and storage_class_mapping does not contain the composite type, or 

791 is set to None. 

792 """ 

793 

794 def makeDatasetTypesSet( 

795 connectionType: str, 

796 is_input: bool, 

797 freeze: bool = True, 

798 ) -> NamedValueSet[DatasetType]: 

799 """Constructs a set of true `DatasetType` objects 

800 

801 Parameters 

802 ---------- 

803 connectionType : `str` 

804 Name of the connection type to produce a set for, corresponds 

805 to an attribute of type `list` on the connection class instance 

806 is_input : `bool` 

807 If `True`, these are input dataset types; otherwise they are 

808 output dataset types. 

809 freeze : `bool`, optional 

810 If `True`, call `NamedValueSet.freeze` on the object returned. 

811 

812 Returns 

813 ------- 

814 datasetTypes : `NamedValueSet` 

815 A set of all datasetTypes which correspond to the connection 

816 type specified in the connection class of this 

817 `PipelineTask`. 

818 

819 Raises 

820 ------ 

821 ValueError 

822 Raised if dataset type connection definition differs from 

823 registry definition. 

824 LookupError 

825 Raised if component parent StorageClass could not be determined 

826 and storage_class_mapping does not contain the composite type, 

827 or is set to None. 

828 

829 Notes 

830 ----- 

831 This function is a closure over the variables ``registry``, 

832 ``taskDef``, and ``storage_class_mapping``. 

833 """ 

834 datasetTypes = NamedValueSet() 

835 for c in iterConnections(taskDef.connections, connectionType): 

836 dimensions = set(getattr(c, "dimensions", set())) 

837 if "skypix" in dimensions: 

838 try: 

839 datasetType = registry.getDatasetType(c.name) 

840 except LookupError as err: 

841 raise LookupError( 

842 f"DatasetType '{c.name}' referenced by " 

843 f"{type(taskDef.connections).__name__} uses 'skypix' as a dimension " 

844 f"placeholder, but does not already exist in the registry. " 

845 f"Note that reference catalog names are now used as the dataset " 

846 f"type name instead of 'ref_cat'." 

847 ) from err 

848 rest1 = set(registry.dimensions.extract(dimensions - set(["skypix"])).names) 

849 rest2 = set( 

850 dim.name for dim in datasetType.dimensions if not isinstance(dim, SkyPixDimension) 

851 ) 

852 if rest1 != rest2: 

853 raise ValueError( 

854 f"Non-skypix dimensions for dataset type {c.name} declared in " 

855 f"connections ({rest1}) are inconsistent with those in " 

856 f"registry's version of this dataset ({rest2})." 

857 ) 

858 else: 

859 # Component dataset types are not explicitly in the 

860 # registry. This complicates consistency checks with 

861 # registry and requires we work out the composite storage 

862 # class. 

863 registryDatasetType = None 

864 try: 

865 registryDatasetType = registry.getDatasetType(c.name) 

866 except KeyError: 

867 compositeName, componentName = DatasetType.splitDatasetTypeName(c.name) 

868 if componentName: 

869 if storage_class_mapping is None or compositeName not in storage_class_mapping: 

870 raise LookupError( 

871 "Component parent class cannot be determined, and " 

872 "composite name was not in storage class mapping, or no " 

873 "storage_class_mapping was supplied" 

874 ) 

875 else: 

876 parentStorageClass = storage_class_mapping[compositeName] 

877 else: 

878 parentStorageClass = None 

879 datasetType = c.makeDatasetType( 

880 registry.dimensions, parentStorageClass=parentStorageClass 

881 ) 

882 registryDatasetType = datasetType 

883 else: 

884 datasetType = c.makeDatasetType( 

885 registry.dimensions, parentStorageClass=registryDatasetType.parentStorageClass 

886 ) 

887 

888 if registryDatasetType and datasetType != registryDatasetType: 

889 # The dataset types differ but first check to see if 

890 # they are compatible before raising. 

891 if is_input: 

892 # This DatasetType must be compatible on get. 

893 is_compatible = datasetType.is_compatible_with(registryDatasetType) 

894 else: 

895 # Has to be able to be converted to the expected 

896 # type on put. 

897 is_compatible = registryDatasetType.is_compatible_with(datasetType) 

898 if is_compatible: 

899 # For inputs we want the pipeline to use the 

900 # pipeline definition, for outputs it should use 

901 # the registry definition. 

902 if not is_input: 

903 datasetType = registryDatasetType 

904 _LOG.debug( 

905 "Dataset types differ (task %s != registry %s) but are compatible" 

906 " for %s in %s.", 

907 datasetType, 

908 registryDatasetType, 

909 "input" if is_input else "output", 

910 taskDef.label, 

911 ) 

912 else: 

913 try: 

914 # Explicitly check for storage class just to 

915 # make more specific message. 

916 _ = datasetType.storageClass 

917 except KeyError: 

918 raise ValueError( 

919 "Storage class does not exist for supplied dataset type " 

920 f"{datasetType} for {taskDef.label}." 

921 ) from None 

922 raise ValueError( 

923 f"Supplied dataset type ({datasetType}) inconsistent with " 

924 f"registry definition ({registryDatasetType}) " 

925 f"for {taskDef.label}." 

926 ) 

927 datasetTypes.add(datasetType) 

928 if freeze: 

929 datasetTypes.freeze() 

930 return datasetTypes 

931 

932 # optionally add initOutput dataset for config 

933 initOutputs = makeDatasetTypesSet("initOutputs", is_input=False, freeze=False) 

934 if include_configs: 

935 initOutputs.add( 

936 DatasetType( 

937 taskDef.configDatasetName, 

938 registry.dimensions.empty, 

939 storageClass="Config", 

940 ) 

941 ) 

942 initOutputs.freeze() 

943 

944 # optionally add output dataset for metadata 

945 outputs = makeDatasetTypesSet("outputs", is_input=False, freeze=False) 

946 if taskDef.metadataDatasetName is not None: 

947 # Metadata is supposed to be of the TaskMetadata type, its 

948 # dimensions correspond to a task quantum. 

949 dimensions = registry.dimensions.extract(taskDef.connections.dimensions) 

950 

951 # Allow the storage class definition to be read from the existing 

952 # dataset type definition if present. 

953 try: 

954 current = registry.getDatasetType(taskDef.metadataDatasetName) 

955 except KeyError: 

956 # No previous definition so use the default. 

957 storageClass = "TaskMetadata" if _TASK_METADATA_TYPE is TaskMetadata else "PropertySet" 

958 else: 

959 storageClass = current.storageClass.name 

960 

961 outputs |= {DatasetType(taskDef.metadataDatasetName, dimensions, storageClass)} 

962 if taskDef.logOutputDatasetName is not None: 

963 # Log output dimensions correspond to a task quantum. 

964 dimensions = registry.dimensions.extract(taskDef.connections.dimensions) 

965 outputs |= {DatasetType(taskDef.logOutputDatasetName, dimensions, "ButlerLogRecords")} 

966 

967 outputs.freeze() 

968 

969 return cls( 

970 initInputs=makeDatasetTypesSet("initInputs", is_input=True), 

971 initOutputs=initOutputs, 

972 inputs=makeDatasetTypesSet("inputs", is_input=True), 

973 prerequisites=makeDatasetTypesSet("prerequisiteInputs", is_input=True), 

974 outputs=outputs, 

975 ) 

976 
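    # Usage sketch (editorial illustration; ``taskDef`` and ``butler`` are
    # hypothetical pre-existing objects):
    #
    #     types = TaskDatasetTypes.fromTaskDef(taskDef, registry=butler.registry)
    #     types.inputs.names        # regular input dataset type names
    #     types.outputs.names       # includes "<label>_metadata" / "<label>_log" if enabled
    #     types.initOutputs.names   # includes "<label>_config" when include_configs=True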

977 

978@dataclass(frozen=True) 

979class PipelineDatasetTypes: 

980 """An immutable struct that classifies the dataset types used in a 

981 `Pipeline`. 

982 """ 

983 

984 packagesDatasetName: ClassVar[str] = "packages" 

985 """Name of a dataset type used to save package versions. 

986 """ 

987 

988 initInputs: NamedValueSet[DatasetType] 

989 """Dataset types that are needed as inputs in order to construct the Tasks 

990 in this Pipeline. 

991 

992 This does not include dataset types that are produced when constructing 

993 other Tasks in the Pipeline (these are classified as `initIntermediates`). 

994 """ 

995 

996 initOutputs: NamedValueSet[DatasetType] 

997 """Dataset types that may be written after constructing the Tasks in this 

998 Pipeline. 

999 

1000 This does not include dataset types that are also used as inputs when 

1001 constructing other Tasks in the Pipeline (these are classified as 

1002 `initIntermediates`). 

1003 """ 

1004 

1005 initIntermediates: NamedValueSet[DatasetType] 

1006 """Dataset types that are both used when constructing one or more Tasks 

1007 in the Pipeline and produced as a side-effect of constructing another 

1008 Task in the Pipeline. 

1009 """ 

1010 

1011 inputs: NamedValueSet[DatasetType] 

1012 """Dataset types that are regular inputs for the full pipeline. 

1013 

1014 If an input dataset needed for a Quantum cannot be found in the input 

1015 collection(s), that Quantum (and all dependent Quanta) will not be 

1016 produced. 

1017 """ 

1018 

1019 prerequisites: NamedValueSet[DatasetType] 

1020 """Dataset types that are prerequisite inputs for the full Pipeline. 

1021 

1022 Prerequisite inputs must exist in the input collection(s) before the 

1023 pipeline is run, but do not constrain the graph - if a prerequisite is 

1024 missing for a Quantum, `PrerequisiteMissingError` is raised. 

1025 

1026 Prerequisite inputs are not resolved until the second stage of 

1027 QuantumGraph generation. 

1028 """ 

1029 

1030 intermediates: NamedValueSet[DatasetType] 

1031 """Dataset types that are output by one Task in the Pipeline and consumed 

1032 as inputs by one or more other Tasks in the Pipeline. 

1033 """ 

1034 

1035 outputs: NamedValueSet[DatasetType] 

1036 """Dataset types that are output by a Task in the Pipeline and not consumed 

1037 by any other Task in the Pipeline. 

1038 """ 

1039 

1040 byTask: Mapping[str, TaskDatasetTypes] 

1041 """Per-Task dataset types, keyed by label in the `Pipeline`. 

1042 

1043 This is guaranteed to be zip-iterable with the `Pipeline` itself (assuming 

1044 neither has been modified since the dataset types were extracted, of 

1045 course). 

1046 """ 

1047 

1048 @classmethod 

1049 def fromPipeline( 

1050 cls, 

1051 pipeline: Union[Pipeline, Iterable[TaskDef]], 

1052 *, 

1053 registry: Registry, 

1054 include_configs: bool = True, 

1055 include_packages: bool = True, 

1056 ) -> PipelineDatasetTypes: 

1057 """Extract and classify the dataset types from all tasks in a 

1058 `Pipeline`. 

1059 

1060 Parameters 

1061 ---------- 

1062 pipeline: `Pipeline` or `Iterable` [ `TaskDef` ] 

1063 A collection of tasks that can be run together. 

1064 registry: `Registry` 

1065 Registry used to construct normalized `DatasetType` objects and 

1066 retrieve those that are incomplete. 

1067 include_configs : `bool`, optional 

1068 If `True` (default) include config dataset types as 

1069 ``initOutputs``. 

1070 include_packages : `bool`, optional 

1071 If `True` (default) include the dataset type for software package 

1072 versions in ``initOutputs``. 

1073 

1074 Returns 

1075 ------- 

1076 types: `PipelineDatasetTypes` 

1077 The dataset types used by this `Pipeline`. 

1078 

1079 Raises 

1080 ------ 

1081 ValueError 

1082 Raised if Tasks are inconsistent about which datasets are marked 

1083 prerequisite. This indicates that the Tasks cannot be run as part 

1084 of the same `Pipeline`. 

1085 """ 

1086 allInputs = NamedValueSet() 

1087 allOutputs = NamedValueSet() 

1088 allInitInputs = NamedValueSet() 

1089 allInitOutputs = NamedValueSet() 

1090 prerequisites = NamedValueSet() 

1091 byTask = dict() 

1092 if include_packages: 

1093 allInitOutputs.add( 

1094 DatasetType( 

1095 cls.packagesDatasetName, 

1096 registry.dimensions.empty, 

1097 storageClass="Packages", 

1098 ) 

1099 ) 

1100 # create a list of TaskDefs in case the input is a generator 

1101 pipeline = list(pipeline) 

1102 

1103 # collect all the output dataset types 

1104 typeStorageclassMap: Dict[str, str] = {} 

1105 for taskDef in pipeline: 

1106 for outConnection in iterConnections(taskDef.connections, "outputs"): 

1107 typeStorageclassMap[outConnection.name] = outConnection.storageClass 

1108 

1109 for taskDef in pipeline: 

1110 thisTask = TaskDatasetTypes.fromTaskDef( 

1111 taskDef, 

1112 registry=registry, 

1113 include_configs=include_configs, 

1114 storage_class_mapping=typeStorageclassMap, 

1115 ) 

1116 allInitInputs |= thisTask.initInputs 

1117 allInitOutputs |= thisTask.initOutputs 

1118 allInputs |= thisTask.inputs 

1119 prerequisites |= thisTask.prerequisites 

1120 allOutputs |= thisTask.outputs 

1121 byTask[taskDef.label] = thisTask 

1122 if not prerequisites.isdisjoint(allInputs): 

1123 raise ValueError( 

1124 "{} marked as both prerequisites and regular inputs".format( 

1125 {dt.name for dt in allInputs & prerequisites} 

1126 ) 

1127 ) 

1128 if not prerequisites.isdisjoint(allOutputs): 

1129 raise ValueError( 

1130 "{} marked as both prerequisites and outputs".format( 

1131 {dt.name for dt in allOutputs & prerequisites} 

1132 ) 

1133 ) 

1134 # Make sure that components which are marked as inputs get treated as 

1135 # intermediates if there is an output which produces the composite 

1136 # containing the component 

1137 intermediateComponents = NamedValueSet() 

1138 intermediateComposites = NamedValueSet() 

1139 outputNameMapping = {dsType.name: dsType for dsType in allOutputs} 

1140 for dsType in allInputs: 

1141 # get the name of a possible component 

1142 name, component = dsType.nameAndComponent() 

1143 # if there is a component name, that means this is a component 

1144 # DatasetType, if there is an output which produces the parent of 

1145 # this component, treat this input as an intermediate 

1146 if component is not None: 

1147 # This needs to be in this if block, because someone might have 

1148 # a composite that is a pure input from existing data 

1149 if name in outputNameMapping: 

1150 intermediateComponents.add(dsType) 

1151 intermediateComposites.add(outputNameMapping[name]) 

1152 

1153 def checkConsistency(a: NamedValueSet, b: NamedValueSet): 

1154 common = a.names & b.names 

1155 for name in common: 

1156 # Any compatibility is allowed. This function does not know 

1157 # if a dataset type is to be used for input or output. 

1158 if not (a[name].is_compatible_with(b[name]) or b[name].is_compatible_with(a[name])): 

1159 raise ValueError(f"Conflicting definitions for dataset type: {a[name]} != {b[name]}.") 

1160 

1161 checkConsistency(allInitInputs, allInitOutputs) 

1162 checkConsistency(allInputs, allOutputs) 

1163 checkConsistency(allInputs, intermediateComposites) 

1164 checkConsistency(allOutputs, intermediateComposites) 

1165 

1166 def frozen(s: NamedValueSet) -> NamedValueSet: 

1167 s.freeze() 

1168 return s 

1169 

1170 return cls( 

1171 initInputs=frozen(allInitInputs - allInitOutputs), 

1172 initIntermediates=frozen(allInitInputs & allInitOutputs), 

1173 initOutputs=frozen(allInitOutputs - allInitInputs), 

1174 inputs=frozen(allInputs - allOutputs - intermediateComponents), 

1175 # If there are storage class differences in inputs and outputs 

1176 # the intermediates have to choose priority. Here choose that 

1177 # inputs to tasks must match the requested storage class by 

1178 # applying the inputs over the top of the outputs. 

1179 intermediates=frozen(allOutputs & allInputs | intermediateComponents), 

1180 outputs=frozen(allOutputs - allInputs - intermediateComposites), 

1181 prerequisites=frozen(prerequisites), 

1182 byTask=MappingProxyType(byTask), # MappingProxyType -> frozen view of dict for immutability 

1183 ) 

1184 
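    # Usage sketch (editorial illustration; ``pipeline`` and ``butler`` are
    # hypothetical pre-existing objects):
    #
    #     dstypes = PipelineDatasetTypes.fromPipeline(pipeline, registry=butler.registry)
    #     dstypes.inputs.names          # overall inputs of the pipeline
    #     dstypes.intermediates.names   # produced by one task, consumed by another
    #     dstypes.byTask["isr"]         # per-task TaskDatasetTypes (label hypothetical)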

1185 @classmethod 

1186 def initOutputNames( 

1187 cls, 

1188 pipeline: Union[Pipeline, Iterable[TaskDef]], 

1189 *, 

1190 include_configs: bool = True, 

1191 include_packages: bool = True, 

1192 ) -> Iterator[str]: 

1193 """Return the names of dataset types ot task initOutputs, Configs, 

1194 and package versions for a pipeline. 

1195 

1196 Parameters 

1197 ---------- 

1198 pipeline: `Pipeline` or `Iterable` [ `TaskDef` ] 

1199 A `Pipeline` instance or collection of `TaskDef` instances. 

1200 include_configs : `bool`, optional 

1201 If `True` (default) include config dataset types. 

1202 include_packages : `bool`, optional 

1203 If `True` (default) include the dataset type for package versions. 

1204 

1205 Yields 

1206 ------ 

1207 datasetTypeName : `str` 

1208 Name of the dataset type. 

1209 """ 

1210 if include_packages: 

1211 # Package versions dataset type 

1212 yield cls.packagesDatasetName 

1213 

1214 if isinstance(pipeline, Pipeline): 

1215 pipeline = pipeline.toExpandedPipeline() 

1216 

1217 for taskDef in pipeline: 

1218 

1219 # all task InitOutputs 

1220 for name in taskDef.connections.initOutputs: 

1221 attribute = getattr(taskDef.connections, name) 

1222 yield attribute.name 

1223 

1224 # config dataset name 

1225 if include_configs: 

1226 yield taskDef.configDatasetName
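    # Usage sketch (editorial illustration; ``pipeline`` is a hypothetical
    # pre-existing `Pipeline`):
    #
    #     init_names = set(PipelineDatasetTypes.initOutputNames(pipeline))
    #     # e.g. {"packages", "<label>_config", ...} plus any task initOutput names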