Coverage for python/lsst/pipe/base/pipeline.py: 18%


399 statements  

1# This file is part of pipe_base. 

2# 

3# Developed for the LSST Data Management System. 

4# This product includes software developed by the LSST Project 

5# (http://www.lsst.org). 

6# See the COPYRIGHT file at the top-level directory of this distribution 

7# for details of code ownership. 

8# 

9# This program is free software: you can redistribute it and/or modify 

10# it under the terms of the GNU General Public License as published by 

11# the Free Software Foundation, either version 3 of the License, or 

12# (at your option) any later version. 

13# 

14# This program is distributed in the hope that it will be useful, 

15# but WITHOUT ANY WARRANTY; without even the implied warranty of 

16# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the 

17# GNU General Public License for more details. 

18# 

19# You should have received a copy of the GNU General Public License 

20# along with this program. If not, see <http://www.gnu.org/licenses/>. 

21from __future__ import annotations 

22 

23"""Module defining Pipeline class and related methods. 

24""" 

25 

26__all__ = ["Pipeline", "TaskDef", "TaskDatasetTypes", "PipelineDatasetTypes", "LabelSpecifier"] 

27 

28import copy 

29import logging 

30import os 

31import re 

32import urllib.parse 

33import warnings 

34 

35# ------------------------------- 

36# Imports of standard modules -- 

37# ------------------------------- 

38from dataclasses import dataclass 

39from types import MappingProxyType 

40from typing import ( 

41 TYPE_CHECKING, 

42 AbstractSet, 

43 ClassVar, 

44 Dict, 

45 Generator, 

46 Iterable, 

47 Iterator, 

48 Mapping, 

49 Optional, 

50 Set, 

51 Tuple, 

52 Type, 

53 Union, 

54) 

55 

56# ----------------------------- 

57# Imports for other modules -- 

58from lsst.daf.butler import DatasetType, NamedValueSet, Registry, SkyPixDimension 

59from lsst.resources import ResourcePath, ResourcePathExpression 

60from lsst.utils import doImportType 

61from lsst.utils.introspection import get_full_type_name 

62 

63from . import pipelineIR, pipeTools 

64from ._task_metadata import TaskMetadata 

65from .configOverrides import ConfigOverrides 

66from .connections import iterConnections 

67from .pipelineTask import PipelineTask 

68from .task import _TASK_METADATA_TYPE 

69 

70if TYPE_CHECKING: # Imports needed only for type annotations; may be circular.

71 from lsst.obs.base import Instrument 

72 from lsst.pex.config import Config 

73 

74# ---------------------------------- 

75# Local non-exported definitions -- 

76# ---------------------------------- 

77 

78_LOG = logging.getLogger(__name__) 

79 

80# ------------------------ 

81# Exported definitions -- 

82# ------------------------ 

83 

84 

85@dataclass 

86class LabelSpecifier: 

87 """A structure to specify a subset of labels to load 

88 

89 This structure may contain a set of labels to be used in subsetting a 

90 pipeline, or a beginning and end point. Beginning or end may be empty, 

91 in which case the range will be a half open interval. Unlike python 

92 iteration bounds, end bounds are *INCLUDED*. Note that range based 

93 selection is not well defined for pipelines that are not linear in nature, 

94 and correct behavior is not guaranteed, or may vary from run to run. 
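
Examples
--------
An illustrative sketch; the task labels used here are hypothetical.

>>> from lsst.pipe.base.pipeline import LabelSpecifier
>>> # Select an explicit set of labels (hypothetical label names).
>>> spec = LabelSpecifier(labels={"isr", "calibrate"})
>>> # Or select an inclusive range of labels; begin and/or end may be omitted.
>>> spec = LabelSpecifier(begin="isr", end="calibrate")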

95 """ 

96 

97 labels: Optional[Set[str]] = None 

98 begin: Optional[str] = None 

99 end: Optional[str] = None 

100 

101 def __post_init__(self) -> None: 

102 if self.labels is not None and (self.begin or self.end): 

103 raise ValueError( 

104 "This struct can only be initialized with a labels set or a begin (and/or) end specifier" 

105 ) 

106 

107 

108class TaskDef: 

109 """TaskDef is a collection of information about task needed by Pipeline. 

110 

111 The information includes task name, configuration object and optional 

112 task class. This class is just a collection of attributes and it exposes 

113 all of them so that attributes could potentially be modified in place 

114 (e.g. if configuration needs extra overrides). 

115 

116 Attributes 

117 ---------- 

118 taskName : `str`, optional 

119 `PipelineTask` class name, currently it is not specified whether this 

120 is a fully-qualified name or partial name (e.g. ``module.TaskClass``). 

121 Framework should be prepared to handle all cases. If not provided, 

122 ``taskClass`` must be, and ``taskClass.__name__`` is used. 

123 config : `lsst.pex.config.Config`, optional 

124 Instance of the configuration class corresponding to this task class, 

125 usually with all overrides applied. This config will be frozen. If 

126 not provided, ``taskClass`` must be provided and 

127 ``taskClass.ConfigClass()`` will be used. 

128 taskClass : `type`, optional 

129 `PipelineTask` class object, can be ``None``. If ``None`` then 

130 framework will have to locate and load class. 

131 label : `str`, optional 

132 Task label, usually a short string unique in a pipeline. If not 

133 provided, ``taskClass`` must be, and ``taskClass._DefaultName`` will 

134 be used. 
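
Examples
--------
A minimal sketch; ``MyTask`` stands in for any concrete `PipelineTask`
subclass and is hypothetical.

>>> from lsst.pipe.base.pipeline import TaskDef
>>> taskDef = TaskDef(taskClass=MyTask, label="myTask")  # MyTask is hypothetical
>>> taskDef.configDatasetName
'myTask_config'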

135 """ 

136 

137 def __init__( 

138 self, 

139 taskName: Optional[str] = None, 

140 config: Optional[Config] = None, 

141 taskClass: Optional[Type[PipelineTask]] = None, 

142 label: Optional[str] = None, 

143 ): 

144 if taskName is None: 

145 if taskClass is None: 

146 raise ValueError("At least one of `taskName` and `taskClass` must be provided.") 

147 taskName = taskClass.__name__ 

148 if config is None: 

149 if taskClass is None: 

150 raise ValueError("`taskClass` must be provided if `config` is not.") 

151 config = taskClass.ConfigClass() 

152 if label is None: 

153 if taskClass is None: 

154 raise ValueError("`taskClass` must be provided if `label` is not.") 

155 label = taskClass._DefaultName 

156 self.taskName = taskName 

157 try: 

158 config.validate() 

159 except Exception: 

160 _LOG.error("Configuration validation failed for task %s (%s)", label, taskName) 

161 raise 

162 config.freeze() 

163 self.config = config 

164 self.taskClass = taskClass 

165 self.label = label 

166 self.connections = config.connections.ConnectionsClass(config=config) 

167 

168 @property 

169 def configDatasetName(self) -> str: 

170 """Name of a dataset type for configuration of this task (`str`)""" 

171 return self.label + "_config" 

172 

173 @property 

174 def metadataDatasetName(self) -> Optional[str]: 

175 """Name of a dataset type for metadata of this task, `None` if 

176 metadata is not to be saved (`str`) 

177 """ 

178 if self.config.saveMetadata: 

179 return self.makeMetadataDatasetName(self.label) 

180 else: 

181 return None 

182 

183 @classmethod 

184 def makeMetadataDatasetName(cls, label: str) -> str: 

185 """Construct the name of the dataset type for metadata for a task. 

186 

187 Parameters 

188 ---------- 

189 label : `str` 

190 Label for the task within its pipeline. 

191 

192 Returns 

193 ------- 

194 name : `str` 

195 Name of the task's metadata dataset type. 
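
Examples
--------
A small illustration; the label is arbitrary.

>>> from lsst.pipe.base.pipeline import TaskDef
>>> TaskDef.makeMetadataDatasetName("isr")
'isr_metadata'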

196 """ 

197 return f"{label}_metadata" 

198 

199 @property 

200 def logOutputDatasetName(self) -> Optional[str]: 

201 """Name of a dataset type for log output from this task, `None` if 

202 logs are not to be saved (`str`) 

203 """ 

204 if self.config.saveLogOutput: 

205 return self.label + "_log" 

206 else: 

207 return None 

208 

209 def __str__(self) -> str: 

210 rep = "TaskDef(" + self.taskName 

211 if self.label: 

212 rep += ", label=" + self.label 

213 rep += ")" 

214 return rep 

215 

216 def __eq__(self, other: object) -> bool: 

217 if not isinstance(other, TaskDef): 

218 return False 

219 # This does not consider equality of configs when determining equality 

220 # as config equality is a difficult thing to define. Should be updated 

221 # after DM-27847 

222 return self.taskClass == other.taskClass and self.label == other.label 

223 

224 def __hash__(self) -> int: 

225 return hash((self.taskClass, self.label)) 

226 

227 

228class Pipeline: 

229 """A `Pipeline` is a representation of a series of tasks to run, and the 

230 configuration for those tasks. 

231 

232 Parameters 

233 ---------- 

234 description : `str` 

235 A description of what this pipeline does.

236 """ 

237 

238 def __init__(self, description: str): 

239 pipeline_dict = {"description": description, "tasks": {}} 

240 self._pipelineIR = pipelineIR.PipelineIR(pipeline_dict) 

241 

242 @classmethod 

243 def fromFile(cls, filename: str) -> Pipeline: 

244 """Load a pipeline defined in a pipeline yaml file. 

245 

246 Parameters 

247 ---------- 

248 filename: `str` 

249 A path that points to a pipeline defined in yaml format. This 

250 filename may also supply additional labels to be used in 

251 subsetting the loaded Pipeline. These labels are separated from 

252 the path by a \\#, and may be specified as a comma separated 

253 list, or a range denoted as beginning..end. Beginning or end may 

254 be empty, in which case the range will be a half open interval. 

255 Unlike python iteration bounds, end bounds are *INCLUDED*. Note 

256 that range based selection is not well defined for pipelines that 

257 are not linear in nature, and correct behavior is not guaranteed, 

258 or may vary from run to run. 

259 

260 Returns 

261 ------- 

262 pipeline: `Pipeline` 

263 The pipeline loaded from specified location with appropriate (if 

264 any) subsetting 

265 

266 Notes 

267 ----- 

268 This method attempts to prune any contracts that contain labels which 

269 are not in the declared subset of labels. This pruning is done using a 

270 string based matching due to the nature of contracts and may prune more 

271 than it should. 

272 """ 

273 return cls.from_uri(filename) 

274 

275 @classmethod 

276 def from_uri(cls, uri: ResourcePathExpression) -> Pipeline: 

277 """Load a pipeline defined in a pipeline yaml file at a location 

278 specified by a URI. 

279 

280 Parameters 

281 ---------- 

282 uri: convertible to `ResourcePath` 

283 If a string is supplied this should be a URI path that points to a 

284 pipeline defined in yaml format, either as a direct path to the 

285 yaml file, or as a directory containing a "pipeline.yaml" file (the 

286 form used by `write_to_uri` with ``expand=True``). This uri may 

287 also supply additional labels to be used in subsetting the loaded 

288 Pipeline. These labels are separated from the path by a \\#, and 

289 may be specified as a comma separated list, or a range denoted as 

290 beginning..end. Beginning or end may be empty, in which case the 

291 range will be a half open interval. Unlike python iteration bounds, 

292 end bounds are *INCLUDED*. Note that range based selection is not 

293 well defined for pipelines that are not linear in nature, and 

294 correct behavior is not guaranteed, or may vary from run to run. 

295 The same specifiers can be used with a `ResourcePath` object, by 

296 being the sole contents of the fragment attribute.

297 

298 Returns 

299 ------- 

300 pipeline: `Pipeline` 

301 The pipeline loaded from specified location with appropriate (if 

302 any) subsetting 

303 

304 Notes 

305 ----- 

306 This method attempts to prune any contracts that contain labels which 

307 are not in the declared subset of labels. This pruning is done using a 

308 string based matching due to the nature of contracts and may prune more 

309 than it should. 
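
Examples
--------
An illustrative sketch; the file path and task labels are hypothetical.

>>> from lsst.pipe.base.pipeline import Pipeline
>>> # Load an entire pipeline from a local YAML file (hypothetical path).
>>> pipeline = Pipeline.from_uri("/path/to/pipeline.yaml")
>>> # Load only the inclusive range of tasks from "isr" through "calibrate".
>>> subset = Pipeline.from_uri("/path/to/pipeline.yaml#isr..calibrate")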

310 """ 

311 # Split up the uri and any labels that were supplied 

312 uri, label_specifier = cls._parse_file_specifier(uri) 

313 pipeline: Pipeline = cls.fromIR(pipelineIR.PipelineIR.from_uri(uri)) 

314 

315 # If there are labels supplied, only keep those 

316 if label_specifier is not None: 

317 pipeline = pipeline.subsetFromLabels(label_specifier) 

318 return pipeline 

319 

320 def subsetFromLabels(self, labelSpecifier: LabelSpecifier) -> Pipeline: 

321 """Subset a pipeline to contain only labels specified in labelSpecifier 

322 

323 Parameters 

324 ---------- 

325 labelSpecifier : `LabelSpecifier`

326 Object containing labels that describes how to subset a pipeline. 

327 

328 Returns 

329 ------- 

330 pipeline : `Pipeline` 

331 A new pipeline object that is a subset of the old pipeline 

332 

333 Raises 

334 ------ 

335 ValueError 

336 Raised if there is an issue with specified labels 

337 

338 Notes 

339 ----- 

340 This method attempts to prune any contracts that contain labels which 

341 are not in the declared subset of labels. This pruning is done using a 

342 string based matching due to the nature of contracts and may prune more 

343 than it should. 

344 """ 

345 # Labels supplied as a set 

346 if labelSpecifier.labels: 

347 labelSet = labelSpecifier.labels 

348 # Labels supplied as a range, first create a list of all the labels 

349 # in the pipeline sorted according to task dependency. Then only 

350 # keep labels that lie between the supplied bounds 

351 else: 

352 # Create a copy of the pipeline to use when assessing the label 

353 # ordering. Use a dict for fast searching while preserving order. 

354 # Remove contracts so they do not fail in the expansion step. This 

355 # is needed because a user may only configure the tasks they intend 

356 # to run, which may cause some contracts to fail if they will later 

357 # be dropped 

358 pipeline = copy.deepcopy(self) 

359 pipeline._pipelineIR.contracts = [] 

360 labels = {taskdef.label: True for taskdef in pipeline.toExpandedPipeline()} 

361 

362 # Verify the bounds are in the labels 

363 if labelSpecifier.begin is not None: 

364 if labelSpecifier.begin not in labels: 

365 raise ValueError( 

366 f"Beginning of range subset, {labelSpecifier.begin}, not found in " 

367 "pipeline definition" 

368 ) 

369 if labelSpecifier.end is not None: 

370 if labelSpecifier.end not in labels: 

371 raise ValueError( 

372 f"End of range subset, {labelSpecifier.end}, not found in pipeline definition" 

373 ) 

374 

375 labelSet = set() 

376 for label in labels: 

377 if labelSpecifier.begin is not None: 

378 if label != labelSpecifier.begin: 

379 continue 

380 else: 

381 labelSpecifier.begin = None 

382 labelSet.add(label) 

383 if labelSpecifier.end is not None and label == labelSpecifier.end: 

384 break 

385 return Pipeline.fromIR(self._pipelineIR.subset_from_labels(labelSet)) 

386 

387 @staticmethod 

388 def _parse_file_specifier(uri: ResourcePathExpression) -> Tuple[ResourcePath, Optional[LabelSpecifier]]: 

389 """Split appart a uri and any possible label subsets""" 

390 if isinstance(uri, str): 

391 # This is to support legacy pipelines during transition 

392 uri, num_replace = re.subn("[:](?!\\/\\/)", "#", uri) 

393 if num_replace: 

394 warnings.warn( 

395 f"The pipeline file {uri} seems to use the legacy : to separate " 

396 "labels, this is deprecated and will be removed after June 2021, please use " 

397 "# instead.", 

398 category=FutureWarning, 

399 ) 

400 if uri.count("#") > 1: 

401 raise ValueError("Only one set of labels is allowed when specifying a pipeline to load") 

402 # Everything else can be converted directly to ResourcePath. 

403 uri = ResourcePath(uri) 

404 label_subset = uri.fragment or None 

405 

406 specifier: Optional[LabelSpecifier] 

407 if label_subset is not None: 

408 label_subset = urllib.parse.unquote(label_subset) 

409 args: Dict[str, Union[Set[str], str, None]] 

410 # labels supplied as a list 

411 if "," in label_subset: 

412 if ".." in label_subset: 

413 raise ValueError( 

414 "Can only specify a list of labels or a rangewhen loading a Pipline not both" 

415 ) 

416 args = {"labels": set(label_subset.split(","))} 

417 # labels supplied as a range 

418 elif ".." in label_subset: 

419 # Try to de-structure the labelSubset, this will fail if more 

420 # than one range is specified 

421 begin, end, *rest = label_subset.split("..") 

422 if rest: 

423 raise ValueError("Only one range can be specified when loading a pipeline") 

424 args = {"begin": begin if begin else None, "end": end if end else None} 

425 # Assume anything else is a single label 

426 else: 

427 args = {"labels": {label_subset}} 

428 

429 # MyPy doesn't like how cavalier kwarg construction is with types. 

430 specifier = LabelSpecifier(**args) # type: ignore 

431 else: 

432 specifier = None 

433 

434 return uri, specifier 

435 

436 @classmethod 

437 def fromString(cls, pipeline_string: str) -> Pipeline: 

438 """Create a pipeline from string formatted as a pipeline document. 

439 

440 Parameters 

441 ---------- 

442 pipeline_string : `str` 

443 A string formatted like a pipeline document.

444 

445 Returns 

446 ------- 

447 pipeline: `Pipeline` 
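
Examples
--------
A minimal sketch; the task class path is hypothetical and only needs to
be importable when the pipeline is expanded.

>>> from lsst.pipe.base.pipeline import Pipeline
>>> document = '''
... description: A one-task example pipeline
... tasks:
...   myTask:
...     class: mypackage.MyTask
... '''
>>> pipeline = Pipeline.fromString(document)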

448 """ 

449 pipeline = cls.fromIR(pipelineIR.PipelineIR.from_string(pipeline_string)) 

450 return pipeline 

451 

452 @classmethod 

453 def fromIR(cls, deserialized_pipeline: pipelineIR.PipelineIR) -> Pipeline: 

454 """Create a pipeline from an already created `PipelineIR` object. 

455 

456 Parameters 

457 ---------- 

458 deserialized_pipeline: `PipelineIR` 

459 An already created pipeline intermediate representation object 

460 

461 Returns 

462 ------- 

463 pipeline: `Pipeline` 

464 """ 

465 pipeline = cls.__new__(cls) 

466 pipeline._pipelineIR = deserialized_pipeline 

467 return pipeline 

468 

469 @classmethod 

470 def fromPipeline(cls, pipeline: Pipeline) -> Pipeline: 

471 """Create a new pipeline by copying an already existing `Pipeline`. 

472 

473 Parameters 

474 ---------- 

475 pipeline: `Pipeline` 

476 An existing `Pipeline` object to copy.

477 

478 Returns 

479 ------- 

480 pipeline: `Pipeline` 

481 """ 

482 return cls.fromIR(copy.deepcopy(pipeline._pipelineIR)) 

483 

484 def __str__(self) -> str: 

485 # tasks need to be sorted on each call because someone might have added

486 # or removed a task, and caching the order does not seem worth the small

487 # overhead

488 labels = [td.label for td in self._toExpandedPipelineImpl(checkContracts=False)] 

489 self._pipelineIR.reorder_tasks(labels) 

490 return str(self._pipelineIR) 

491 

492 def addInstrument(self, instrument: Union[Instrument, str]) -> None: 

493 """Add an instrument to the pipeline, or replace an instrument that is 

494 already defined. 

495 

496 Parameters 

497 ---------- 

498 instrument : `~lsst.obs.base.Instrument` or `str`

499 Either an instance of a `lsst.obs.base.Instrument` subclass or

500 a string corresponding to a fully qualified

501 `lsst.obs.base.Instrument` subclass name.

502 """ 

503 if isinstance(instrument, str): 

504 pass 

505 else: 

506 # TODO: assume that this is a subclass of Instrument, no type 

507 # checking 

508 instrument = get_full_type_name(instrument) 

509 self._pipelineIR.instrument = instrument 

510 

511 def getInstrument(self) -> Optional[str]: 

512 """Get the instrument from the pipeline. 

513 

514 Returns 

515 ------- 

516 instrument : `str` or `None`

517 The fully qualified name of a `lsst.obs.base.Instrument` subclass,

518 or `None` if the pipeline does not have an instrument.

519 """ 

520 return self._pipelineIR.instrument 

521 

522 def addTask(self, task: Union[Type[PipelineTask], str], label: str) -> None: 

523 """Add a new task to the pipeline, or replace a task that is already 

524 associated with the supplied label. 

525 

526 Parameters 

527 ---------- 

528 task: `PipelineTask` or `str`

529 Either a `PipelineTask` subclass or a string corresponding to a

530 fully qualified `PipelineTask` subclass name.

531 label: `str` 

532 A label that is used to identify the `PipelineTask` being added 
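
Examples
--------
An illustrative sketch; the task class path and label are hypothetical.

>>> from lsst.pipe.base.pipeline import Pipeline
>>> pipeline = Pipeline("A pipeline assembled in Python")
>>> pipeline.addTask("mypackage.MyTask", "myTask")  # hypothetical task class
>>> len(pipeline)
1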

533 """ 

534 if isinstance(task, str): 

535 taskName = task 

536 elif issubclass(task, PipelineTask): 

537 taskName = get_full_type_name(task) 

538 else: 

539 raise ValueError( 

540 "task must be either a child class of PipelineTask or a string containing" 

541 " a fully qualified name to one" 

542 ) 

543 if not label: 

544 # in some cases (with a command line-generated pipeline) tasks can

545 # be defined without a label, which is not acceptable; use the task's

546 # _DefaultName in that case

547 if isinstance(task, str): 

548 task_class = doImportType(task) 

549 label = task_class._DefaultName 

550 self._pipelineIR.tasks[label] = pipelineIR.TaskIR(label, taskName) 

551 

552 def removeTask(self, label: str) -> None: 

553 """Remove a task from the pipeline. 

554 

555 Parameters 

556 ---------- 

557 label : `str` 

558 The label used to identify the task that is to be removed 

559 

560 Raises 

561 ------ 

562 KeyError 

563 If no task with that label exists in the pipeline 

564 

565 """ 

566 self._pipelineIR.tasks.pop(label) 

567 

568 def addConfigOverride(self, label: str, key: str, value: object) -> None: 

569 """Apply single config override. 

570 

571 Parameters 

572 ---------- 

573 label : `str` 

574 Label of the task. 

575 key: `str` 

576 Fully-qualified field name. 

577 value : object 

578 Value to be given to a field. 
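
Examples
--------
A minimal sketch; the pipeline file, label, and field name are
hypothetical and must correspond to an existing task and config field.

>>> from lsst.pipe.base.pipeline import Pipeline
>>> pipeline = Pipeline.from_uri("/path/to/pipeline.yaml")  # hypothetical file
>>> # Equivalent to setting ``doSomething: false`` in the task's config block.
>>> pipeline.addConfigOverride("myTask", "doSomething", False)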

579 """ 

580 self._addConfigImpl(label, pipelineIR.ConfigIR(rest={key: value})) 

581 

582 def addConfigFile(self, label: str, filename: str) -> None: 

583 """Add overrides from a specified file. 

584 

585 Parameters 

586 ---------- 

587 label : `str` 

588 The label used to identify the task whose config is to be

589 modified.

590 filename : `str` 

591 Path to the override file. 

592 """ 

593 self._addConfigImpl(label, pipelineIR.ConfigIR(file=[filename])) 

594 

595 def addConfigPython(self, label: str, pythonString: str) -> None: 

596 """Add Overrides by running a snippet of python code against a config. 

597 

598 Parameters 

599 ---------- 

600 label : `str` 

601 The label used to identify the task whose config is to be

602 modified.

603 pythonString: `str` 

604 A string of valid Python code to be executed. This is done

605 with config as the only locally accessible value.

606 """ 

607 self._addConfigImpl(label, pipelineIR.ConfigIR(python=pythonString)) 

608 

609 def _addConfigImpl(self, label: str, newConfig: pipelineIR.ConfigIR) -> None: 

610 if label == "parameters": 

611 if newConfig.rest.keys() - self._pipelineIR.parameters.mapping.keys(): 

612 raise ValueError("Cannot override parameters that are not defined in pipeline") 

613 self._pipelineIR.parameters.mapping.update(newConfig.rest) 

614 if newConfig.file: 

615 raise ValueError("Setting parameters section with config file is not supported") 

616 if newConfig.python: 

617 raise ValueError("Setting parameters section using python block is unsupported")

618 return 

619 if label not in self._pipelineIR.tasks: 

620 raise LookupError(f"There are no tasks labeled '{label}' in the pipeline") 

621 self._pipelineIR.tasks[label].add_or_update_config(newConfig) 

622 

623 def toFile(self, filename: str) -> None: 

624 self._pipelineIR.to_file(filename) 

625 

626 def write_to_uri(self, uri: ResourcePathExpression) -> None: 

627 """Write the pipeline to a file or directory. 

628 

629 Parameters 

630 ---------- 

631 uri : convertible to `ResourcePath` 

632 URI to write to; may have any scheme with `ResourcePath` write 

633 support or no scheme for a local file/directory. Should have a 

634 ``.yaml`` extension.

635 """ 

636 labels = [td.label for td in self._toExpandedPipelineImpl(checkContracts=False)] 

637 self._pipelineIR.reorder_tasks(labels) 

638 self._pipelineIR.write_to_uri(uri) 

639 

640 def toExpandedPipeline(self) -> Generator[TaskDef, None, None]: 

641 """Returns a generator of TaskDefs which can be used to create quantum 

642 graphs. 

643 

644 Returns 

645 ------- 

646 generator : generator of `TaskDef` 

647 The generator returned will be the sorted iterator of tasks which 

648 are to be used in constructing a quantum graph. 

649 

650 Raises 

651 ------ 

652 NotImplementedError 

653 If a dataId is supplied in a config block. This is in place for 

654 future use 
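
Examples
--------
A small illustration; the pipeline file is hypothetical.

>>> from lsst.pipe.base.pipeline import Pipeline
>>> pipeline = Pipeline.from_uri("/path/to/pipeline.yaml")  # hypothetical file
>>> labels = [taskDef.label for taskDef in pipeline.toExpandedPipeline()]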

655 """ 

656 yield from self._toExpandedPipelineImpl() 

657 

658 def _toExpandedPipelineImpl(self, checkContracts: bool = True) -> Iterable[TaskDef]: 

659 taskDefs = [] 

660 for label in self._pipelineIR.tasks: 

661 taskDefs.append(self._buildTaskDef(label)) 

662 

663 # let's evaluate the contracts

664 if self._pipelineIR.contracts is not None: 

665 label_to_config = {x.label: x.config for x in taskDefs} 

666 for contract in self._pipelineIR.contracts: 

667 # execute this in its own line so it can raise a good error 

668 # message if there were problems with the eval

669 success = eval(contract.contract, None, label_to_config) 

670 if not success: 

671 extra_info = f": {contract.msg}" if contract.msg is not None else "" 

672 raise pipelineIR.ContractError( 

673 f"Contract(s) '{contract.contract}' were not satisfied{extra_info}" 

674 ) 

675 

676 taskDefs = sorted(taskDefs, key=lambda x: x.label) 

677 yield from pipeTools.orderPipeline(taskDefs) 

678 

679 def _buildTaskDef(self, label: str) -> TaskDef: 

680 if (taskIR := self._pipelineIR.tasks.get(label)) is None: 

681 raise NameError(f"Label {label} does not appear in this pipeline") 

682 taskClass: Type[PipelineTask] = doImportType(taskIR.klass) 

683 taskName = taskClass.__qualname__ 

684 config = taskClass.ConfigClass() 

685 overrides = ConfigOverrides() 

686 if self._pipelineIR.instrument is not None: 

687 overrides.addInstrumentOverride(self._pipelineIR.instrument, taskClass._DefaultName) 

688 if taskIR.config is not None: 

689 for configIR in (configIr.formatted(self._pipelineIR.parameters) for configIr in taskIR.config): 

690 if configIR.dataId is not None: 

691 raise NotImplementedError( 

692 "Specializing a config on a partial data id is not yet " 

693 "supported in Pipeline definition" 

694 ) 

695 # only apply override if it applies to everything 

696 if configIR.dataId is None: 

697 if configIR.file: 

698 for configFile in configIR.file: 

699 overrides.addFileOverride(os.path.expandvars(configFile)) 

700 if configIR.python is not None: 

701 overrides.addPythonOverride(configIR.python) 

702 for key, value in configIR.rest.items(): 

703 overrides.addValueOverride(key, value) 

704 overrides.applyTo(config) 

705 return TaskDef(taskName=taskName, config=config, taskClass=taskClass, label=label) 

706 

707 def __iter__(self) -> Generator[TaskDef, None, None]: 

708 return self.toExpandedPipeline() 

709 

710 def __getitem__(self, item: str) -> TaskDef: 

711 return self._buildTaskDef(item) 

712 

713 def __len__(self) -> int: 

714 return len(self._pipelineIR.tasks) 

715 

716 def __eq__(self, other: object) -> bool: 

717 if not isinstance(other, Pipeline): 

718 return False 

719 return self._pipelineIR == other._pipelineIR 

720 

721 

722@dataclass(frozen=True) 

723class TaskDatasetTypes: 

724 """An immutable struct that extracts and classifies the dataset types used 

725 by a `PipelineTask` 

726 """ 

727 

728 initInputs: NamedValueSet[DatasetType] 

729 """Dataset types that are needed as inputs in order to construct this Task. 

730 

731 Task-level `initInputs` may be classified as either 

732 `~PipelineDatasetTypes.initInputs` or 

733 `~PipelineDatasetTypes.initIntermediates` at the Pipeline level. 

734 """ 

735 

736 initOutputs: NamedValueSet[DatasetType] 

737 """Dataset types that may be written after constructing this Task. 

738 

739 Task-level `initOutputs` may be classified as either 

740 `~PipelineDatasetTypes.initOutputs` or 

741 `~PipelineDatasetTypes.initIntermediates` at the Pipeline level. 

742 """ 

743 

744 inputs: NamedValueSet[DatasetType] 

745 """Dataset types that are regular inputs to this Task. 

746 

747 If an input dataset needed for a Quantum cannot be found in the input 

748 collection(s) or produced by another Task in the Pipeline, that Quantum 

749 (and all dependent Quanta) will not be produced. 

750 

751 Task-level `inputs` may be classified as either 

752 `~PipelineDatasetTypes.inputs` or `~PipelineDatasetTypes.intermediates` 

753 at the Pipeline level. 

754 """ 

755 

756 prerequisites: NamedValueSet[DatasetType] 

757 """Dataset types that are prerequisite inputs to this Task. 

758 

759 Prerequisite inputs must exist in the input collection(s) before the 

760 pipeline is run, but do not constrain the graph - if a prerequisite is 

761 missing for a Quantum, `PrerequisiteMissingError` is raised. 

762 

763 Prerequisite inputs are not resolved until the second stage of 

764 QuantumGraph generation. 

765 """ 

766 

767 outputs: NamedValueSet[DatasetType] 

768 """Dataset types that are produced by this Task. 

769 

770 Task-level `outputs` may be classified as either 

771 `~PipelineDatasetTypes.outputs` or `~PipelineDatasetTypes.intermediates` 

772 at the Pipeline level. 

773 """ 

774 

775 @classmethod 

776 def fromTaskDef( 

777 cls, 

778 taskDef: TaskDef, 

779 *, 

780 registry: Registry, 

781 include_configs: bool = True, 

782 storage_class_mapping: Optional[Mapping[str, str]] = None, 

783 ) -> TaskDatasetTypes: 

784 """Extract and classify the dataset types from a single `PipelineTask`. 

785 

786 Parameters 

787 ---------- 

788 taskDef: `TaskDef` 

789 An instance of a `TaskDef` class for a particular `PipelineTask`. 

790 registry: `Registry` 

791 Registry used to construct normalized `DatasetType` objects and 

792 retrieve those that are incomplete. 

793 include_configs : `bool`, optional 

794 If `True` (default) include config dataset types as 

795 ``initOutputs``. 

796 storage_class_mapping : `Mapping` of `str` to `StorageClass`, optional 

797 If a taskdef contains a component dataset type that is unknown 

798 to the registry, its parent StorageClass will be looked up in this 

799 mapping if it is supplied. If the mapping does not contain the 

800 composite dataset type, or the mapping is not supplied, an exception

801 will be raised. 

802 

803 Returns 

804 ------- 

805 types: `TaskDatasetTypes` 

806 The dataset types used by this task. 

807 

808 Raises 

809 ------ 

810 ValueError 

811 Raised if dataset type connection definition differs from 

812 registry definition. 

813 LookupError 

814 Raised if component parent StorageClass could not be determined 

815 and storage_class_mapping does not contain the composite type, or 

816 is set to None. 
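
Examples
--------
A sketch of typical use; the repository path, pipeline file, and task
label are hypothetical.

>>> from lsst.daf.butler import Butler
>>> from lsst.pipe.base.pipeline import Pipeline, TaskDatasetTypes
>>> butler = Butler("/path/to/repo")  # hypothetical data repository
>>> pipeline = Pipeline.from_uri("/path/to/pipeline.yaml")  # hypothetical file
>>> types = TaskDatasetTypes.fromTaskDef(pipeline["myTask"], registry=butler.registry)
>>> output_names = {dsType.name for dsType in types.outputs}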

817 """ 

818 

819 def makeDatasetTypesSet( 

820 connectionType: str, 

821 is_input: bool, 

822 freeze: bool = True, 

823 ) -> NamedValueSet[DatasetType]: 

824 """Constructs a set of true `DatasetType` objects 

825 

826 Parameters 

827 ---------- 

828 connectionType : `str` 

829 Name of the connection type to produce a set for, corresponds 

830 to an attribute of type `list` on the connection class instance 

831 is_input : `bool` 

832 If `True`, these are input dataset types; otherwise they are

833 output dataset types.

834 freeze : `bool`, optional 

835 If `True`, call `NamedValueSet.freeze` on the object returned. 

836 

837 Returns 

838 ------- 

839 datasetTypes : `NamedValueSet` 

840 A set of all datasetTypes which correspond to the input 

841 connection type specified in the connection class of this 

842 `PipelineTask` 

843 

844 Raises 

845 ------ 

846 ValueError 

847 Raised if dataset type connection definition differs from 

848 registry definition. 

849 LookupError 

850 Raised if component parent StorageClass could not be determined 

851 and storage_class_mapping does not contain the composite type, 

852 or is set to None. 

853 

854 Notes 

855 ----- 

856 This function is a closure over the variables ``registry``,

857 ``taskDef``, and ``storage_class_mapping``.

858 """ 

859 datasetTypes = NamedValueSet[DatasetType]() 

860 for c in iterConnections(taskDef.connections, connectionType): 

861 dimensions = set(getattr(c, "dimensions", set())) 

862 if "skypix" in dimensions: 

863 try: 

864 datasetType = registry.getDatasetType(c.name) 

865 except LookupError as err: 

866 raise LookupError( 

867 f"DatasetType '{c.name}' referenced by " 

868 f"{type(taskDef.connections).__name__} uses 'skypix' as a dimension " 

869 f"placeholder, but does not already exist in the registry. " 

870 f"Note that reference catalog names are now used as the dataset " 

871 f"type name instead of 'ref_cat'." 

872 ) from err 

873 rest1 = set(registry.dimensions.extract(dimensions - set(["skypix"])).names) 

874 rest2 = set( 

875 dim.name for dim in datasetType.dimensions if not isinstance(dim, SkyPixDimension) 

876 ) 

877 if rest1 != rest2: 

878 raise ValueError( 

879 f"Non-skypix dimensions for dataset type {c.name} declared in " 

880 f"connections ({rest1}) are inconsistent with those in " 

881 f"registry's version of this dataset ({rest2})." 

882 ) 

883 else: 

884 # Component dataset types are not explicitly in the 

885 # registry. This complicates consistency checks with 

886 # registry and requires we work out the composite storage 

887 # class. 

888 registryDatasetType = None 

889 try: 

890 registryDatasetType = registry.getDatasetType(c.name) 

891 except KeyError: 

892 compositeName, componentName = DatasetType.splitDatasetTypeName(c.name) 

893 if componentName: 

894 if storage_class_mapping is None or compositeName not in storage_class_mapping: 

895 raise LookupError( 

896 "Component parent class cannot be determined, and " 

897 "composite name was not in storage class mapping, or no " 

898 "storage_class_mapping was supplied" 

899 ) 

900 else: 

901 parentStorageClass = storage_class_mapping[compositeName] 

902 else: 

903 parentStorageClass = None 

904 datasetType = c.makeDatasetType( 

905 registry.dimensions, parentStorageClass=parentStorageClass 

906 ) 

907 registryDatasetType = datasetType 

908 else: 

909 datasetType = c.makeDatasetType( 

910 registry.dimensions, parentStorageClass=registryDatasetType.parentStorageClass 

911 ) 

912 

913 if registryDatasetType and datasetType != registryDatasetType: 

914 # The dataset types differ but first check to see if 

915 # they are compatible before raising. 

916 if is_input: 

917 # This DatasetType must be compatible on get. 

918 is_compatible = datasetType.is_compatible_with(registryDatasetType) 

919 else: 

920 # Has to be able to be converted to the expected type

921 # on put.

922 is_compatible = registryDatasetType.is_compatible_with(datasetType) 

923 if is_compatible: 

924 # For inputs we want the pipeline to use the 

925 # pipeline definition, for outputs it should use 

926 # the registry definition. 

927 if not is_input: 

928 datasetType = registryDatasetType 

929 _LOG.debug( 

930 "Dataset types differ (task %s != registry %s) but are compatible" 

931 " for %s in %s.", 

932 datasetType, 

933 registryDatasetType, 

934 "input" if is_input else "output", 

935 taskDef.label, 

936 ) 

937 else: 

938 try: 

939 # Explicitly check for storage class just to 

940 # make more specific message. 

941 _ = datasetType.storageClass 

942 except KeyError: 

943 raise ValueError( 

944 "Storage class does not exist for supplied dataset type " 

945 f"{datasetType} for {taskDef.label}." 

946 ) from None 

947 raise ValueError( 

948 f"Supplied dataset type ({datasetType}) inconsistent with " 

949 f"registry definition ({registryDatasetType}) " 

950 f"for {taskDef.label}." 

951 ) 

952 datasetTypes.add(datasetType) 

953 if freeze: 

954 datasetTypes.freeze() 

955 return datasetTypes 

956 

957 # optionally add initOutput dataset for config 

958 initOutputs = makeDatasetTypesSet("initOutputs", is_input=False, freeze=False) 

959 if include_configs: 

960 initOutputs.add( 

961 DatasetType( 

962 taskDef.configDatasetName, 

963 registry.dimensions.empty, 

964 storageClass="Config", 

965 ) 

966 ) 

967 initOutputs.freeze() 

968 

969 # optionally add output dataset for metadata 

970 outputs = makeDatasetTypesSet("outputs", is_input=False, freeze=False) 

971 if taskDef.metadataDatasetName is not None: 

972 # Metadata is supposed to be of the TaskMetadata type, its 

973 # dimensions correspond to a task quantum. 

974 dimensions = registry.dimensions.extract(taskDef.connections.dimensions) 

975 

976 # Allow the storage class definition to be read from the existing 

977 # dataset type definition if present. 

978 try: 

979 current = registry.getDatasetType(taskDef.metadataDatasetName) 

980 except KeyError: 

981 # No previous definition so use the default. 

982 storageClass = "TaskMetadata" if _TASK_METADATA_TYPE is TaskMetadata else "PropertySet" 

983 else: 

984 storageClass = current.storageClass.name 

985 

986 outputs.update({DatasetType(taskDef.metadataDatasetName, dimensions, storageClass)}) 

987 if taskDef.logOutputDatasetName is not None: 

988 # Log output dimensions correspond to a task quantum. 

989 dimensions = registry.dimensions.extract(taskDef.connections.dimensions) 

990 outputs.update({DatasetType(taskDef.logOutputDatasetName, dimensions, "ButlerLogRecords")}) 

991 

992 outputs.freeze() 

993 

994 return cls( 

995 initInputs=makeDatasetTypesSet("initInputs", is_input=True), 

996 initOutputs=initOutputs, 

997 inputs=makeDatasetTypesSet("inputs", is_input=True), 

998 prerequisites=makeDatasetTypesSet("prerequisiteInputs", is_input=True), 

999 outputs=outputs, 

1000 ) 

1001 

1002 

1003@dataclass(frozen=True) 

1004class PipelineDatasetTypes: 

1005 """An immutable struct that classifies the dataset types used in a 

1006 `Pipeline`. 

1007 """ 

1008 

1009 packagesDatasetName: ClassVar[str] = "packages" 

1010 """Name of a dataset type used to save package versions. 

1011 """ 

1012 

1013 initInputs: NamedValueSet[DatasetType] 

1014 """Dataset types that are needed as inputs in order to construct the Tasks 

1015 in this Pipeline. 

1016 

1017 This does not include dataset types that are produced when constructing 

1018 other Tasks in the Pipeline (these are classified as `initIntermediates`). 

1019 """ 

1020 

1021 initOutputs: NamedValueSet[DatasetType] 

1022 """Dataset types that may be written after constructing the Tasks in this 

1023 Pipeline. 

1024 

1025 This does not include dataset types that are also used as inputs when 

1026 constructing other Tasks in the Pipeline (these are classified as 

1027 `initIntermediates`). 

1028 """ 

1029 

1030 initIntermediates: NamedValueSet[DatasetType] 

1031 """Dataset types that are both used when constructing one or more Tasks 

1032 in the Pipeline and produced as a side-effect of constructing another 

1033 Task in the Pipeline. 

1034 """ 

1035 

1036 inputs: NamedValueSet[DatasetType] 

1037 """Dataset types that are regular inputs for the full pipeline. 

1038 

1039 If an input dataset needed for a Quantum cannot be found in the input 

1040 collection(s), that Quantum (and all dependent Quanta) will not be 

1041 produced. 

1042 """ 

1043 

1044 prerequisites: NamedValueSet[DatasetType] 

1045 """Dataset types that are prerequisite inputs for the full Pipeline. 

1046 

1047 Prerequisite inputs must exist in the input collection(s) before the 

1048 pipeline is run, but do not constrain the graph - if a prerequisite is 

1049 missing for a Quantum, `PrerequisiteMissingError` is raised. 

1050 

1051 Prerequisite inputs are not resolved until the second stage of 

1052 QuantumGraph generation. 

1053 """ 

1054 

1055 intermediates: NamedValueSet[DatasetType] 

1056 """Dataset types that are output by one Task in the Pipeline and consumed 

1057 as inputs by one or more other Tasks in the Pipeline. 

1058 """ 

1059 

1060 outputs: NamedValueSet[DatasetType] 

1061 """Dataset types that are output by a Task in the Pipeline and not consumed 

1062 by any other Task in the Pipeline. 

1063 """ 

1064 

1065 byTask: Mapping[str, TaskDatasetTypes] 

1066 """Per-Task dataset types, keyed by label in the `Pipeline`. 

1067 

1068 This is guaranteed to be zip-iterable with the `Pipeline` itself (assuming 

1069 neither has been modified since the dataset types were extracted, of 

1070 course). 

1071 """ 

1072 

1073 @classmethod 

1074 def fromPipeline( 

1075 cls, 

1076 pipeline: Union[Pipeline, Iterable[TaskDef]], 

1077 *, 

1078 registry: Registry, 

1079 include_configs: bool = True, 

1080 include_packages: bool = True, 

1081 ) -> PipelineDatasetTypes: 

1082 """Extract and classify the dataset types from all tasks in a 

1083 `Pipeline`. 

1084 

1085 Parameters 

1086 ---------- 

1087 pipeline: `Pipeline` or `Iterable` [ `TaskDef` ] 

1088 A collection of tasks that can be run together. 

1089 registry: `Registry` 

1090 Registry used to construct normalized `DatasetType` objects and 

1091 retrieve those that are incomplete. 

1092 include_configs : `bool`, optional 

1093 If `True` (default) include config dataset types as 

1094 ``initOutputs``. 

1095 include_packages : `bool`, optional 

1096 If `True` (default) include the dataset type for software package 

1097 versions in ``initOutputs``. 

1098 

1099 Returns 

1100 ------- 

1101 types: `PipelineDatasetTypes` 

1102 The dataset types used by this `Pipeline`. 

1103 

1104 Raises 

1105 ------ 

1106 ValueError 

1107 Raised if Tasks are inconsistent about which datasets are marked 

1108 prerequisite. This indicates that the Tasks cannot be run as part 

1109 of the same `Pipeline`. 
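
Examples
--------
A sketch of typical use; the repository and pipeline file paths are
hypothetical.

>>> from lsst.daf.butler import Butler
>>> from lsst.pipe.base.pipeline import Pipeline, PipelineDatasetTypes
>>> butler = Butler("/path/to/repo")  # hypothetical data repository
>>> pipeline = Pipeline.from_uri("/path/to/pipeline.yaml")  # hypothetical file
>>> dataset_types = PipelineDatasetTypes.fromPipeline(pipeline, registry=butler.registry)
>>> intermediate_names = {dsType.name for dsType in dataset_types.intermediates}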

1110 """ 

1111 allInputs = NamedValueSet[DatasetType]() 

1112 allOutputs = NamedValueSet[DatasetType]() 

1113 allInitInputs = NamedValueSet[DatasetType]() 

1114 allInitOutputs = NamedValueSet[DatasetType]() 

1115 prerequisites = NamedValueSet[DatasetType]() 

1116 byTask = dict() 

1117 if include_packages: 

1118 allInitOutputs.add( 

1119 DatasetType( 

1120 cls.packagesDatasetName, 

1121 registry.dimensions.empty, 

1122 storageClass="Packages", 

1123 ) 

1124 ) 

1125 # create a list of TaskDefs in case the input is a generator 

1126 pipeline = list(pipeline) 

1127 

1128 # collect all the output dataset types 

1129 typeStorageclassMap: Dict[str, str] = {} 

1130 for taskDef in pipeline: 

1131 for outConnection in iterConnections(taskDef.connections, "outputs"): 

1132 typeStorageclassMap[outConnection.name] = outConnection.storageClass 

1133 

1134 for taskDef in pipeline: 

1135 thisTask = TaskDatasetTypes.fromTaskDef( 

1136 taskDef, 

1137 registry=registry, 

1138 include_configs=include_configs, 

1139 storage_class_mapping=typeStorageclassMap, 

1140 ) 

1141 allInitInputs.update(thisTask.initInputs) 

1142 allInitOutputs.update(thisTask.initOutputs) 

1143 allInputs.update(thisTask.inputs) 

1144 prerequisites.update(thisTask.prerequisites) 

1145 allOutputs.update(thisTask.outputs) 

1146 byTask[taskDef.label] = thisTask 

1147 if not prerequisites.isdisjoint(allInputs): 

1148 raise ValueError( 

1149 "{} marked as both prerequisites and regular inputs".format( 

1150 {dt.name for dt in allInputs & prerequisites} 

1151 ) 

1152 ) 

1153 if not prerequisites.isdisjoint(allOutputs): 

1154 raise ValueError( 

1155 "{} marked as both prerequisites and outputs".format( 

1156 {dt.name for dt in allOutputs & prerequisites} 

1157 ) 

1158 ) 

1159 # Make sure that components which are marked as inputs get treated as 

1160 # intermediates if there is an output which produces the composite 

1161 # containing the component 

1162 intermediateComponents = NamedValueSet[DatasetType]() 

1163 intermediateComposites = NamedValueSet[DatasetType]() 

1164 outputNameMapping = {dsType.name: dsType for dsType in allOutputs} 

1165 for dsType in allInputs: 

1166 # get the name of a possible component 

1167 name, component = dsType.nameAndComponent() 

1168 # if there is a component name, that means this is a component 

1169 # DatasetType, if there is an output which produces the parent of 

1170 # this component, treat this input as an intermediate 

1171 if component is not None: 

1172 # This needs to be in this if block, because someone might have 

1173 # a composite that is a pure input from existing data 

1174 if name in outputNameMapping: 

1175 intermediateComponents.add(dsType) 

1176 intermediateComposites.add(outputNameMapping[name]) 

1177 

1178 def checkConsistency(a: NamedValueSet, b: NamedValueSet) -> None: 

1179 common = a.names & b.names 

1180 for name in common: 

1181 # Any compatibility is allowed. This function does not know 

1182 # if a dataset type is to be used for input or output. 

1183 if not (a[name].is_compatible_with(b[name]) or b[name].is_compatible_with(a[name])): 

1184 raise ValueError(f"Conflicting definitions for dataset type: {a[name]} != {b[name]}.") 

1185 

1186 checkConsistency(allInitInputs, allInitOutputs) 

1187 checkConsistency(allInputs, allOutputs) 

1188 checkConsistency(allInputs, intermediateComposites) 

1189 checkConsistency(allOutputs, intermediateComposites) 

1190 

1191 def frozen(s: AbstractSet[DatasetType]) -> NamedValueSet[DatasetType]: 

1192 assert isinstance(s, NamedValueSet) 

1193 s.freeze() 

1194 return s 

1195 

1196 return cls( 

1197 initInputs=frozen(allInitInputs - allInitOutputs), 

1198 initIntermediates=frozen(allInitInputs & allInitOutputs), 

1199 initOutputs=frozen(allInitOutputs - allInitInputs), 

1200 inputs=frozen(allInputs - allOutputs - intermediateComponents), 

1201 # If there are storage class differences in inputs and outputs 

1202 # the intermediates have to choose priority. Here choose that 

1203 inputs to tasks must match the requested storage class by

1204 # applying the inputs over the top of the outputs. 

1205 intermediates=frozen(allOutputs & allInputs | intermediateComponents), 

1206 outputs=frozen(allOutputs - allInputs - intermediateComposites), 

1207 prerequisites=frozen(prerequisites), 

1208 byTask=MappingProxyType(byTask), # MappingProxyType -> frozen view of dict for immutability 

1209 ) 

1210 

1211 @classmethod 

1212 def initOutputNames( 

1213 cls, 

1214 pipeline: Union[Pipeline, Iterable[TaskDef]], 

1215 *, 

1216 include_configs: bool = True, 

1217 include_packages: bool = True, 

1218 ) -> Iterator[str]: 

1219 """Return the names of dataset types ot task initOutputs, Configs, 

1220 and package versions for a pipeline. 

1221 

1222 Parameters 

1223 ---------- 

1224 pipeline: `Pipeline` or `Iterable` [ `TaskDef` ] 

1225 A `Pipeline` instance or collection of `TaskDef` instances. 

1226 include_configs : `bool`, optional 

1227 If `True` (default) include config dataset types. 

1228 include_packages : `bool`, optional 

1229 If `True` (default) include the dataset type for package versions. 

1230 

1231 Yields 

1232 ------ 

1233 datasetTypeName : `str` 

1234 Name of the dataset type. 
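
Examples
--------
A small illustration; the pipeline file is hypothetical.

>>> from lsst.pipe.base.pipeline import Pipeline, PipelineDatasetTypes
>>> pipeline = Pipeline.from_uri("/path/to/pipeline.yaml")  # hypothetical file
>>> names = list(PipelineDatasetTypes.initOutputNames(pipeline))
>>> PipelineDatasetTypes.packagesDatasetName in names
True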

1235 """ 

1236 if include_packages: 

1237 # Package versions dataset type 

1238 yield cls.packagesDatasetName 

1239 

1240 if isinstance(pipeline, Pipeline): 

1241 pipeline = pipeline.toExpandedPipeline() 

1242 

1243 for taskDef in pipeline: 

1244 

1245 # all task InitOutputs 

1246 for name in taskDef.connections.initOutputs: 

1247 attribute = getattr(taskDef.connections, name) 

1248 yield attribute.name 

1249 

1250 # config dataset name 

1251 if include_configs: 

1252 yield taskDef.configDatasetName