
# This file is part of pipe_base.
#
# Developed for the LSST Data Management System.
# This product includes software developed by the LSST Project
# (http://www.lsst.org).
# See the COPYRIGHT file at the top-level directory of this distribution
# for details of code ownership.
#
# This software is dual licensed under the GNU General Public License and also
# under a 3-clause BSD license. Recipients may choose which of these licenses
# to use; please see the files gpl-3.0.txt and/or bsd_license.txt,
# respectively. If you choose the GPL option then the following text applies
# (but note that there is still no warranty even if you opt for BSD instead):
#
# This program is free software: you can redistribute it and/or modify
# it under the terms of the GNU General Public License as published by
# the Free Software Foundation, either version 3 of the License, or
# (at your option) any later version.
#
# This program is distributed in the hope that it will be useful,
# but WITHOUT ANY WARRANTY; without even the implied warranty of
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
# GNU General Public License for more details.
#
# You should have received a copy of the GNU General Public License
# along with this program. If not, see <http://www.gnu.org/licenses/>.
from __future__ import annotations

__all__ = (
    "ConfigIR",
    "ContractError",
    "ContractIR",
    "ImportIR",
    "LabeledSubset",
    "ParametersIR",
    "PipelineIR",
    "TaskIR",
)

import copy
import enum
import os
import re
import warnings
from collections import Counter
from collections.abc import Generator, Hashable, Iterable, MutableMapping
from dataclasses import dataclass, field
from typing import Any, Literal

import yaml
from lsst.resources import ResourcePath, ResourcePathExpression
from lsst.utils.introspection import find_outside_stacklevel


class PipelineSubsetCtrl(enum.Enum):
    """An enumeration of the ways a pipeline subsetting operation will handle
    labeled subsets whose task labels are missing.
    """

    DROP = enum.auto()
    """Drop any subsets that contain labels which are no longer in the set of
    task labels when subsetting an entire pipeline.
    """
    EDIT = enum.auto()
    """Edit any subsets that contain labels which are no longer in the set of
    task labels to remove the missing label, but keep the subset itself when
    subsetting a pipeline.
    """


class _Tags(enum.Enum):
    KeepInstrument = enum.auto()


class PipelineYamlLoader(yaml.SafeLoader):
    """Specialized version of yaml's SafeLoader.

    It checks for, and raises an exception on, multiple instances of the same
    key inside a pipeline file at a given scope.
    """

    def construct_mapping(self, node: yaml.MappingNode, deep: bool = False) -> dict[Hashable, Any]:
        # Do the call to super first so that it can do all the other forms of
        # checking on this node. Checking the uniqueness of keys first would
        # save the work that super does in the case of a failure, but the
        # node might be the incorrect node due to a parsing error, and the
        # resulting exception would be difficult to understand.
        mapping = super().construct_mapping(node, deep)
        # Check if there are any duplicate keys.
        all_keys = Counter(key_node.value for key_node, _ in node.value)
        duplicates = {k for k, i in all_keys.items() if i != 1}
        if duplicates:
            raise KeyError(
                f"Pipeline files must not have duplicated keys, {duplicates} appeared multiple times"
            )
        return mapping
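
# Example (illustrative sketch, not part of the original module): a document
# that repeats a key at the same scope is rejected by PipelineYamlLoader,
# whereas plain yaml.SafeLoader would silently keep the last value. The task
# label below is made up for the example.
#
#     doc = """
#     tasks:
#       taskA: mod.TaskA
#       taskA: mod.TaskB
#     """
#     yaml.load(doc, Loader=PipelineYamlLoader)  # raises KeyError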

class MultilineStringDumper(yaml.Dumper):
    """Custom YAML dumper that makes multi-line strings use the '|'
    continuation style instead of unreadable newlines and tons of quotes.

    The basic approach is taken from
    https://stackoverflow.com/questions/8640959/how-can-i-control-what-scalar-form-pyyaml-uses-for-my-data,
    but is written as a Dumper subclass to make its effects non-global (unlike
    `yaml.add_representer`).
    """

    def represent_scalar(self, tag: str, value: Any, style: str | None = None) -> yaml.ScalarNode:
        if style is None and tag == "tag:yaml.org,2002:str" and len(value.splitlines()) > 1:
            style = "|"
        return super().represent_scalar(tag, value, style)
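
# Example (illustrative sketch of the assumed yaml.dump behavior with this
# Dumper subclass): a multi-line string is emitted in block style.
#
#     yaml.dump({"description": "line one\nline two\n"}, Dumper=MultilineStringDumper)
#
# would produce:
#
#     description: |
#       line one
#       line two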

class ContractError(Exception):
    """An exception that is raised when a pipeline contract is not
    satisfied.
    """

    pass

@dataclass
class ContractIR:
    """Intermediate representation of configuration contracts read from a
    pipeline yaml file.
    """

    contract: str
    """A string of python code representing one or more conditions on configs
    in a pipeline. This code-as-string should evaluate to `True` if the
    configs are fine, and `False` otherwise.
    """
    msg: str | None = None
    """An optional message to be shown to the user if a contract fails.
    """

    def to_primitives(self) -> dict[str, str]:
        """Convert to a representation used in yaml serialization."""
        accumulate = {"contract": self.contract}
        if self.msg is not None:
            accumulate["msg"] = self.msg
        return accumulate

    def __eq__(self, other: object) -> bool:
        if not isinstance(other, ContractIR):
            return False
        return self.contract == other.contract and self.msg == other.msg

@dataclass
class LabeledSubset:
    """Intermediate representation of a named subset of task labels read from
    a pipeline yaml file.
    """

    label: str
    """The label used to identify the subset of task labels.
    """
    subset: set[str]
    """A set of task labels contained in this subset.
    """
    description: str | None
    """A description of what this subset of tasks is intended to do.
    """

    @staticmethod
    def from_primitives(label: str, value: list[str] | dict) -> LabeledSubset:
        """Generate `LabeledSubset` objects given a properly formatted object
        that has been created by a yaml loader.

        Parameters
        ----------
        label : `str`
            The label that will be used to identify this labeled subset.
        value : `list` of `str` or `dict`
            Object returned from loading a labeled subset section from a yaml
            document.

        Returns
        -------
        labeledSubset : `LabeledSubset`
            A `LabeledSubset` object built from the inputs.

        Raises
        ------
        ValueError
            Raised if the value input is not properly formatted for parsing.
        """
        if isinstance(value, MutableMapping):
            subset = value.pop("subset", None)
            if subset is None:
                raise ValueError(
                    "If a labeled subset is specified as a mapping, it must contain the key 'subset'"
                )
            description = value.pop("description", None)
        elif isinstance(value, Iterable):
            subset = value
            description = None
        else:
            raise ValueError(
                f"There was a problem parsing the labeled subset {label}; make sure the "
                "definition is either a valid yaml list, or a mapping with keys "
                "(subset, description) where subset points to a yaml list, and description is "
                "associated with a string"
            )
        return LabeledSubset(label, set(subset), description)

    def to_primitives(self) -> dict[str, list[str] | str]:
        """Convert to a representation used in yaml serialization."""
        accumulate: dict[str, list[str] | str] = {"subset": list(self.subset)}
        if self.description is not None:
            accumulate["description"] = self.description
        return accumulate
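
# Example (illustrative sketch, hypothetical labels): from_primitives accepts
# either a bare yaml list or a mapping with ``subset`` and an optional
# ``description``.
#
#     LabeledSubset.from_primitives("calibrate", ["isr", "characterizeImage"])
#     LabeledSubset.from_primitives(
#         "calibrate",
#         {"subset": ["isr", "characterizeImage"], "description": "Single-frame steps"},
#     )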

@dataclass
class ParametersIR:
    """Intermediate representation of parameters that are global to a pipeline.

    Attributes
    ----------
    mapping : `dict` [`str`, `str`]
        A mutable mapping of identifiers as keys, and shared configuration
        as values.

    Notes
    -----
    These parameters are specified under a top level key named ``parameters``
    and are declared as a yaml mapping. These entries can then be used inside
    task configuration blocks to specify configuration values. They may not be
    used in the special ``file`` or ``python`` blocks.

    Examples
    --------
    .. code-block:: yaml

        parameters:
          shared_value: 14
        tasks:
          taskA:
            class: modA
            config:
              field1: parameters.shared_value
          taskB:
            class: modB
            config:
              field2: parameters.shared_value
    """

    mapping: MutableMapping[str, Any]
    """A mutable mapping of identifiers as keys, and shared configuration
    as values.
    """

    def update(self, other: ParametersIR | None) -> None:
        if other is not None:
            self.mapping.update(other.mapping)

    def to_primitives(self) -> MutableMapping[str, str]:
        """Convert to a representation used in yaml serialization."""
        return self.mapping

    def __contains__(self, value: str) -> bool:
        return value in self.mapping

    def __getitem__(self, item: str) -> Any:
        return self.mapping[item]

    def __bool__(self) -> bool:
        return bool(self.mapping)

@dataclass
class ConfigIR:
    """Intermediate representation of configurations read from a pipeline yaml
    file.
    """

    python: str | None = None
    """A string of python code that is used to modify a configuration. This
    can also be `None` if there are no modifications to do.
    """
    dataId: dict | None = None
    """A dataId that is used to constrain these config overrides to only
    quanta with matching dataIds. This field can be `None` if there is no
    constraint. This is currently an unimplemented feature, and is placed here
    for future use.
    """
    file: list[str] = field(default_factory=list)
    """A list of paths that point to files containing config overrides to be
    applied. This value may be an empty list if there are no overrides to
    apply.
    """
    rest: dict = field(default_factory=dict)
    """This is a dictionary of key value pairs, where the keys are strings
    corresponding to qualified fields on a config to override, and the values
    are strings representing the values to apply.
    """

    def to_primitives(self) -> dict[str, str | dict | list[str]]:
        """Convert to a representation used in yaml serialization."""
        accumulate = {}
        for name in ("python", "dataId", "file"):
            # If this attribute is truthy, add it to the accumulation
            # dictionary.
            if getattr(self, name):
                accumulate[name] = getattr(self, name)
        # Add the dictionary containing the rest of the config keys to the
        # accumulated dictionary.
        accumulate.update(self.rest)
        return accumulate

    def formatted(self, parameters: ParametersIR) -> ConfigIR:
        """Return a new ConfigIR object that is formatted according to the
        specified parameters.

        Parameters
        ----------
        parameters : `ParametersIR`
            Object that contains variable mappings used in substitution.

        Returns
        -------
        config : `ConfigIR`
            A new ConfigIR object formatted with the input parameters.
        """
        new_config = copy.deepcopy(self)
        for key, value in new_config.rest.items():
            if not isinstance(value, str):
                continue
            match = re.match("parameters[.](.*)", value)
            if match and match.group(1) in parameters:
                new_config.rest[key] = parameters[match.group(1)]
            if match and match.group(1) not in parameters:
                warnings.warn(
                    f"config {key} contains value {match.group(0)} which is formatted like a "
                    "Pipeline parameter but was not found within the Pipeline; if this was not "
                    "intentional, check for a typo",
                    stacklevel=find_outside_stacklevel("lsst.pipe.base"),
                )
        return new_config

    def maybe_merge(self, other_config: "ConfigIR") -> Generator["ConfigIR", None, None]:
        """Merge another instance of a `ConfigIR` into this instance if
        possible. This function returns a generator that yields only self
        if the configs were merged, or self and other_config if they could
        not be merged.

        Parameters
        ----------
        other_config : `ConfigIR`
            An instance of `ConfigIR` to merge into this instance.

        Yields
        ------
        Generator : `ConfigIR`
            A generator yielding either just self if the configs could be
            merged, or self and other_config if they could not.
        """
        # Verify that the config blocks can be merged.
        if (
            self.dataId != other_config.dataId
            or self.python
            or other_config.python
            or self.file
            or other_config.file
        ):
            yield from (self, other_config)
            return

        # Build the set of shared keys, and verify no shared key has two
        # different values.
        key_union = self.rest.keys() & other_config.rest.keys()
        for key in key_union:
            if self.rest[key] != other_config.rest[key]:
                yield from (self, other_config)
                return
        self.rest.update(other_config.rest)

        # Combine the lists of override files to load.
        self_file_set = set(self.file)
        other_file_set = set(other_config.file)
        self.file = list(self_file_set.union(other_file_set))

        yield self

    def __eq__(self, other: object) -> bool:
        if not isinstance(other, ConfigIR):
            return False
        return all(
            getattr(self, attr) == getattr(other, attr) for attr in ("python", "dataId", "file", "rest")
        )
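
# Example (illustrative sketch, made-up names): formatted() substitutes any
# string value of the form ``parameters.<name>`` using a ParametersIR mapping.
#
#     params = ParametersIR({"shared_value": 14})
#     config = ConfigIR(rest={"field1": "parameters.shared_value"})
#     config.formatted(params).rest  # -> {"field1": 14}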

@dataclass
class TaskIR:
    """Intermediate representation of tasks read from a pipeline yaml file."""

    label: str
    """An identifier used to refer to a task.
    """
    klass: str
    """A string containing a fully qualified python class to be run in a
    pipeline.
    """
    config: list[ConfigIR] | None = None
    """A list of all config overrides associated with this task; may be
    `None` if there are no config overrides.
    """

    def to_primitives(self) -> dict[str, str | list[dict]]:
        """Convert to a representation used in yaml serialization."""
        accumulate: dict[str, str | list[dict]] = {"class": self.klass}
        if self.config:
            accumulate["config"] = [c.to_primitives() for c in self.config]
        return accumulate

    def add_or_update_config(self, other_config: ConfigIR) -> None:
        """Add a `ConfigIR` to this task if one is not present. Merges configs
        if there is a `ConfigIR` present and the dataId keys of both configs
        match, otherwise adds a new entry to the config list. The exception to
        the above is that if either the last config or other_config has a
        python block, then other_config is always added, as python blocks can
        modify configs in ways that cannot be predicted.

        Parameters
        ----------
        other_config : `ConfigIR`
            A `ConfigIR` instance to add or merge into the config attribute of
            this task.
        """
        if not self.config:
            self.config = [other_config]
            return
        self.config.extend(self.config.pop().maybe_merge(other_config))

    def __eq__(self, other: object) -> bool:
        if not isinstance(other, TaskIR):
            return False
        return all(getattr(self, attr) == getattr(other, attr) for attr in ("label", "klass", "config"))
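
# Example (illustrative sketch, hypothetical class path): two mergeable
# overrides collapse into a single ConfigIR on the task.
#
#     task = TaskIR("taskA", "mod.TaskA", [ConfigIR(rest={"field1": 1})])
#     task.add_or_update_config(ConfigIR(rest={"field2": 2}))
#     len(task.config)  # -> 1; its rest is {"field1": 1, "field2": 2}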

@dataclass
class ImportIR:
    """An intermediate representation of imported pipelines."""

    location: str
    """This is the location of the pipeline to inherit. The path should be
    specified as an absolute path. Environment variables may be used in the
    path and should be specified as a python string template, with the name of
    the environment variable inside braces.
    """
    include: list[str] | None = None
    """List of tasks that should be included when inheriting this pipeline.
    Either the include or exclude attributes may be specified, but not both.
    """
    exclude: list[str] | None = None
    """List of tasks that should be excluded when inheriting this pipeline.
    Either the include or exclude attributes may be specified, but not both.
    """
    importContracts: bool = True
    """Boolean attribute that dictates whether contracts should be inherited
    with the pipeline or not.
    """
    labeledSubsetModifyMode: PipelineSubsetCtrl = PipelineSubsetCtrl.DROP
    """Controls how labeled subsets are handled when an import ends up not
    including (either through an include or exclude list) a task label that
    is defined in the `Pipeline` being imported. DROP will remove any
    subsets which contain a missing label. EDIT will change any subsets to not
    include the missing label.
    """
    instrument: Literal[_Tags.KeepInstrument] | str | None = _Tags.KeepInstrument
    """Instrument to assign to the Pipeline at import. The default value of
    `_Tags.KeepInstrument` indicates that whatever instrument the pipeline is
    declared with will not be modified. Setting this value to `None` will drop
    any declared instrument prior to import.
    """

    def toPipelineIR(self) -> "PipelineIR":
        """Load in the Pipeline specified by this object, and turn it into a
        PipelineIR instance.

        Returns
        -------
        pipeline : `PipelineIR`
            A pipeline generated from the imported pipeline file.
        """
        if self.include and self.exclude:
            raise ValueError(
                "An include list and an exclude list cannot both be specified"
                " when declaring a pipeline import."
            )
        tmp_pipeline = PipelineIR.from_uri(os.path.expandvars(self.location))
        if self.instrument is not _Tags.KeepInstrument:
            tmp_pipeline.instrument = self.instrument

        included_labels = set()
        for label in tmp_pipeline.tasks:
            if (
                (self.include and label in self.include)
                or (self.exclude and label not in self.exclude)
                or (self.include is None and self.exclude is None)
            ):
                included_labels.add(label)

        # Handle labeled subsets being specified in the include or exclude
        # list, adding or removing labels.
        if self.include is not None:
            subsets_in_include = tmp_pipeline.labeled_subsets.keys() & self.include
            for label in subsets_in_include:
                included_labels.update(tmp_pipeline.labeled_subsets[label].subset)

        elif self.exclude is not None:
            subsets_in_exclude = tmp_pipeline.labeled_subsets.keys() & self.exclude
            for label in subsets_in_exclude:
                included_labels.difference_update(tmp_pipeline.labeled_subsets[label].subset)

        tmp_pipeline = tmp_pipeline.subset_from_labels(included_labels, self.labeledSubsetModifyMode)

        if not self.importContracts:
            tmp_pipeline.contracts = []

        return tmp_pipeline

    def __eq__(self, other: object) -> bool:
        if not isinstance(other, ImportIR):
            return False
        return all(
            getattr(self, attr) == getattr(other, attr)
            for attr in ("location", "include", "exclude", "importContracts")
        )
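
# Example (illustrative sketch): a yaml ``imports`` stanza that produces an
# ImportIR via PipelineIR._read_imports; the location and label below are
# made up for the example.
#
#     imports:
#       - location: ${PIPELINES_DIR}/base.yaml
#         exclude: [taskC]
#         importContracts: false
#         labeledSubsetModifyMode: EDIT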

class PipelineIR:
    """Intermediate representation of a pipeline definition.

    Parameters
    ----------
    loaded_yaml : `dict`
        A dictionary which matches the structure that would be produced by a
        yaml reader which parses a pipeline definition document.

    Raises
    ------
    ValueError
        Raised if:

        - a pipeline is declared without a description;
        - no tasks are declared in a pipeline, and no pipelines are to be
          inherited;
        - more than one instrument is specified;
        - more than one inherited pipeline shares a label.
    """

    def __init__(self, loaded_yaml: dict[str, Any]):
        # Check required fields are present.
        if "description" not in loaded_yaml:
            raise ValueError("A pipeline must be declared with a description")
        if "tasks" not in loaded_yaml and len({"imports", "inherits"} - loaded_yaml.keys()) == 2:
            raise ValueError("A pipeline must be declared with one or more tasks")

        # The steps below must happen in this call order.

        # Process pipeline description.
        self.description = loaded_yaml.pop("description")

        # Process tasks.
        self._read_tasks(loaded_yaml)

        # Process instrument keys.
        inst = loaded_yaml.pop("instrument", None)
        if isinstance(inst, list):
            raise ValueError("Only one top level instrument can be defined in a pipeline")
        self.instrument: str | None = inst

        # Process any contracts.
        self._read_contracts(loaded_yaml)

        # Process any defined parameters.
        self._read_parameters(loaded_yaml)

        # Process any named label subsets.
        self._read_labeled_subsets(loaded_yaml)

        # Process any inherited pipelines.
        self._read_imports(loaded_yaml)

        # Verify named subsets; must be done after inheriting.
        self._verify_labeled_subsets()

    def _read_contracts(self, loaded_yaml: dict[str, Any]) -> None:
        """Process the contracts portion of the loaded yaml document.

        Parameters
        ----------
        loaded_yaml : `dict`
            A dictionary which matches the structure that would be produced by
            a yaml reader which parses a pipeline definition document.
        """
        loaded_contracts = loaded_yaml.pop("contracts", [])
        if isinstance(loaded_contracts, str):
            loaded_contracts = [loaded_contracts]
        self.contracts: list[ContractIR] = []
        for contract in loaded_contracts:
            if isinstance(contract, dict):
                self.contracts.append(ContractIR(**contract))
            if isinstance(contract, str):
                self.contracts.append(ContractIR(contract=contract))

    def _read_parameters(self, loaded_yaml: dict[str, Any]) -> None:
        """Process the parameters portion of the loaded yaml document.

        Parameters
        ----------
        loaded_yaml : `dict`
            A dictionary which matches the structure that would be produced by
            a yaml reader which parses a pipeline definition document.
        """
        loaded_parameters = loaded_yaml.pop("parameters", {})
        if not isinstance(loaded_parameters, dict):
            raise ValueError("The parameters section must be a yaml mapping")
        self.parameters = ParametersIR(loaded_parameters)

    def _read_labeled_subsets(self, loaded_yaml: dict[str, Any]) -> None:
        """Process the subsets portion of the loaded yaml document.

        Parameters
        ----------
        loaded_yaml : `MutableMapping`
            A dictionary which matches the structure that would be produced
            by a yaml reader which parses a pipeline definition document.
        """
        loaded_subsets = loaded_yaml.pop("subsets", {})
        self.labeled_subsets: dict[str, LabeledSubset] = {}
        if not loaded_subsets and "subset" in loaded_yaml:
            raise ValueError("Top level key should be subsets and not subset, add an s")
        for key, value in loaded_subsets.items():
            self.labeled_subsets[key] = LabeledSubset.from_primitives(key, value)

    def _verify_labeled_subsets(self) -> None:
        """Verify that all the labels in each named subset exist within the
        pipeline.
        """
        # Verify that all labels defined in a labeled subset are in the
        # Pipeline.
        for labeled_subset in self.labeled_subsets.values():
            if not labeled_subset.subset.issubset(self.tasks.keys()):
                raise ValueError(
                    f"Labels {labeled_subset.subset - self.tasks.keys()} were not found in the "
                    "declared pipeline"
                )
        # Verify subset labels are not already task labels.
        label_intersection = self.labeled_subsets.keys() & self.tasks.keys()
        if label_intersection:
            raise ValueError(f"Labeled subsets can not use the same label as a task: {label_intersection}")

    def _read_imports(self, loaded_yaml: dict[str, Any]) -> None:
        """Process the imports portion of the loaded yaml document.

        Parameters
        ----------
        loaded_yaml : `dict`
            A dictionary which matches the structure that would be produced by
            a yaml reader which parses a pipeline definition document.
        """

        def process_args(argument: str | dict) -> dict:
            if isinstance(argument, str):
                return {"location": argument}
            elif isinstance(argument, dict):
                if "exclude" in argument and isinstance(argument["exclude"], str):
                    argument["exclude"] = [argument["exclude"]]
                if "include" in argument and isinstance(argument["include"], str):
                    argument["include"] = [argument["include"]]
                if "instrument" in argument and argument["instrument"] == "None":
                    argument["instrument"] = None
                if "labeledSubsetModifyMode" in argument:
                    match argument["labeledSubsetModifyMode"]:
                        case "DROP":
                            argument["labeledSubsetModifyMode"] = PipelineSubsetCtrl.DROP
                        case "EDIT":
                            argument["labeledSubsetModifyMode"] = PipelineSubsetCtrl.EDIT
                        case unknown:
                            raise ValueError(f"{unknown} is not a valid mode for labeledSubsetModifyMode")
            return argument

        if not {"inherits", "imports"} - loaded_yaml.keys():
            raise ValueError("Cannot define both inherits and imports sections, use imports")
        tmp_import = loaded_yaml.pop("inherits", None)
        if tmp_import is None:
            tmp_import = loaded_yaml.pop("imports", None)
        else:
            raise ValueError("The 'inherits' key is not supported. Please use the key 'imports' instead")
        if tmp_import is None:
            self.imports: list[ImportIR] = []
        elif isinstance(tmp_import, list):
            self.imports = [ImportIR(**process_args(args)) for args in tmp_import]
        else:
            self.imports = [ImportIR(**process_args(tmp_import))]

        self.merge_pipelines([fragment.toPipelineIR() for fragment in self.imports])

    def merge_pipelines(self, pipelines: Iterable[PipelineIR]) -> None:
        """Merge one or more other `PipelineIR` objects into this object.

        Parameters
        ----------
        pipelines : `~collections.abc.Iterable` of `PipelineIR` objects
            An `~collections.abc.Iterable` that contains one or more
            `PipelineIR` objects to merge into this object.

        Raises
        ------
        ValueError
            Raised if there is a conflict in instrument specifications.
            Raised if a task label appears in more than one of the input
            `PipelineIR` objects which are to be merged.
            Raised if a labeled subset appears in more than one of the input
            `PipelineIR` objects which are to be merged, or clashes with any
            subset existing in this object.
        """
        # Integrate any imported pipelines.
        accumulate_tasks: dict[str, TaskIR] = {}
        accumulate_labeled_subsets: dict[str, LabeledSubset] = {}
        accumulated_parameters = ParametersIR({})

        for tmp_IR in pipelines:
            if self.instrument is None:
                self.instrument = tmp_IR.instrument
            elif self.instrument != tmp_IR.instrument and tmp_IR.instrument is not None:
                msg = (
                    "Only one instrument can be declared in a pipeline or its imports. "
                    f"Top level pipeline defines {self.instrument} but pipeline to merge "
                    f"defines {tmp_IR.instrument}."
                )
                raise ValueError(msg)
            if duplicate_labels := accumulate_tasks.keys() & tmp_IR.tasks.keys():
                msg = (
                    "Task labels in the imported pipelines must be unique. "
                    f"These labels appear multiple times: {duplicate_labels}"
                )
                raise ValueError(msg)
            accumulate_tasks.update(tmp_IR.tasks)
            self.contracts.extend(tmp_IR.contracts)
            # Verify that tmp_IR's named subset labels are unique among the
            # existing labeled subsets and the existing task labels.
            overlapping_subsets = accumulate_labeled_subsets.keys() & tmp_IR.labeled_subsets.keys()
            task_subset_overlap = (
                accumulate_labeled_subsets.keys() | tmp_IR.labeled_subsets.keys()
            ) & accumulate_tasks.keys()
            if overlapping_subsets or task_subset_overlap:
                raise ValueError(
                    "Labeled subset names must be unique amongst imports in both labels and "
                    f"named Subsets. Duplicate: {overlapping_subsets | task_subset_overlap}"
                )
            accumulate_labeled_subsets.update(tmp_IR.labeled_subsets)
            accumulated_parameters.update(tmp_IR.parameters)

        # Verify that any accumulated labeled subsets don't clash with a
        # label from this pipeline.
        if accumulate_labeled_subsets.keys() & self.tasks.keys():
            raise ValueError(
                "Labeled subset names must be unique amongst imports in both labels and named Subsets"
            )
        # Merge in the named subsets for self so this document can override
        # any that have been declared.
        accumulate_labeled_subsets.update(self.labeled_subsets)
        self.labeled_subsets = accumulate_labeled_subsets

        # Merge the dict of label:TaskIR objects, preserving any configs in
        # the imported pipeline if the labels point to the same class.
        for label, task in self.tasks.items():
            if label not in accumulate_tasks:
                accumulate_tasks[label] = task
            elif accumulate_tasks[label].klass == task.klass:
                if task.config is not None:
                    for config in task.config:
                        accumulate_tasks[label].add_or_update_config(config)
            else:
                accumulate_tasks[label] = task
        self.tasks: dict[str, TaskIR] = accumulate_tasks
        accumulated_parameters.update(self.parameters)
        self.parameters = accumulated_parameters

    def _read_tasks(self, loaded_yaml: dict[str, Any]) -> None:
        """Process the tasks portion of the loaded yaml document.

        Parameters
        ----------
        loaded_yaml : `dict`
            A dictionary which matches the structure that would be produced by
            a yaml reader which parses a pipeline definition document.
        """
        self.tasks = {}
        tmp_tasks = loaded_yaml.pop("tasks", None)
        if tmp_tasks is None:
            tmp_tasks = {}

        if "parameters" in tmp_tasks:
            raise ValueError("parameters is a reserved word and cannot be used as a task label")

        for label, definition in tmp_tasks.items():
            if isinstance(definition, str):
                definition = {"class": definition}
            config = definition.get("config", None)
            if config is None:
                task_config_ir = None
            else:
                if isinstance(config, dict):
                    config = [config]
                task_config_ir = []
                for c in config:
                    file = c.pop("file", None)
                    if file is None:
                        file = []
                    elif not isinstance(file, list):
                        file = [file]
                    task_config_ir.append(
                        ConfigIR(
                            python=c.pop("python", None), dataId=c.pop("dataId", None), file=file, rest=c
                        )
                    )
            self.tasks[label] = TaskIR(label, definition["class"], task_config_ir)

    def _remove_contracts(self, label: str) -> None:
        """Remove any contracts that contain the given label.

        String comparison used in this way is not the most elegant and may
        have issues, but it is the only feasible way when users can specify
        contracts with generic strings.
        """
        new_contracts = []
        for contract in self.contracts:
            # Match a label that is not preceded by an ASCII identifier, or
            # is the start of a line, and is followed by a dot.
            if re.match(f".*([^A-Za-z0-9_]|^){label}[.]", contract.contract):
                continue
            new_contracts.append(contract)
        self.contracts = new_contracts

    def subset_from_labels(
        self, labelSpecifier: set[str], subsetCtrl: PipelineSubsetCtrl = PipelineSubsetCtrl.DROP
    ) -> PipelineIR:
        """Subset a pipelineIR to contain only labels specified in
        labelSpecifier.

        Parameters
        ----------
        labelSpecifier : `set` of `str`
            Set containing labels that describes how to subset a pipeline.
        subsetCtrl : `PipelineSubsetCtrl`
            Control object which decides how subsets with missing labels are
            handled. Setting to `PipelineSubsetCtrl.DROP` (the default) will
            cause any subsets that have labels which are not in the set of all
            task labels to be dropped. Setting to `PipelineSubsetCtrl.EDIT`
            will cause the subset to instead be edited to remove the
            nonexistent label.

        Returns
        -------
        pipeline : `PipelineIR`
            A new pipelineIR object that is a subset of the old pipelineIR.

        Raises
        ------
        ValueError
            Raised if there is an issue with specified labels.

        Notes
        -----
        This method attempts to prune any contracts that contain labels which
        are not in the declared subset of labels. This pruning is done using
        string based matching due to the nature of contracts and may prune
        more than it should.
        """
        pipeline = copy.deepcopy(self)

        # Update the label specifier to expand any named subsets.
        toRemove = set()
        toAdd = set()
        for label in labelSpecifier:
            if label in pipeline.labeled_subsets:
                toRemove.add(label)
                toAdd.update(pipeline.labeled_subsets[label].subset)
        labelSpecifier.difference_update(toRemove)
        labelSpecifier.update(toAdd)
        # Verify all the labels are in the pipeline.
        if not labelSpecifier.issubset(pipeline.tasks.keys() | pipeline.labeled_subsets):
            difference = labelSpecifier.difference(pipeline.tasks.keys())
            raise ValueError(
                "Not all supplied labels (specified or named subsets) are in the pipeline "
                f"definition, extra labels: {difference}"
            )
        # Copy needed so as to not modify while iterating.
        pipeline_labels = set(pipeline.tasks.keys())
        # Remove the labels from the pipelineIR, and any contracts that
        # contain those labels (see the docstring on _remove_contracts for
        # why this may cause issues).
        for label in pipeline_labels:
            if label not in labelSpecifier:
                pipeline.tasks.pop(label)
                pipeline._remove_contracts(label)

        # Create a copy of the object to iterate over.
        labeled_subsets = copy.copy(pipeline.labeled_subsets)
        # Remove any labeled subsets that no longer have a complete set.
        for label, labeled_subset in labeled_subsets.items():
            if extraTaskLabels := (labeled_subset.subset - pipeline.tasks.keys()):
                match subsetCtrl:
                    case PipelineSubsetCtrl.DROP:
                        pipeline.labeled_subsets.pop(label)
                    case PipelineSubsetCtrl.EDIT:
                        for extra in extraTaskLabels:
                            labeled_subset.subset.discard(extra)

        return pipeline
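
    # Example (illustrative sketch, hypothetical label and document):
    # subsetting with DROP removes any labeled subset that references a
    # pruned task, while EDIT keeps the subset and removes only the missing
    # label.
    #
    #     full = PipelineIR.from_string(some_pipeline_yaml)  # assumed document
    #     sub = full.subset_from_labels({"taskA"}, PipelineSubsetCtrl.EDIT)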

    @classmethod
    def from_string(cls, pipeline_string: str) -> PipelineIR:
        """Create a `PipelineIR` object from a string formatted like a
        pipeline document.

        Parameters
        ----------
        pipeline_string : `str`
            A string that is formatted like a pipeline document.
        """
        loaded_yaml = yaml.load(pipeline_string, Loader=PipelineYamlLoader)
        return cls(loaded_yaml)

    @classmethod
    def from_uri(cls, uri: ResourcePathExpression) -> PipelineIR:
        """Create a `PipelineIR` object from the document specified by the
        input uri.

        Parameters
        ----------
        uri : convertible to `~lsst.resources.ResourcePath`
            Location of document to use in creating a `PipelineIR` object.

        Returns
        -------
        pipelineIR : `PipelineIR`
            The loaded pipeline.
        """
        loaded_uri = ResourcePath(uri)
        with loaded_uri.open("r") as buffer:
            loaded_yaml = yaml.load(buffer, Loader=PipelineYamlLoader)
            return cls(loaded_yaml)

    def write_to_uri(self, uri: ResourcePathExpression) -> None:
        """Serialize this `PipelineIR` object into a yaml formatted string and
        write the output to a file at the specified uri.

        Parameters
        ----------
        uri : convertible to `~lsst.resources.ResourcePath`
            Location of document to write a `PipelineIR` object.
        """
        with ResourcePath(uri).open("w") as buffer:
            yaml.dump(self.to_primitives(), buffer, sort_keys=False, Dumper=MultilineStringDumper)

    def to_primitives(self) -> dict[str, Any]:
        """Convert to a representation used in yaml serialization.

        Returns
        -------
        primitives : `dict`
            Dictionary that maps directly to the serialized YAML form.
        """
        accumulate = {"description": self.description}
        if self.instrument is not None:
            accumulate["instrument"] = self.instrument
        if self.parameters:
            accumulate["parameters"] = self.parameters.to_primitives()
        accumulate["tasks"] = {m: t.to_primitives() for m, t in self.tasks.items()}
        if len(self.contracts) > 0:
            # Sort contracts in lexicographical order by the contract string,
            # in the absence of any other ordering principle.
            contracts_list = [c.to_primitives() for c in self.contracts]
            contracts_list.sort(key=lambda x: x["contract"])
            accumulate["contracts"] = contracts_list
        if self.labeled_subsets:
            accumulate["subsets"] = {k: v.to_primitives() for k, v in self.labeled_subsets.items()}
        return accumulate

    def __str__(self) -> str:
        """Instance formatting as how it would look in yaml representation."""
        return yaml.dump(self.to_primitives(), sort_keys=False, Dumper=MultilineStringDumper)

    def __repr__(self) -> str:
        """Instance formatting as how it would look in yaml representation."""
        return str(self)

    def __eq__(self, other: object) -> bool:
        if not isinstance(other, PipelineIR):
            return False
        # Special-case contracts because it is a list whose order is not
        # important.
        return (
            all(
                getattr(self, attr) == getattr(other, attr)
                for attr in ("tasks", "instrument", "labeled_subsets", "parameters")
            )
            and len(self.contracts) == len(other.contracts)
            and all(c in self.contracts for c in other.contracts)
        )
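

# Example (illustrative end-to-end sketch, hypothetical labels and class
# paths): parse a document, subset it, and inspect the yaml form.
#
#     doc = """
#     description: demo
#     tasks:
#       taskA: mod.TaskA
#       taskB: mod.TaskB
#     subsets:
#       onlyA: [taskA]
#     """
#     pipeline = PipelineIR.from_string(doc)
#     subset = pipeline.subset_from_labels({"onlyA"})
#     print(subset)  # yaml form containing only taskA and the onlyA subset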