# python/lsst/pipe/base/pipelineIR.py

# This file is part of pipe_base.
#
# Developed for the LSST Data Management System.
# This product includes software developed by the LSST Project
# (http://www.lsst.org).
# See the COPYRIGHT file at the top-level directory of this distribution
# for details of code ownership.
#
# This software is dual licensed under the GNU General Public License and also
# under a 3-clause BSD license. Recipients may choose which of these licenses
# to use; please see the files gpl-3.0.txt and/or bsd_license.txt,
# respectively. If you choose the GPL option then the following text applies
# (but note that there is still no warranty even if you opt for BSD instead):
#
# This program is free software: you can redistribute it and/or modify
# it under the terms of the GNU General Public License as published by
# the Free Software Foundation, either version 3 of the License, or
# (at your option) any later version.
#
# This program is distributed in the hope that it will be useful,
# but WITHOUT ANY WARRANTY; without even the implied warranty of
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
# GNU General Public License for more details.
#
# You should have received a copy of the GNU General Public License
# along with this program. If not, see <http://www.gnu.org/licenses/>.

from __future__ import annotations

__all__ = (
    "ConfigIR",
    "ContractError",
    "ContractIR",
    "ImportIR",
    "LabeledSubset",
    "ParametersIR",
    "PipelineIR",
    "TaskIR",
)

import copy
import enum
import os
import re
import warnings
from collections import Counter
from collections.abc import Generator, Hashable, Iterable, MutableMapping
from dataclasses import dataclass, field
from typing import Any, Literal

import yaml
from lsst.resources import ResourcePath, ResourcePathExpression
from lsst.utils.introspection import find_outside_stacklevel

class PipelineSubsetCtrl(enum.Enum):
    """An enumeration of the various ways a pipeline subsetting operation will
    handle labeled subsets when the task labels they contain are missing.
    """

    DROP = enum.auto()
    """Drop any subsets that contain labels which are no longer in the set of
    task labels when subsetting an entire pipeline.
    """
    EDIT = enum.auto()
    """Edit any subsets that contain labels which are no longer in the set of
    task labels to remove the missing label, but leave the subset when
    subsetting a pipeline.
    """


class _Tags(enum.Enum):
    KeepInstrument = enum.auto()

class PipelineYamlLoader(yaml.SafeLoader):
    """Specialized version of yaml's SafeLoader.

    It checks and raises an exception if it finds that there are multiple
    instances of the same key found inside a pipeline file at a given scope.
    """

    def construct_mapping(self, node: yaml.MappingNode, deep: bool = False) -> dict[Hashable, Any]:
        # Do the call to super first so that it can do all the other forms of
        # checking on this node. Checking the uniqueness of keys first would
        # save the work super does in the case of a failure, but if the node
        # were the wrong node due to a parsing error, the resulting exception
        # would be difficult to understand.
        mapping = super().construct_mapping(node, deep)
        # Check if there are any duplicate keys.
        all_keys = Counter(key_node.value for key_node, _ in node.value)
        duplicates = {k for k, i in all_keys.items() if i != 1}
        if duplicates:
            raise KeyError(
                f"Pipeline files must not have duplicated keys, {duplicates} appeared multiple times"
            )
        return mapping
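
# A minimal usage sketch: loading a document that repeats a key at the same
# scope raises a KeyError, whereas plain yaml.SafeLoader would silently keep
# the last value.
#
#     >>> import yaml
#     >>> yaml.load("a: 1\na: 2", Loader=PipelineYamlLoader)
#     Traceback (most recent call last):
#     ...
#     KeyError: "Pipeline files must not have duplicated keys, {'a'} appeared multiple times"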

class MultilineStringDumper(yaml.Dumper):
    """Custom YAML dumper that makes multi-line strings use the '|'
    continuation style instead of unreadable newlines and tons of quotes.

    The basic approach is taken from
    https://stackoverflow.com/questions/8640959/how-can-i-control-what-scalar-form-pyyaml-uses-for-my-data,
    but it is written as a Dumper subclass to make its effects non-global (vs
    `yaml.add_representer`).
    """

    def represent_scalar(self, tag: str, value: Any, style: str | None = None) -> yaml.ScalarNode:
        if style is None and tag == "tag:yaml.org,2002:str" and len(value.splitlines()) > 1:
            style = "|"
        return super().represent_scalar(tag, value, style)
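
# A minimal usage sketch: multi-line strings are serialized in block style,
# which keeps long pipeline descriptions readable in the output document.
#
#     >>> print(yaml.dump({"description": "line one\nline two\n"}, Dumper=MultilineStringDumper))
#     description: |
#       line one
#       line two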

class ContractError(Exception):
    """An exception that is raised when a pipeline contract is not satisfied."""

    pass


@dataclass
class ContractIR:
    """Intermediate representation of configuration contracts read from a
    pipeline yaml file.
    """

    contract: str
    """A string of python code representing one or more conditions on configs
    in a pipeline. This code-as-string should, once evaluated, be True if the
    configs are fine, and False otherwise.
    """
    msg: str | None = None
    """An optional message to be shown to the user if a contract fails.
    """

    def to_primitives(self) -> dict[str, str]:
        """Convert to a representation used in yaml serialization."""
        accumulate = {"contract": self.contract}
        if self.msg is not None:
            accumulate["msg"] = self.msg
        return accumulate

    def __eq__(self, other: object) -> bool:
        if not isinstance(other, ContractIR):
            return False
        return self.contract == other.contract and self.msg == other.msg
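
# A minimal usage sketch: a contract as it might be constructed after parsing,
# round-tripped back to yaml primitives (the condition shown is hypothetical).
#
#     >>> c = ContractIR(contract="taskA.field1 == taskB.field2", msg="fields must agree")
#     >>> c.to_primitives()
#     {'contract': 'taskA.field1 == taskB.field2', 'msg': 'fields must agree'}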

@dataclass
class LabeledSubset:
    """Intermediate representation of a named subset of task labels read from
    a pipeline yaml file.
    """

    label: str
    """The label used to identify the subset of task labels.
    """
    subset: set[str]
    """A set of task labels contained in this subset.
    """
    description: str | None
    """A description of what this subset of tasks is intended to do.
    """

    @staticmethod
    def from_primitives(label: str, value: list[str] | dict) -> LabeledSubset:
        """Generate `LabeledSubset` objects given a properly formatted object
        that has been created by a yaml loader.

        Parameters
        ----------
        label : `str`
            The label that will be used to identify this labeled subset.
        value : `list` of `str` or `dict`
            Object returned from loading a labeled subset section from a yaml
            document.

        Returns
        -------
        labeledSubset : `LabeledSubset`
            A `LabeledSubset` object built from the inputs.

        Raises
        ------
        ValueError
            Raised if the value input is not properly formatted for parsing.
        """
        if isinstance(value, MutableMapping):
            subset = value.pop("subset", None)
            if subset is None:
                raise ValueError(
                    "If a labeled subset is specified as a mapping, it must contain the key 'subset'"
                )
            description = value.pop("description", None)
        elif isinstance(value, Iterable):
            subset = value
            description = None
        else:
            raise ValueError(
                f"There was a problem parsing the labeled subset {label}; make sure the "
                "definition is either a valid yaml list, or a mapping with keys "
                "(subset, description) where subset points to a yaml list, and description is "
                "associated with a string"
            )
        return LabeledSubset(label, set(subset), description)

    def to_primitives(self) -> dict[str, list[str] | str]:
        """Convert to a representation used in yaml serialization."""
        accumulate: dict[str, list[str] | str] = {"subset": list(self.subset)}
        if self.description is not None:
            accumulate["description"] = self.description
        return accumulate
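
# A minimal usage sketch of the two accepted yaml forms: a bare list of labels,
# or a mapping with a 'subset' key and an optional 'description' key (the
# labels shown are hypothetical).
#
#     >>> LabeledSubset.from_primitives("step1", ["isr", "calibrate"]).subset == {"isr", "calibrate"}
#     True
#     >>> LabeledSubset.from_primitives(
#     ...     "step1", {"subset": ["isr", "calibrate"], "description": "early steps"}
#     ... ).description
#     'early steps'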

@dataclass
class ParametersIR:
    """Intermediate representation of parameters that are global to a pipeline.

    Parameters
    ----------
    mapping : `dict` [`str`, `str`]
        A mutable mapping of identifiers as keys, and shared configuration
        as values.

    Notes
    -----
    These parameters are specified under a top level key named ``parameters``
    and are declared as a yaml mapping. These entries can then be used inside
    task configuration blocks to specify configuration values. They may not be
    used in the special ``file`` or ``python`` blocks.

    Examples
    --------
    .. code-block:: yaml

        parameters:
          shared_value: 14
        tasks:
          taskA:
            class: modA
            config:
              field1: parameters.shared_value
          taskB:
            class: modB
            config:
              field2: parameters.shared_value
    """

    mapping: MutableMapping[str, Any]
    """A mutable mapping of identifiers as keys, and shared configuration
    as values.
    """

    def update(self, other: ParametersIR | None) -> None:
        if other is not None:
            self.mapping.update(other.mapping)

    def to_primitives(self) -> MutableMapping[str, str]:
        """Convert to a representation used in yaml serialization."""
        return self.mapping

    def __contains__(self, value: str) -> bool:
        return value in self.mapping

    def __getitem__(self, item: str) -> Any:
        return self.mapping[item]

    def __bool__(self) -> bool:
        return bool(self.mapping)
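
# A minimal usage sketch: a ParametersIR behaves like a small mapping for the
# lookups performed during config formatting.
#
#     >>> params = ParametersIR({"shared_value": 14})
#     >>> "shared_value" in params, params["shared_value"], bool(ParametersIR({}))
#     (True, 14, False)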

@dataclass
class ConfigIR:
    """Intermediate representation of configurations read from a pipeline yaml
    file.
    """

    python: str | None = None
    """A string of python code that is used to modify a configuration. This can
    also be None if there are no modifications to do.
    """
    dataId: dict | None = None
    """A dataId that is used to constrain these config overrides to only quanta
    with matching dataIds. This field can be None if there is no constraint.
    This is currently an unimplemented feature, and is placed here for future
    use.
    """
    file: list[str] = field(default_factory=list)
    """A list of paths, each pointing to a file containing config overrides to
    be applied. This value may be an empty list if there are no overrides to
    apply.
    """
    rest: dict = field(default_factory=dict)
    """This is a dictionary of key value pairs, where the keys are strings
    corresponding to qualified fields on a config to override, and the values
    are strings representing the values to apply.
    """

    def to_primitives(self) -> dict[str, str | dict | list[str]]:
        """Convert to a representation used in yaml serialization."""
        accumulate = {}
        for name in ("python", "dataId", "file"):
            # If this attribute is truthy, add it to the accumulation
            # dictionary.
            if getattr(self, name):
                accumulate[name] = getattr(self, name)
        # Add the dictionary containing the rest of the config keys to the
        # accumulated dictionary.
        accumulate.update(self.rest)
        return accumulate

    def formatted(self, parameters: ParametersIR) -> ConfigIR:
        """Return a new ConfigIR object that is formatted according to the
        specified parameters.

        Parameters
        ----------
        parameters : `ParametersIR`
            Object that contains variable mappings used in substitution.

        Returns
        -------
        config : `ConfigIR`
            A new ConfigIR object formatted with the input parameters.
        """
        new_config = copy.deepcopy(self)
        for key, value in new_config.rest.items():
            if not isinstance(value, str):
                continue
            match = re.match("parameters[.](.*)", value)
            if match and match.group(1) in parameters:
                new_config.rest[key] = parameters[match.group(1)]
            if match and match.group(1) not in parameters:
                warnings.warn(
                    f"config {key} contains value {match.group(0)} which is formatted like a "
                    "Pipeline parameter but was not found within the Pipeline; if this was not "
                    "intentional, check for a typo",
                    stacklevel=find_outside_stacklevel("lsst.pipe.base"),
                )
        return new_config
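
    # A minimal usage sketch: string values of the form "parameters.<name>" in
    # ``rest`` are substituted from the supplied ParametersIR.
    #
    #     >>> config = ConfigIR(rest={"field1": "parameters.shared_value"})
    #     >>> config.formatted(ParametersIR({"shared_value": 14})).rest
    #     {'field1': 14}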

    def maybe_merge(self, other_config: "ConfigIR") -> Generator["ConfigIR", None, None]:
        """Merge another instance of a `ConfigIR` into this instance if
        possible. This function returns a generator that yields just self
        if the configs were merged, or self and other_config if they could
        not be merged.

        Parameters
        ----------
        other_config : `ConfigIR`
            An instance of `ConfigIR` to merge into this instance.

        Returns
        -------
        Generator : `ConfigIR`
            A generator yielding only self if the configs were merged, or
            self and other_config if they could not be.
        """
        # Verify that the config blocks can be merged.
        if (
            self.dataId != other_config.dataId
            or self.python
            or other_config.python
            or self.file
            or other_config.file
        ):
            yield from (self, other_config)
            return

        # Check that keys shared between the two configs do not have
        # conflicting values.
        key_union = self.rest.keys() & other_config.rest.keys()
        for key in key_union:
            if self.rest[key] != other_config.rest[key]:
                yield from (self, other_config)
                return
        self.rest.update(other_config.rest)

        # Combine the lists of override files to load.
        self_file_set = set(self.file)
        other_file_set = set(other_config.file)
        self.file = list(self_file_set.union(other_file_set))

        yield self
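
    # A minimal usage sketch: two blocks with matching dataIds and no python or
    # file overrides collapse into one; blocks with conflicting values do not.
    #
    #     >>> a, b = ConfigIR(rest={"x": 1}), ConfigIR(rest={"y": 2})
    #     >>> [c.rest for c in a.maybe_merge(b)]
    #     [{'x': 1, 'y': 2}]
    #     >>> c, d = ConfigIR(rest={"x": 1}), ConfigIR(rest={"x": 2})
    #     >>> len(list(c.maybe_merge(d)))
    #     2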

    def __eq__(self, other: object) -> bool:
        if not isinstance(other, ConfigIR):
            return False
        return all(
            getattr(self, attr) == getattr(other, attr) for attr in ("python", "dataId", "file", "rest")
        )

@dataclass
class TaskIR:
    """Intermediate representation of tasks read from a pipeline yaml file."""

    label: str
    """An identifier used to refer to a task.
    """
    klass: str
    """A string containing a fully qualified python class to be run in a
    pipeline.
    """
    config: list[ConfigIR] | None = None
    """A list of all config overrides associated with this task; may be
    `None` if there are no config overrides.
    """

    def to_primitives(self) -> dict[str, str | list[dict]]:
        """Convert to a representation used in yaml serialization."""
        accumulate: dict[str, str | list[dict]] = {"class": self.klass}
        if self.config:
            accumulate["config"] = [c.to_primitives() for c in self.config]
        return accumulate

    def add_or_update_config(self, other_config: ConfigIR) -> None:
        """Add a `ConfigIR` to this task if one is not present. Merges configs
        if there is a `ConfigIR` present and the dataId keys of both configs
        match, otherwise adds a new entry to the config list. The exception to
        the above is that if either the last config or other_config has a
        python block, then other_config is always added, as python blocks can
        modify configs in ways that cannot be predicted.

        Parameters
        ----------
        other_config : `ConfigIR`
            A `ConfigIR` instance to add or merge into the config attribute of
            this task.
        """
        if not self.config:
            self.config = [other_config]
            return
        self.config.extend(self.config.pop().maybe_merge(other_config))
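
    # A minimal usage sketch: a second override block for a task is merged into
    # the last config entry when maybe_merge allows it (the class path shown is
    # hypothetical).
    #
    #     >>> task = TaskIR("taskA", "mod.TaskA", [ConfigIR(rest={"x": 1})])
    #     >>> task.add_or_update_config(ConfigIR(rest={"y": 2}))
    #     >>> [c.rest for c in task.config]
    #     [{'x': 1, 'y': 2}]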

    def __eq__(self, other: object) -> bool:
        if not isinstance(other, TaskIR):
            return False
        return all(getattr(self, attr) == getattr(other, attr) for attr in ("label", "klass", "config"))

@dataclass
class ImportIR:
    """An intermediate representation of imported pipelines."""

    location: str
    """This is the location of the pipeline to inherit. The path should be
    specified as an absolute path. Environment variables may be used in the
    path and should be specified as a python string template, with the name of
    the environment variable inside braces.
    """
    include: list[str] | None = None
    """A list of tasks that should be included when inheriting this pipeline.
    Either the include or exclude attributes may be specified, but not both.
    """
    exclude: list[str] | None = None
    """A list of tasks that should be excluded when inheriting this pipeline.
    Either the include or exclude attributes may be specified, but not both.
    """
    importContracts: bool = True
    """Boolean attribute to dictate if contracts should be inherited with the
    pipeline or not.
    """
    labeledSubsetModifyMode: PipelineSubsetCtrl = PipelineSubsetCtrl.DROP
    """Controls how labeled subsets are handled when an import ends up not
    including (through either an include or an exclude list) a task label that
    is defined in the `Pipeline` being imported. DROP will remove any
    subsets which contain a missing label. EDIT will change any subsets to not
    include the missing label.
    """
    instrument: Literal[_Tags.KeepInstrument] | str | None = _Tags.KeepInstrument
    """Instrument to assign to the Pipeline at import. The default value of
    `_Tags.KeepInstrument` indicates that whatever instrument the pipeline is
    declared with will not be modified. Setting this value to None will drop
    any declared instrument prior to import.
    """

    def toPipelineIR(self) -> "PipelineIR":
        """Load in the Pipeline specified by this object, and turn it into a
        PipelineIR instance.

        Returns
        -------
        pipeline : `PipelineIR`
            A pipeline generated from the imported pipeline file.
        """
        if self.include and self.exclude:
            raise ValueError(
                "An include list and an exclude list cannot both be specified"
                " when declaring a pipeline import."
            )
        tmp_pipeline = PipelineIR.from_uri(os.path.expandvars(self.location))
        if self.instrument is not _Tags.KeepInstrument:
            tmp_pipeline.instrument = self.instrument

        included_labels = set()
        for label in tmp_pipeline.tasks:
            if (
                (self.include and label in self.include)
                or (self.exclude and label not in self.exclude)
                or (self.include is None and self.exclude is None)
            ):
                included_labels.add(label)

        # Handle labeled subsets being specified in the include or exclude
        # list, adding or removing labels.
        if self.include is not None:
            subsets_in_include = tmp_pipeline.labeled_subsets.keys() & self.include
            for label in subsets_in_include:
                included_labels.update(tmp_pipeline.labeled_subsets[label].subset)

        elif self.exclude is not None:
            subsets_in_exclude = tmp_pipeline.labeled_subsets.keys() & self.exclude
            for label in subsets_in_exclude:
                included_labels.difference_update(tmp_pipeline.labeled_subsets[label].subset)

        tmp_pipeline = tmp_pipeline.subset_from_labels(included_labels, self.labeledSubsetModifyMode)

        if not self.importContracts:
            tmp_pipeline.contracts = []

        return tmp_pipeline
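
    # A sketch of a typical ``imports`` stanza that maps onto this dataclass;
    # the location and task label shown are hypothetical.
    #
    #     imports:
    #       - location: ${SOME_PACKAGE_DIR}/pipelines/base.yaml
    #         exclude:
    #           - taskToSkip
    #         importContracts: false
    #         labeledSubsetModifyMode: EDIT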

    def __eq__(self, other: object) -> bool:
        if not isinstance(other, ImportIR):
            return False
        return all(
            getattr(self, attr) == getattr(other, attr)
            for attr in ("location", "include", "exclude", "importContracts")
        )

class PipelineIR:
    """Intermediate representation of a pipeline definition.

    Parameters
    ----------
    loaded_yaml : `dict`
        A dictionary which matches the structure that would be produced by a
        yaml reader which parses a pipeline definition document.

    Raises
    ------
    ValueError
        Raised if:

        - a pipeline is declared without a description;
        - no tasks are declared in a pipeline, and no pipelines are to be
          inherited;
        - more than one instrument is specified;
        - more than one inherited pipeline shares a label.
    """

    def __init__(self, loaded_yaml: dict[str, Any]):
        # Check that required fields are present.
        if "description" not in loaded_yaml:
            raise ValueError("A pipeline must be declared with a description")
        if "tasks" not in loaded_yaml and len({"imports", "inherits"} - loaded_yaml.keys()) == 2:
            raise ValueError("A pipeline must be declared with one or more tasks")

        # The steps below must happen in this call order.

        # Process the pipeline description.
        self.description = loaded_yaml.pop("description")

        # Process tasks.
        self._read_tasks(loaded_yaml)

        # Process instrument keys.
        inst = loaded_yaml.pop("instrument", None)
        if isinstance(inst, list):
            raise ValueError("Only one top level instrument can be defined in a pipeline")
        self.instrument: str | None = inst

        # Process any contracts.
        self._read_contracts(loaded_yaml)

        # Process any defined parameters.
        self._read_parameters(loaded_yaml)

        # Process any named label subsets.
        self._read_labeled_subsets(loaded_yaml)

        # Process any inherited pipelines.
        self._read_imports(loaded_yaml)

        # Verify named subsets; this must be done after inheriting.
        self._verify_labeled_subsets()

    def _read_contracts(self, loaded_yaml: dict[str, Any]) -> None:
        """Process the contracts portion of the loaded yaml document.

        Parameters
        ----------
        loaded_yaml : `dict`
            A dictionary which matches the structure that would be produced by
            a yaml reader which parses a pipeline definition document.
        """
        loaded_contracts = loaded_yaml.pop("contracts", [])
        if isinstance(loaded_contracts, str):
            loaded_contracts = [loaded_contracts]
        self.contracts: list[ContractIR] = []
        for contract in loaded_contracts:
            if isinstance(contract, dict):
                self.contracts.append(ContractIR(**contract))
            if isinstance(contract, str):
                self.contracts.append(ContractIR(contract=contract))

    def _read_parameters(self, loaded_yaml: dict[str, Any]) -> None:
        """Process the parameters portion of the loaded yaml document.

        Parameters
        ----------
        loaded_yaml : `dict`
            A dictionary which matches the structure that would be produced by
            a yaml reader which parses a pipeline definition document.
        """
        loaded_parameters = loaded_yaml.pop("parameters", {})
        if not isinstance(loaded_parameters, dict):
            raise ValueError("The parameters section must be a yaml mapping")
        self.parameters = ParametersIR(loaded_parameters)

    def _read_labeled_subsets(self, loaded_yaml: dict[str, Any]) -> None:
        """Process the subsets portion of the loaded yaml document.

        Parameters
        ----------
        loaded_yaml : `MutableMapping`
            A dictionary which matches the structure that would be produced
            by a yaml reader which parses a pipeline definition document.
        """
        loaded_subsets = loaded_yaml.pop("subsets", {})
        self.labeled_subsets: dict[str, LabeledSubset] = {}
        if not loaded_subsets and "subset" in loaded_yaml:
            raise ValueError("Top level key should be subsets and not subset, add an s")
        for key, value in loaded_subsets.items():
            self.labeled_subsets[key] = LabeledSubset.from_primitives(key, value)

    def _verify_labeled_subsets(self) -> None:
        """Verify that all the labels in each named subset exist within the
        pipeline.
        """
        # Verify that all labels defined in a labeled subset are in the
        # Pipeline.
        for labeled_subset in self.labeled_subsets.values():
            if not labeled_subset.subset.issubset(self.tasks.keys()):
                raise ValueError(
                    f"Labels {labeled_subset.subset - self.tasks.keys()} were not found in the "
                    "declared pipeline"
                )
        # Verify that subset labels are not already task labels.
        label_intersection = self.labeled_subsets.keys() & self.tasks.keys()
        if label_intersection:
            raise ValueError(f"Labeled subsets cannot use the same label as a task: {label_intersection}")

    def _read_imports(self, loaded_yaml: dict[str, Any]) -> None:
        """Process the imports portion of the loaded yaml document.

        Parameters
        ----------
        loaded_yaml : `dict`
            A dictionary which matches the structure that would be produced by
            a yaml reader which parses a pipeline definition document.
        """

        def process_args(argument: str | dict) -> dict:
            if isinstance(argument, str):
                return {"location": argument}
            elif isinstance(argument, dict):
                if "exclude" in argument and isinstance(argument["exclude"], str):
                    argument["exclude"] = [argument["exclude"]]
                if "include" in argument and isinstance(argument["include"], str):
                    argument["include"] = [argument["include"]]
                if "instrument" in argument and argument["instrument"] == "None":
                    argument["instrument"] = None
                if "labeledSubsetModifyMode" in argument:
                    match argument["labeledSubsetModifyMode"]:
                        case "DROP":
                            argument["labeledSubsetModifyMode"] = PipelineSubsetCtrl.DROP
                        case "EDIT":
                            argument["labeledSubsetModifyMode"] = PipelineSubsetCtrl.EDIT
                        case unknown:
                            raise ValueError(f"{unknown} is not a valid mode for labeledSubsetModifyMode")
            return argument

        if not {"inherits", "imports"} - loaded_yaml.keys():
            raise ValueError("Cannot define both inherits and imports sections, use imports")
        tmp_import = loaded_yaml.pop("inherits", None)
        if tmp_import is None:
            tmp_import = loaded_yaml.pop("imports", None)
        else:
            raise ValueError("The 'inherits' key is not supported. Please use the key 'imports' instead")
        if tmp_import is None:
            self.imports: list[ImportIR] = []
        elif isinstance(tmp_import, list):
            self.imports = [ImportIR(**process_args(args)) for args in tmp_import]
        else:
            self.imports = [ImportIR(**process_args(tmp_import))]

        self.merge_pipelines([fragment.toPipelineIR() for fragment in self.imports])

    def merge_pipelines(self, pipelines: Iterable[PipelineIR]) -> None:
        """Merge one or more other `PipelineIR` objects into this object.

        Parameters
        ----------
        pipelines : `~collections.abc.Iterable` of `PipelineIR` objects
            An `~collections.abc.Iterable` that contains one or more
            `PipelineIR` objects to merge into this object.

        Raises
        ------
        ValueError
            Raised if there is a conflict in instrument specifications.
            Raised if a task label appears in more than one of the input
            `PipelineIR` objects which are to be merged.
            Raised if a labeled subset appears in more than one of the input
            `PipelineIR` objects which are to be merged, or in any subset
            existing in this object.
        """
        # Integrate any imported pipelines.
        accumulate_tasks: dict[str, TaskIR] = {}
        accumulate_labeled_subsets: dict[str, LabeledSubset] = {}
        accumulated_parameters = ParametersIR({})

        for tmp_IR in pipelines:
            if self.instrument is None:
                self.instrument = tmp_IR.instrument
            elif self.instrument != tmp_IR.instrument and tmp_IR.instrument is not None:
                msg = (
                    "Only one instrument can be declared in a pipeline or its imports. "
                    f"Top level pipeline defines {self.instrument} but pipeline to merge "
                    f"defines {tmp_IR.instrument}."
                )
                raise ValueError(msg)
            if duplicate_labels := accumulate_tasks.keys() & tmp_IR.tasks.keys():
                msg = (
                    "Task labels in the imported pipelines must be unique. "
                    f"These labels appear multiple times: {duplicate_labels}"
                )
                raise ValueError(msg)
            accumulate_tasks.update(tmp_IR.tasks)
            self.contracts.extend(tmp_IR.contracts)
            # Verify that tmp_IR has unique labels for named subsets among
            # existing labeled subsets, and with existing task labels.
            overlapping_subsets = accumulate_labeled_subsets.keys() & tmp_IR.labeled_subsets.keys()
            task_subset_overlap = (
                accumulate_labeled_subsets.keys() | tmp_IR.labeled_subsets.keys()
            ) & accumulate_tasks.keys()
            if overlapping_subsets or task_subset_overlap:
                raise ValueError(
                    "Labeled subset names must be unique amongst imports in both labels and "
                    f"named Subsets. Duplicate: {overlapping_subsets | task_subset_overlap}"
                )
            accumulate_labeled_subsets.update(tmp_IR.labeled_subsets)
            accumulated_parameters.update(tmp_IR.parameters)

        # Verify that any accumulated labeled subsets don't clash with a label
        # from this pipeline.
        if accumulate_labeled_subsets.keys() & self.tasks.keys():
            raise ValueError(
                "Labeled subset names must be unique amongst imports in both labels and named Subsets"
            )
        # Merge in the named subsets for self so this document can override any
        # that have been declared.
        accumulate_labeled_subsets.update(self.labeled_subsets)
        self.labeled_subsets = accumulate_labeled_subsets

        # Merge the dict of label:TaskIR objects, preserving any configs in
        # the imported pipeline if the labels point to the same class.
        for label, task in self.tasks.items():
            if label not in accumulate_tasks:
                accumulate_tasks[label] = task
            elif accumulate_tasks[label].klass == task.klass:
                if task.config is not None:
                    for config in task.config:
                        accumulate_tasks[label].add_or_update_config(config)
            else:
                accumulate_tasks[label] = task
        self.tasks: dict[str, TaskIR] = accumulate_tasks
        accumulated_parameters.update(self.parameters)
        self.parameters = accumulated_parameters

    def _read_tasks(self, loaded_yaml: dict[str, Any]) -> None:
        """Process the tasks portion of the loaded yaml document.

        Parameters
        ----------
        loaded_yaml : `dict`
            A dictionary which matches the structure that would be produced by
            a yaml reader which parses a pipeline definition document.
        """
        self.tasks = {}
        tmp_tasks = loaded_yaml.pop("tasks", None)
        if tmp_tasks is None:
            tmp_tasks = {}

        if "parameters" in tmp_tasks:
            raise ValueError("parameters is a reserved word and cannot be used as a task label")

        for label, definition in tmp_tasks.items():
            if isinstance(definition, str):
                definition = {"class": definition}
            config = definition.get("config", None)
            if config is None:
                task_config_ir = None
            else:
                if isinstance(config, dict):
                    config = [config]
                task_config_ir = []
                for c in config:
                    file = c.pop("file", None)
                    if file is None:
                        file = []
                    elif not isinstance(file, list):
                        file = [file]
                    task_config_ir.append(
                        ConfigIR(
                            python=c.pop("python", None), dataId=c.pop("dataId", None), file=file, rest=c
                        )
                    )
            self.tasks[label] = TaskIR(label, definition["class"], task_config_ir)

    def _remove_contracts(self, label: str) -> None:
        """Remove any contracts that contain the given label.

        String comparison used in this way is not the most elegant and may
        have issues, but it is the only feasible way when users can specify
        contracts with generic strings.
        """
        new_contracts = []
        for contract in self.contracts:
            # Match the label when it is not preceded by an ASCII identifier
            # character (or is at the start of a line) and is followed by a
            # dot.
            if re.match(f".*([^A-Za-z0-9_]|^){label}[.]", contract.contract):
                continue
            new_contracts.append(contract)
        self.contracts = new_contracts
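
    # A behavior sketch (hypothetical labels): the pruning regex matches
    # "<label>." only when the label is not preceded by an identifier
    # character, so removing "taskA" drops the first contract below but keeps
    # the one referencing "mytaskA".
    #
    #     >>> ir = PipelineIR.from_string(
    #     ...     "description: d\n"
    #     ...     "tasks:\n"
    #     ...     "  taskA: mod.TaskA\n"
    #     ...     "  mytaskA: mod.TaskB\n"
    #     ...     "contracts:\n"
    #     ...     "  - taskA.x == 1\n"
    #     ...     "  - mytaskA.x == 1\n"
    #     ... )
    #     >>> ir._remove_contracts("taskA")
    #     >>> [c.contract for c in ir.contracts]
    #     ['mytaskA.x == 1']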

    def subset_from_labels(
        self, labelSpecifier: set[str], subsetCtrl: PipelineSubsetCtrl = PipelineSubsetCtrl.DROP
    ) -> PipelineIR:
        """Subset a pipelineIR to contain only the labels specified in
        labelSpecifier.

        Parameters
        ----------
        labelSpecifier : `set` of `str`
            Set containing labels that describes how to subset a pipeline.
        subsetCtrl : `PipelineSubsetCtrl`
            Control object which decides how subsets with missing labels are
            handled. Setting to `PipelineSubsetCtrl.DROP` (the default) will
            cause any subsets that have labels which are not in the set of all
            task labels to be dropped. Setting to `PipelineSubsetCtrl.EDIT`
            will cause the subset to instead be edited to remove the
            nonexistent label.

        Returns
        -------
        pipeline : `PipelineIR`
            A new pipelineIR object that is a subset of the old pipelineIR.

        Raises
        ------
        ValueError
            Raised if there is an issue with the specified labels.

        Notes
        -----
        This method attempts to prune any contracts that contain labels which
        are not in the declared subset of labels. This pruning is done using
        string-based matching due to the nature of contracts and may prune
        more than it should.
        """
        pipeline = copy.deepcopy(self)

        # Update the label specifier to expand any named subsets.
        toRemove = set()
        toAdd = set()
        for label in labelSpecifier:
            if label in pipeline.labeled_subsets:
                toRemove.add(label)
                toAdd.update(pipeline.labeled_subsets[label].subset)
        labelSpecifier.difference_update(toRemove)
        labelSpecifier.update(toAdd)
        # Verify that all the labels are in the pipeline.
        if not labelSpecifier.issubset(pipeline.tasks.keys() | pipeline.labeled_subsets):
            difference = labelSpecifier.difference(pipeline.tasks.keys())
            raise ValueError(
                "Not all supplied labels (specified or named subsets) are in the pipeline "
                f"definition, extra labels: {difference}"
            )
        # A copy is needed so as to not modify while iterating.
        pipeline_labels = set(pipeline.tasks.keys())
        # Remove the labels from the pipelineIR, and any contracts that
        # contain those labels (see the docstring on _remove_contracts for
        # why this may cause issues).
        for label in pipeline_labels:
            if label not in labelSpecifier:
                pipeline.tasks.pop(label)
                pipeline._remove_contracts(label)

        # Create a copy of the object to iterate over.
        labeled_subsets = copy.copy(pipeline.labeled_subsets)
        # Remove any labeled subsets that no longer have a complete set of
        # labels.
        for label, labeled_subset in labeled_subsets.items():
            if extraTaskLabels := (labeled_subset.subset - pipeline.tasks.keys()):
                match subsetCtrl:
                    case PipelineSubsetCtrl.DROP:
                        pipeline.labeled_subsets.pop(label)
                    case PipelineSubsetCtrl.EDIT:
                        for extra in extraTaskLabels:
                            labeled_subset.subset.discard(extra)

        return pipeline
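
    # A minimal usage sketch (hypothetical labels) showing the two modes for a
    # named subset that loses one of its labels:
    #
    #     >>> ir = PipelineIR.from_string(
    #     ...     "description: d\n"
    #     ...     "tasks:\n"
    #     ...     "  taskA: mod.TaskA\n"
    #     ...     "  taskB: mod.TaskB\n"
    #     ...     "subsets:\n"
    #     ...     "  both: [taskA, taskB]\n"
    #     ... )
    #     >>> list(ir.subset_from_labels({"taskA"}).labeled_subsets)  # DROP
    #     []
    #     >>> ir.subset_from_labels({"taskA"}, PipelineSubsetCtrl.EDIT).labeled_subsets["both"].subset
    #     {'taskA'}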

    @classmethod
    def from_string(cls, pipeline_string: str) -> PipelineIR:
        """Create a `PipelineIR` object from a string formatted like a
        pipeline document.

        Parameters
        ----------
        pipeline_string : `str`
            A string that is formatted like a pipeline document.
        """
        loaded_yaml = yaml.load(pipeline_string, Loader=PipelineYamlLoader)
        return cls(loaded_yaml)
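
    # A minimal usage sketch (the task class shown is hypothetical):
    #
    #     >>> pipeline = PipelineIR.from_string(
    #     ...     "description: A minimal pipeline\n"
    #     ...     "tasks:\n"
    #     ...     "  taskA: mod.TaskA\n"
    #     ... )
    #     >>> list(pipeline.tasks)
    #     ['taskA']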

    @classmethod
    def from_uri(cls, uri: ResourcePathExpression) -> PipelineIR:
        """Create a `PipelineIR` object from the document specified by the
        input uri.

        Parameters
        ----------
        uri : convertible to `~lsst.resources.ResourcePath`
            Location of the document to use in creating a `PipelineIR` object.

        Returns
        -------
        pipelineIR : `PipelineIR`
            The loaded pipeline.
        """
        loaded_uri = ResourcePath(uri)
        with loaded_uri.open("r") as buffer:
            loaded_yaml = yaml.load(buffer, Loader=PipelineYamlLoader)
            return cls(loaded_yaml)

    def write_to_uri(self, uri: ResourcePathExpression) -> None:
        """Serialize this `PipelineIR` object into a yaml formatted string and
        write the output to a file at the specified uri.

        Parameters
        ----------
        uri : convertible to `~lsst.resources.ResourcePath`
            Location of the document to write a `PipelineIR` object to.
        """
        with ResourcePath(uri).open("w") as buffer:
            yaml.dump(self.to_primitives(), buffer, sort_keys=False, Dumper=MultilineStringDumper)

    def to_primitives(self) -> dict[str, Any]:
        """Convert to a representation used in yaml serialization.

        Returns
        -------
        primitives : `dict`
            Dictionary that maps directly to the serialized YAML form.
        """
        accumulate = {"description": self.description}
        if self.instrument is not None:
            accumulate["instrument"] = self.instrument
        if self.parameters:
            accumulate["parameters"] = self.parameters.to_primitives()
        accumulate["tasks"] = {m: t.to_primitives() for m, t in self.tasks.items()}
        if len(self.contracts) > 0:
            # Sort contracts in lexicographical order by the contract string,
            # in the absence of any other ordering principle.
            contracts_list = [c.to_primitives() for c in self.contracts]
            contracts_list.sort(key=lambda x: x["contract"])
            accumulate["contracts"] = contracts_list
        if self.labeled_subsets:
            accumulate["subsets"] = {k: v.to_primitives() for k, v in self.labeled_subsets.items()}
        return accumulate

    def __str__(self) -> str:
        """Instance formatting as it would look in yaml representation."""
        return yaml.dump(self.to_primitives(), sort_keys=False, Dumper=MultilineStringDumper)

    def __repr__(self) -> str:
        """Instance formatting as it would look in yaml representation."""
        return str(self)

    def __eq__(self, other: object) -> bool:
        if not isinstance(other, PipelineIR):
            return False
        # Special-case contracts because it is a list whose order is not
        # important.
        return (
            all(
                getattr(self, attr) == getattr(other, attr)
                for attr in ("tasks", "instrument", "labeled_subsets", "parameters")
            )
            and len(self.contracts) == len(other.contracts)
            and all(c in self.contracts for c in other.contracts)
        )