Coverage for python/lsst/pipe/base/pipelineIR.py: 19%

396 statements  

coverage.py v7.2.7, created at 2023-07-12 11:14 -0700

# This file is part of pipe_base.
#
# Developed for the LSST Data Management System.
# This product includes software developed by the LSST Project
# (http://www.lsst.org).
# See the COPYRIGHT file at the top-level directory of this distribution
# for details of code ownership.
#
# This program is free software: you can redistribute it and/or modify
# it under the terms of the GNU General Public License as published by
# the Free Software Foundation, either version 3 of the License, or
# (at your option) any later version.
#
# This program is distributed in the hope that it will be useful,
# but WITHOUT ANY WARRANTY; without even the implied warranty of
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
# GNU General Public License for more details.
#
# You should have received a copy of the GNU General Public License
# along with this program. If not, see <http://www.gnu.org/licenses/>.
from __future__ import annotations

__all__ = (
    "ConfigIR",
    "ContractError",
    "ContractIR",
    "ImportIR",
    "LabeledSubset",
    "ParametersIR",
    "PipelineIR",
    "TaskIR",
)

import copy
import enum
import os
import re
import warnings
from collections import Counter
from collections.abc import Generator, Hashable, Iterable, MutableMapping
from dataclasses import dataclass, field
from typing import Any, Literal

import yaml
from lsst.resources import ResourcePath, ResourcePathExpression


class _Tags(enum.Enum):
    KeepInstrument = enum.auto()


class PipelineYamlLoader(yaml.SafeLoader):
    """Specialized version of yaml's SafeLoader.

    It raises an exception if it finds multiple instances of the same key at
    a given scope in a pipeline file.
    """

    def construct_mapping(self, node: yaml.MappingNode, deep: bool = False) -> dict[Hashable, Any]:
        # Call super first so that it can do all its other forms of checking
        # on this node. Checking the uniqueness of keys first would save the
        # work that super does in the case of a failure, but if the node were
        # the wrong node due to a parsing error, the resulting exception
        # would be difficult to understand.
        mapping = super().construct_mapping(node, deep)
        # Check if there are any duplicate keys.
        all_keys = Counter(key_node.value for key_node, _ in node.value)
        duplicates = {k for k, i in all_keys.items() if i != 1}
        if duplicates:
            raise KeyError(
                f"Pipeline files must not have duplicated keys, {duplicates} appeared multiple times"
            )
        return mapping

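
# Illustrative sketch (not part of the original module; names like ``modA``
# are hypothetical): loading a document that repeats a key at one scope
# raises, where plain yaml.SafeLoader would silently keep the last value.
#
#     >>> doc = "tasks:\n  isr: modA\n  isr: modB\n"
#     >>> yaml.load(doc, Loader=PipelineYamlLoader)
#     Traceback (most recent call last):
#     ...
#     KeyError: "Pipeline files must not have duplicated keys, {'isr'} appeared multiple times"
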

class MultilineStringDumper(yaml.Dumper):
    """Custom YAML dumper that makes multi-line strings use the '|'
    continuation style instead of unreadable newlines and tons of quotes.

    The basic approach is taken from
    https://stackoverflow.com/questions/8640959/how-can-i-control-what-scalar-form-pyyaml-uses-for-my-data,
    but it is written as a Dumper subclass to make its effects non-global
    (vs `yaml.add_representer`).
    """

    def represent_scalar(self, tag: str, value: Any, style: str | None = None) -> yaml.ScalarNode:
        if style is None and tag == "tag:yaml.org,2002:str" and len(value.splitlines()) > 1:
            style = "|"
        return super().represent_scalar(tag, value, style)

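
# Illustrative sketch (not part of the original module): a multi-line string
# value is emitted as a '|' block scalar instead of a quoted one-liner.
#
#     >>> yaml.dump({"description": "line one\nline two\n"}, Dumper=MultilineStringDumper)
#     'description: |\n  line one\n  line two\n'
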

class ContractError(Exception):
    """An exception that is raised when a pipeline contract is not satisfied."""

    pass


@dataclass
class ContractIR:
    """Intermediate representation of configuration contracts read from a
    pipeline yaml file.
    """

    contract: str
    """A string of python code representing one or more conditions on configs
    in a pipeline. This code-as-string should, once evaluated, be True if the
    configs are fine, and False otherwise.
    """
    msg: str | None = None
    """An optional message to be shown to the user if a contract fails.
    """

    def to_primitives(self) -> dict[str, str]:
        """Convert to a representation used in yaml serialization."""
        accumulate = {"contract": self.contract}
        if self.msg is not None:
            accumulate["msg"] = self.msg
        return accumulate

    def __eq__(self, other: object) -> bool:
        if not isinstance(other, ContractIR):
            return False
        return self.contract == other.contract and self.msg == other.msg


@dataclass
class LabeledSubset:
    """Intermediate representation of a named subset of task labels read from
    a pipeline yaml file.
    """

    label: str
    """The label used to identify the subset of task labels.
    """
    subset: set[str]
    """A set of task labels contained in this subset.
    """
    description: str | None
    """A description of what this subset of tasks is intended to do.
    """

    @staticmethod
    def from_primitives(label: str, value: list[str] | dict) -> LabeledSubset:
        """Generate `LabeledSubset` objects given a properly formatted object
        that has been created by a yaml loader.

        Parameters
        ----------
        label : `str`
            The label that will be used to identify this labeled subset.
        value : `list` of `str` or `dict`
            Object returned from loading a labeled subset section from a yaml
            document.

        Returns
        -------
        labeledSubset : `LabeledSubset`
            A `LabeledSubset` object built from the inputs.

        Raises
        ------
        ValueError
            Raised if the value input is not properly formatted for parsing.
        """
        if isinstance(value, MutableMapping):
            subset = value.pop("subset", None)
            if subset is None:
                raise ValueError(
                    "If a labeled subset is specified as a mapping, it must contain the key 'subset'"
                )
            description = value.pop("description", None)
        elif isinstance(value, Iterable):
            subset = value
            description = None
        else:
            raise ValueError(
                f"There was a problem parsing the labeled subset {label}; make sure the "
                "definition is either a valid yaml list, or a mapping with keys "
                "(subset, description) where subset points to a yaml list, and description is "
                "associated with a string"
            )
        return LabeledSubset(label, set(subset), description)

    def to_primitives(self) -> dict[str, list[str] | str]:
        """Convert to a representation used in yaml serialization."""
        accumulate: dict[str, list[str] | str] = {"subset": list(self.subset)}
        if self.description is not None:
            accumulate["description"] = self.description
        return accumulate

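
# Illustrative sketch (not part of the original module; labels are
# hypothetical): a labeled subset may be declared either as a bare yaml list
# or as a mapping with a description.
#
#     >>> LabeledSubset.from_primitives("step1", ["isr", "calibrate"]).subset == {"isr", "calibrate"}
#     True
#     >>> LabeledSubset.from_primitives("step1", {"subset": ["isr"], "description": "demo"}).description
#     'demo'
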

@dataclass
class ParametersIR:
    """Intermediate representation of parameters that are global to a
    pipeline.

    Parameters
    ----------
    mapping : `dict` [`str`, `str`]
        A mutable mapping of identifiers as keys, and shared configuration
        as values.

    Notes
    -----
    These parameters are specified under a top level key named ``parameters``
    and are declared as a yaml mapping. These entries can then be used inside
    task configuration blocks to specify configuration values. They may not be
    used in the special ``file`` or ``python`` blocks.

    Examples
    --------
    .. code-block:: yaml

        parameters:
          shared_value: 14
        tasks:
          taskA:
            class: modA
            config:
              field1: parameters.shared_value
          taskB:
            class: modB
            config:
              field2: parameters.shared_value
    """

    mapping: MutableMapping[str, str]
    """A mutable mapping of identifiers as keys, and shared configuration
    as values.
    """

    def update(self, other: ParametersIR | None) -> None:
        if other is not None:
            self.mapping.update(other.mapping)

    def to_primitives(self) -> MutableMapping[str, str]:
        """Convert to a representation used in yaml serialization."""
        return self.mapping

    def __contains__(self, value: str) -> bool:
        return value in self.mapping

    def __getitem__(self, item: str) -> Any:
        return self.mapping[item]

    def __bool__(self) -> bool:
        return bool(self.mapping)


@dataclass
class ConfigIR:
    """Intermediate representation of configurations read from a pipeline
    yaml file.
    """

    python: str | None = None
    """A string of python code that is used to modify a configuration. This
    can also be None if there are no modifications to do.
    """
    dataId: dict | None = None
    """A dataId that is used to constrain these config overrides to only
    quanta with matching dataIds. This field can be None if there is no
    constraint. This is currently an unimplemented feature, and is placed
    here for future use.
    """
    file: list[str] = field(default_factory=list)
    """A list of paths pointing to files containing config overrides to be
    applied. This value may be an empty list if there are no overrides to
    apply.
    """
    rest: dict = field(default_factory=dict)
    """This is a dictionary of key value pairs, where the keys are strings
    corresponding to qualified fields on a config to override, and the values
    are strings representing the values to apply.
    """

    def to_primitives(self) -> dict[str, str | dict | list[str]]:
        """Convert to a representation used in yaml serialization."""
        accumulate = {}
        for name in ("python", "dataId", "file"):
            # If this attribute is truthy, add it to the accumulation
            # dictionary.
            if getattr(self, name):
                accumulate[name] = getattr(self, name)
        # Add the dictionary containing the rest of the config keys to the
        # accumulated dictionary.
        accumulate.update(self.rest)
        return accumulate

    def formatted(self, parameters: ParametersIR) -> ConfigIR:
        """Return a new ConfigIR object that is formatted according to the
        specified parameters.

        Parameters
        ----------
        parameters : `ParametersIR`
            Object that contains variable mappings used in substitution.

        Returns
        -------
        config : `ConfigIR`
            A new ConfigIR object formatted with the input parameters.
        """
        new_config = copy.deepcopy(self)
        for key, value in new_config.rest.items():
            if not isinstance(value, str):
                continue
            match = re.match("parameters[.](.*)", value)
            if match and match.group(1) in parameters:
                new_config.rest[key] = parameters[match.group(1)]
            if match and match.group(1) not in parameters:
                warnings.warn(
                    f"config {key} contains value {match.group(0)} which is formatted like a "
                    "Pipeline parameter but was not found within the Pipeline; if this was not "
                    "intentional, check for a typo"
                )
        return new_config
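
    # Illustrative sketch (not part of the original module; field and
    # parameter names are hypothetical): values in ``rest`` that look like
    # ``parameters.<name>`` are substituted from a ParametersIR mapping.
    #
    #     >>> config = ConfigIR(rest={"field1": "parameters.shared_value"})
    #     >>> config.formatted(ParametersIR({"shared_value": 14})).rest
    #     {'field1': 14}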

    def maybe_merge(self, other_config: "ConfigIR") -> Generator["ConfigIR", None, None]:
        """Merge another instance of a `ConfigIR` into this instance if
        possible. This function returns a generator that yields only self
        if the configs were merged, or self and other_config if they could
        not be merged.

        Parameters
        ----------
        other_config : `ConfigIR`
            An instance of `ConfigIR` to merge into this instance.

        Returns
        -------
        Generator : `ConfigIR`
            A generator that yields either self alone, if the configs could
            be merged, or self and other_config if they could not.
        """
        # Verify that the config blocks can be merged.
        if (
            self.dataId != other_config.dataId
            or self.python
            or other_config.python
            or self.file
            or other_config.file
        ):
            yield from (self, other_config)
            return

        # Find the keys common to both configs, and verify that no shared
        # key has different values in the two configs.
        shared_keys = self.rest.keys() & other_config.rest.keys()
        for key in shared_keys:
            if self.rest[key] != other_config.rest[key]:
                yield from (self, other_config)
                return
        self.rest.update(other_config.rest)

        # Combine the lists of override files to load.
        self_file_set = set(self.file)
        other_file_set = set(other_config.file)
        self.file = list(self_file_set.union(other_file_set))

        yield self
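
    # Illustrative sketch (not part of the original module): two plain config
    # blocks with compatible keys collapse into one, while a block carrying a
    # python override refuses to merge.
    #
    #     >>> a = ConfigIR(rest={"field1": 1})
    #     >>> list(a.maybe_merge(ConfigIR(rest={"field2": 2})))
    #     [ConfigIR(python=None, dataId=None, file=[], rest={'field1': 1, 'field2': 2})]
    #     >>> len(list(a.maybe_merge(ConfigIR(python="config.field3 = 3"))))
    #     2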

    def __eq__(self, other: object) -> bool:
        if not isinstance(other, ConfigIR):
            return False
        return all(
            getattr(self, attr) == getattr(other, attr) for attr in ("python", "dataId", "file", "rest")
        )


@dataclass
class TaskIR:
    """Intermediate representation of tasks read from a pipeline yaml file."""

    label: str
    """An identifier used to refer to a task.
    """
    klass: str
    """A string containing a fully qualified python class to be run in a
    pipeline.
    """
    config: list[ConfigIR] | None = None
    """List of all config overrides associated with this task; may be
    `None` if there are no config overrides.
    """

    def to_primitives(self) -> dict[str, str | list[dict]]:
        """Convert to a representation used in yaml serialization."""
        accumulate: dict[str, str | list[dict]] = {"class": self.klass}
        if self.config:
            accumulate["config"] = [c.to_primitives() for c in self.config]
        return accumulate

    def add_or_update_config(self, other_config: ConfigIR) -> None:
        """Add a `ConfigIR` to this task if one is not present. Merges configs
        if there is a `ConfigIR` present and the dataId keys of both configs
        match, otherwise adds a new entry to the config list. The exception to
        the above is that if either the last config or other_config has a
        python block, then other_config is always added, as python blocks can
        modify configs in ways that cannot be predicted.

        Parameters
        ----------
        other_config : `ConfigIR`
            A `ConfigIR` instance to add or merge into the config attribute
            of this task.
        """
        if not self.config:
            self.config = [other_config]
            return
        self.config.extend(self.config.pop().maybe_merge(other_config))
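
    # Illustrative sketch (not part of the original module; the class name is
    # hypothetical): successive compatible overrides collapse into a single
    # ConfigIR on the task.
    #
    #     >>> task = TaskIR("isr", "lsst.ip.isr.IsrTask")
    #     >>> task.add_or_update_config(ConfigIR(rest={"doDark": "False"}))
    #     >>> task.add_or_update_config(ConfigIR(rest={"doFlat": "True"}))
    #     >>> len(task.config)
    #     1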

    def __eq__(self, other: object) -> bool:
        if not isinstance(other, TaskIR):
            return False
        return all(getattr(self, attr) == getattr(other, attr) for attr in ("label", "klass", "config"))


@dataclass
class ImportIR:
    """An intermediate representation of imported pipelines."""

    location: str
    """This is the location of the pipeline to inherit. The path should be
    specified as an absolute path. Environment variables may be used in the
    path and should be specified as a python string template, with the name
    of the environment variable inside braces.
    """
    include: list[str] | None = None
    """List of tasks that should be included when inheriting this pipeline.
    Either the include or exclude attributes may be specified, but not both.
    """
    exclude: list[str] | None = None
    """List of tasks that should be excluded when inheriting this pipeline.
    Either the include or exclude attributes may be specified, but not both.
    """
    importContracts: bool = True
    """Boolean attribute to dictate if contracts should be inherited with the
    pipeline or not.
    """
    instrument: Literal[_Tags.KeepInstrument] | str | None = _Tags.KeepInstrument
    """Instrument to assign to the Pipeline at import. The default value of
    ``_Tags.KeepInstrument`` indicates that whatever instrument the pipeline
    is declared with will not be modified. Setting this value to None will
    drop any declared instrument prior to import.
    """

    def toPipelineIR(self) -> "PipelineIR":
        """Load in the Pipeline specified by this object, and turn it into a
        PipelineIR instance.

        Returns
        -------
        pipeline : `PipelineIR`
            A pipeline generated from the imported pipeline file.
        """
        if self.include and self.exclude:
            raise ValueError(
                "An include list and an exclude list cannot both be specified"
                " when declaring a pipeline import."
            )
        tmp_pipeline = PipelineIR.from_uri(os.path.expandvars(self.location))
        if self.instrument is not _Tags.KeepInstrument:
            tmp_pipeline.instrument = self.instrument

        included_labels = set()
        for label in tmp_pipeline.tasks:
            if (
                (self.include and label in self.include)
                or (self.exclude and label not in self.exclude)
                or (self.include is None and self.exclude is None)
            ):
                included_labels.add(label)

        # Handle labeled subsets being specified in the include or exclude
        # list, adding or removing labels.
        if self.include is not None:
            subsets_in_include = tmp_pipeline.labeled_subsets.keys() & self.include
            for label in subsets_in_include:
                included_labels.update(tmp_pipeline.labeled_subsets[label].subset)

        elif self.exclude is not None:
            subsets_in_exclude = tmp_pipeline.labeled_subsets.keys() & self.exclude
            for label in subsets_in_exclude:
                included_labels.difference_update(tmp_pipeline.labeled_subsets[label].subset)

        tmp_pipeline = tmp_pipeline.subset_from_labels(included_labels)

        if not self.importContracts:
            tmp_pipeline.contracts = []

        return tmp_pipeline
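
    # Illustrative sketch (not part of the original module; the location and
    # labels are hypothetical): an import that lists both include and exclude
    # is rejected before any file is read.
    #
    #     >>> bad = ImportIR("${PIPE_DIR}/demo.yaml", include=["isr"], exclude=["calibrate"])
    #     >>> bad.toPipelineIR()
    #     Traceback (most recent call last):
    #     ...
    #     ValueError: An include list and an exclude list cannot both be specified when declaring a pipeline import.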

    def __eq__(self, other: object) -> bool:
        if not isinstance(other, ImportIR):
            return False
        return all(
            getattr(self, attr) == getattr(other, attr)
            for attr in ("location", "include", "exclude", "importContracts")
        )


class PipelineIR:
    """Intermediate representation of a pipeline definition.

    Parameters
    ----------
    loaded_yaml : `dict`
        A dictionary which matches the structure that would be produced by a
        yaml reader which parses a pipeline definition document.

    Raises
    ------
    ValueError
        Raised if:

        - a pipeline is declared without a description;
        - no tasks are declared in a pipeline, and no pipelines are to be
          inherited;
        - more than one instrument is specified;
        - more than one inherited pipeline shares a label.
    """

    def __init__(self, loaded_yaml: dict[str, Any]):
        # Check required fields are present.
        if "description" not in loaded_yaml:
            raise ValueError("A pipeline must be declared with a description")
        if "tasks" not in loaded_yaml and len({"imports", "inherits"} - loaded_yaml.keys()) == 2:
            raise ValueError("A pipeline must be declared with one or more tasks")

        # These steps below must happen in this call order.

        # Process pipeline description.
        self.description = loaded_yaml.pop("description")

        # Process tasks.
        self._read_tasks(loaded_yaml)

        # Process instrument keys.
        inst = loaded_yaml.pop("instrument", None)
        if isinstance(inst, list):
            raise ValueError("Only one top level instrument can be defined in a pipeline")
        self.instrument: str | None = inst

        # Process any contracts.
        self._read_contracts(loaded_yaml)

        # Process any defined parameters.
        self._read_parameters(loaded_yaml)

        # Process any named label subsets.
        self._read_labeled_subsets(loaded_yaml)

        # Process any inherited pipelines.
        self._read_imports(loaded_yaml)

        # Verify named subsets; must be done after inheriting.
        self._verify_labeled_subsets()

    def _read_contracts(self, loaded_yaml: dict[str, Any]) -> None:
        """Process the contracts portion of the loaded yaml document.

        Parameters
        ----------
        loaded_yaml : `dict`
            A dictionary which matches the structure that would be produced
            by a yaml reader which parses a pipeline definition document.
        """
        loaded_contracts = loaded_yaml.pop("contracts", [])
        if isinstance(loaded_contracts, str):
            loaded_contracts = [loaded_contracts]
        self.contracts: list[ContractIR] = []
        for contract in loaded_contracts:
            if isinstance(contract, dict):
                self.contracts.append(ContractIR(**contract))
            if isinstance(contract, str):
                self.contracts.append(ContractIR(contract=contract))

    def _read_parameters(self, loaded_yaml: dict[str, Any]) -> None:
        """Process the parameters portion of the loaded yaml document.

        Parameters
        ----------
        loaded_yaml : `dict`
            A dictionary which matches the structure that would be produced
            by a yaml reader which parses a pipeline definition document.
        """
        loaded_parameters = loaded_yaml.pop("parameters", {})
        if not isinstance(loaded_parameters, dict):
            raise ValueError("The parameters section must be a yaml mapping")
        self.parameters = ParametersIR(loaded_parameters)

    def _read_labeled_subsets(self, loaded_yaml: dict[str, Any]) -> None:
        """Process the subsets portion of the loaded yaml document.

        Parameters
        ----------
        loaded_yaml : `dict`
            A dictionary which matches the structure that would be produced
            by a yaml reader which parses a pipeline definition document.
        """
        loaded_subsets = loaded_yaml.pop("subsets", {})
        self.labeled_subsets: dict[str, LabeledSubset] = {}
        if not loaded_subsets and "subset" in loaded_yaml:
            raise ValueError("Top level key should be subsets and not subset, add an s")
        for key, value in loaded_subsets.items():
            self.labeled_subsets[key] = LabeledSubset.from_primitives(key, value)

    def _verify_labeled_subsets(self) -> None:
        """Verify that all the labels in each named subset exist within the
        pipeline.
        """
        # Verify that all labels defined in a labeled subset are in the
        # Pipeline.
        for labeled_subset in self.labeled_subsets.values():
            if not labeled_subset.subset.issubset(self.tasks.keys()):
                raise ValueError(
                    f"Labels {labeled_subset.subset - self.tasks.keys()} were not found in the "
                    "declared pipeline"
                )
        # Verify subset labels are not already task labels.
        label_intersection = self.labeled_subsets.keys() & self.tasks.keys()
        if label_intersection:
            raise ValueError(f"Labeled subsets cannot use the same label as a task: {label_intersection}")

    def _read_imports(self, loaded_yaml: dict[str, Any]) -> None:
        """Process the imports portion of the loaded yaml document.

        Parameters
        ----------
        loaded_yaml : `dict`
            A dictionary which matches the structure that would be produced
            by a yaml reader which parses a pipeline definition document.
        """

        def process_args(argument: str | dict) -> dict:
            if isinstance(argument, str):
                return {"location": argument}
            elif isinstance(argument, dict):
                if "exclude" in argument and isinstance(argument["exclude"], str):
                    argument["exclude"] = [argument["exclude"]]
                if "include" in argument and isinstance(argument["include"], str):
                    argument["include"] = [argument["include"]]
                if "instrument" in argument and argument["instrument"] == "None":
                    argument["instrument"] = None
                return argument

        if not {"inherits", "imports"} - loaded_yaml.keys():
            raise ValueError("Cannot define both inherits and imports sections, use imports")
        tmp_import = loaded_yaml.pop("inherits", None)
        if tmp_import is None:
            tmp_import = loaded_yaml.pop("imports", None)
        else:
            raise ValueError("The 'inherits' key is not supported. Please use the key 'imports' instead")
        if tmp_import is None:
            self.imports: list[ImportIR] = []
        elif isinstance(tmp_import, list):
            self.imports = [ImportIR(**process_args(args)) for args in tmp_import]
        else:
            self.imports = [ImportIR(**process_args(tmp_import))]

        self.merge_pipelines([fragment.toPipelineIR() for fragment in self.imports])

    def merge_pipelines(self, pipelines: Iterable[PipelineIR]) -> None:
        """Merge one or more other `PipelineIR` objects into this object.

        Parameters
        ----------
        pipelines : `~collections.abc.Iterable` of `PipelineIR` objects
            An `~collections.abc.Iterable` that contains one or more
            `PipelineIR` objects to merge into this object.

        Raises
        ------
        ValueError
            Raised if there is a conflict in instrument specifications.
            Raised if a task label appears in more than one of the input
            `PipelineIR` objects which are to be merged.
            Raised if a labeled subset appears in more than one of the input
            `PipelineIR` objects which are to be merged, or clashes with a
            task label in any of them or in this object.
        """
        # Integrate any imported pipelines.
        accumulate_tasks: dict[str, TaskIR] = {}
        accumulate_labeled_subsets: dict[str, LabeledSubset] = {}
        accumulated_parameters = ParametersIR({})

        for tmp_IR in pipelines:
            if self.instrument is None:
                self.instrument = tmp_IR.instrument
            elif self.instrument != tmp_IR.instrument and tmp_IR.instrument is not None:
                msg = (
                    "Only one instrument can be declared in a pipeline or its imports. "
                    f"Top level pipeline defines {self.instrument} but pipeline to merge "
                    f"defines {tmp_IR.instrument}."
                )
                raise ValueError(msg)
            if duplicate_labels := accumulate_tasks.keys() & tmp_IR.tasks.keys():
                msg = (
                    "Task labels in the imported pipelines must be unique. "
                    f"These labels appear multiple times: {duplicate_labels}"
                )
                raise ValueError(msg)
            accumulate_tasks.update(tmp_IR.tasks)
            self.contracts.extend(tmp_IR.contracts)
            # Verify that tmp_IR has unique labels for named subsets among
            # existing labeled subsets, and with existing task labels.
            overlapping_subsets = accumulate_labeled_subsets.keys() & tmp_IR.labeled_subsets.keys()
            task_subset_overlap = (
                accumulate_labeled_subsets.keys() | tmp_IR.labeled_subsets.keys()
            ) & accumulate_tasks.keys()
            if overlapping_subsets or task_subset_overlap:
                raise ValueError(
                    "Labeled subset names must be unique amongst imports in both labels and "
                    f"named Subsets. Duplicate: {overlapping_subsets | task_subset_overlap}"
                )
            accumulate_labeled_subsets.update(tmp_IR.labeled_subsets)
            accumulated_parameters.update(tmp_IR.parameters)

        # Verify that any accumulated labeled subsets don't clash with a
        # label from this pipeline.
        if accumulate_labeled_subsets.keys() & self.tasks.keys():
            raise ValueError(
                "Labeled subset names must be unique amongst imports in both labels and named Subsets"
            )
        # Merge in the named subsets for self so this document can override
        # any that have been declared.
        accumulate_labeled_subsets.update(self.labeled_subsets)
        self.labeled_subsets = accumulate_labeled_subsets

        # Merge the dict of label:TaskIR objects, preserving any configs in
        # the imported pipeline if the labels point to the same class.
        for label, task in self.tasks.items():
            if label not in accumulate_tasks:
                accumulate_tasks[label] = task
            elif accumulate_tasks[label].klass == task.klass:
                if task.config is not None:
                    for config in task.config:
                        accumulate_tasks[label].add_or_update_config(config)
            else:
                accumulate_tasks[label] = task
        self.tasks: dict[str, TaskIR] = accumulate_tasks
        accumulated_parameters.update(self.parameters)
        self.parameters = accumulated_parameters
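
    # Illustrative sketch (not part of the original module; documents are
    # hypothetical): merging another pipeline brings in its tasks; a label
    # duplicated between two merged pipelines would raise ValueError instead.
    #
    #     >>> base = PipelineIR.from_string("description: a\ntasks:\n  isr: modA\n")
    #     >>> other = PipelineIR.from_string("description: b\ntasks:\n  calibrate: modB\n")
    #     >>> base.merge_pipelines([other])
    #     >>> sorted(base.tasks)
    #     ['calibrate', 'isr']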

    def _read_tasks(self, loaded_yaml: dict[str, Any]) -> None:
        """Process the tasks portion of the loaded yaml document.

        Parameters
        ----------
        loaded_yaml : `dict`
            A dictionary which matches the structure that would be produced
            by a yaml reader which parses a pipeline definition document.
        """
        self.tasks = {}
        tmp_tasks = loaded_yaml.pop("tasks", None)
        if tmp_tasks is None:
            tmp_tasks = {}

        if "parameters" in tmp_tasks:
            raise ValueError("parameters is a reserved word and cannot be used as a task label")

        for label, definition in tmp_tasks.items():
            if isinstance(definition, str):
                definition = {"class": definition}
            config = definition.get("config", None)
            if config is None:
                task_config_ir = None
            else:
                if isinstance(config, dict):
                    config = [config]
                task_config_ir = []
                for c in config:
                    file = c.pop("file", None)
                    if file is None:
                        file = []
                    elif not isinstance(file, list):
                        file = [file]
                    task_config_ir.append(
                        ConfigIR(
                            python=c.pop("python", None), dataId=c.pop("dataId", None), file=file, rest=c
                        )
                    )
            self.tasks[label] = TaskIR(label, definition["class"], task_config_ir)

    def _remove_contracts(self, label: str) -> None:
        """Remove any contracts that contain the given label.

        String comparison used in this way is not the most elegant and may
        have issues, but it is the only feasible way when users can specify
        contracts with generic strings.
        """
        new_contracts = []
        for contract in self.contracts:
            # Match a label that is either at the start of the string or not
            # preceded by an ASCII identifier character, and is followed by
            # a dot.
            if re.match(f".*([^A-Za-z0-9_]|^){label}[.]", contract.contract):
                continue
            new_contracts.append(contract)
        self.contracts = new_contracts

    def subset_from_labels(self, labelSpecifier: set[str]) -> PipelineIR:
        """Subset a pipelineIR to contain only labels specified in
        labelSpecifier.

        Parameters
        ----------
        labelSpecifier : `set` of `str`
            Set containing labels that describe how to subset a pipeline.

        Returns
        -------
        pipeline : `PipelineIR`
            A new pipelineIR object that is a subset of the old pipelineIR.

        Raises
        ------
        ValueError
            Raised if there is an issue with specified labels.

        Notes
        -----
        This method attempts to prune any contracts that contain labels which
        are not in the declared subset of labels. This pruning is done using
        string-based matching due to the nature of contracts and may prune
        more than it should. Any labeled subsets defined that no longer have
        all members of the subset present in the pipeline will be removed
        from the resulting pipeline.
        """
        pipeline = copy.deepcopy(self)

        # Update the label specifier to expand any named subsets.
        toRemove = set()
        toAdd = set()
        for label in labelSpecifier:
            if label in pipeline.labeled_subsets:
                toRemove.add(label)
                toAdd.update(pipeline.labeled_subsets[label].subset)
        labelSpecifier.difference_update(toRemove)
        labelSpecifier.update(toAdd)
        # Verify all the labels are in the pipeline.
        if not labelSpecifier.issubset(pipeline.tasks.keys() | pipeline.labeled_subsets):
            difference = labelSpecifier.difference(pipeline.tasks.keys())
            raise ValueError(
                "Not all supplied labels (specified or named subsets) are in the pipeline "
                f"definition, extra labels: {difference}"
            )
        # Copy needed so as to not modify while iterating.
        pipeline_labels = set(pipeline.tasks.keys())
        # Remove the labels from the pipelineIR, and any contracts that
        # contain those labels (see docstring on _remove_contracts for why
        # this may cause issues).
        for label in pipeline_labels:
            if label not in labelSpecifier:
                pipeline.tasks.pop(label)
                pipeline._remove_contracts(label)

        # Create a copy of the object to iterate over.
        labeled_subsets = copy.copy(pipeline.labeled_subsets)
        # Remove any labeled subsets that no longer have a complete set of
        # members.
        for label, labeled_subset in labeled_subsets.items():
            if labeled_subset.subset - pipeline.tasks.keys():
                pipeline.labeled_subsets.pop(label)

        return pipeline
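
    # Illustrative sketch (not part of the original module; documents are
    # hypothetical): subsetting keeps only the requested labels and prunes
    # contracts that mention the removed ones.
    #
    #     >>> full = PipelineIR.from_string("description: demo\ntasks:\n  isr: modA\n  calibrate: modB\n")
    #     >>> sorted(full.subset_from_labels({"isr"}).tasks)
    #     ['isr']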

    @classmethod
    def from_string(cls, pipeline_string: str) -> PipelineIR:
        """Create a `PipelineIR` object from a string formatted like a
        pipeline document.

        Parameters
        ----------
        pipeline_string : `str`
            A string formatted like a pipeline document.
        """
        loaded_yaml = yaml.load(pipeline_string, Loader=PipelineYamlLoader)
        return cls(loaded_yaml)
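
    # Illustrative sketch (not part of the original module; the task class is
    # hypothetical): the smallest valid pipeline document needs a description
    # and at least one task.
    #
    #     >>> p = PipelineIR.from_string("description: demo\ntasks:\n  isr: modA\n")
    #     >>> list(p.tasks)
    #     ['isr']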

    @classmethod
    def from_uri(cls, uri: ResourcePathExpression) -> PipelineIR:
        """Create a `PipelineIR` object from the document specified by the
        input uri.

        Parameters
        ----------
        uri : convertible to `~lsst.resources.ResourcePath`
            Location of document to use in creating a `PipelineIR` object.

        Returns
        -------
        pipelineIR : `PipelineIR`
            The loaded pipeline.
        """
        loaded_uri = ResourcePath(uri)
        with loaded_uri.open("r") as buffer:
            loaded_yaml = yaml.load(buffer, Loader=PipelineYamlLoader)
            return cls(loaded_yaml)

    def write_to_uri(self, uri: ResourcePathExpression) -> None:
        """Serialize this `PipelineIR` object into a yaml formatted string
        and write the output to a file at the specified uri.

        Parameters
        ----------
        uri : convertible to `~lsst.resources.ResourcePath`
            Location of document to write a `PipelineIR` object.
        """
        with ResourcePath(uri).open("w") as buffer:
            yaml.dump(self.to_primitives(), buffer, sort_keys=False, Dumper=MultilineStringDumper)

    def to_primitives(self) -> dict[str, Any]:
        """Convert to a representation used in yaml serialization.

        Returns
        -------
        primitives : `dict`
            Dictionary that maps directly to the serialized YAML form.
        """
        accumulate = {"description": self.description}
        if self.instrument is not None:
            accumulate["instrument"] = self.instrument
        if self.parameters:
            accumulate["parameters"] = self.parameters.to_primitives()
        accumulate["tasks"] = {m: t.to_primitives() for m, t in self.tasks.items()}
        if len(self.contracts) > 0:
            # Sort contracts in lexicographical order by the contract string,
            # in the absence of any other ordering principle.
            contracts_list = [c.to_primitives() for c in self.contracts]
            contracts_list.sort(key=lambda x: x["contract"])
            accumulate["contracts"] = contracts_list
        if self.labeled_subsets:
            accumulate["subsets"] = {k: v.to_primitives() for k, v in self.labeled_subsets.items()}
        return accumulate
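
    # Illustrative sketch (not part of the original module; the document is
    # hypothetical): to_primitives round-trips through the yaml serialization
    # used by __str__.
    #
    #     >>> p = PipelineIR.from_string("description: demo\ntasks:\n  isr: modA\n")
    #     >>> p.to_primitives() == yaml.safe_load(str(p))
    #     True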

    def __str__(self) -> str:
        """Instance formatting as how it would look in yaml representation."""
        return yaml.dump(self.to_primitives(), sort_keys=False, Dumper=MultilineStringDumper)

    def __repr__(self) -> str:
        """Instance formatting as how it would look in yaml representation."""
        return str(self)

    def __eq__(self, other: object) -> bool:
        if not isinstance(other, PipelineIR):
            return False
        # Special-case contracts because it is a list whose order is not
        # important.
        return (
            all(
                getattr(self, attr) == getattr(other, attr)
                for attr in ("tasks", "instrument", "labeled_subsets", "parameters")
            )
            and len(self.contracts) == len(other.contracts)
            and all(c in self.contracts for c in other.contracts)
        )