# This file is part of pipe_base.
#
# Developed for the LSST Data Management System.
# This product includes software developed by the LSST Project
# (http://www.lsst.org).
# See the COPYRIGHT file at the top-level directory of this distribution
# for details of code ownership.
#
# This program is free software: you can redistribute it and/or modify
# it under the terms of the GNU General Public License as published by
# the Free Software Foundation, either version 3 of the License, or
# (at your option) any later version.
#
# This program is distributed in the hope that it will be useful,
# but WITHOUT ANY WARRANTY; without even the implied warranty of
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
# GNU General Public License for more details.
#
# You should have received a copy of the GNU General Public License
# along with this program. If not, see <http://www.gnu.org/licenses/>.
from __future__ import annotations

__all__ = (
    "ConfigIR",
    "ContractError",
    "ContractIR",
    "ImportIR",
    "LabeledSubset",
    "ParametersIR",
    "PipelineIR",
    "TaskIR",
)

import copy
import enum
import os
import re
import warnings
from collections import Counter
from collections.abc import Generator, Hashable, Iterable, MutableMapping
from dataclasses import dataclass, field
from typing import Any, Literal

import yaml
from lsst.resources import ResourcePath, ResourcePathExpression
from lsst.utils.introspection import find_outside_stacklevel


class _Tags(enum.Enum):
    KeepInstrument = enum.auto()


class PipelineYamlLoader(yaml.SafeLoader):
    """Specialized version of yaml's SafeLoader.

    It raises an exception if it finds multiple instances of the same key
    at a given scope inside a pipeline file.
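
    Examples
    --------
    A minimal sketch of the duplicate-key check; the document below is
    illustrative, not part of this module:

    .. code-block:: python

        import yaml

        yaml.load("{a: 1, a: 2}", Loader=PipelineYamlLoader)  # raises KeyError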

    """

    def construct_mapping(self, node: yaml.MappingNode, deep: bool = False) -> dict[Hashable, Any]:
        # Call super first so that it performs all of its other checks on
        # this node. Checking key uniqueness first would save super's work
        # when a duplicate is present, but if the node were the wrong node
        # because of a parsing error, the resulting exception would be
        # difficult to understand.
        mapping = super().construct_mapping(node, deep)
        # Check if there are any duplicate keys.
        all_keys = Counter(key_node.value for key_node, _ in node.value)
        duplicates = {k for k, i in all_keys.items() if i != 1}
        if duplicates:
            raise KeyError(
                f"Pipeline files must not have duplicated keys, {duplicates} appeared multiple times"
            )
        return mapping


class MultilineStringDumper(yaml.Dumper):
    """Custom YAML dumper that makes multi-line strings use the '|'
    continuation style instead of unreadable newlines and tons of quotes.

    The basic approach is taken from
    https://stackoverflow.com/questions/8640959/how-can-i-control-what-scalar-form-pyyaml-uses-for-my-data,
    but it is written as a Dumper subclass to keep its effects non-global
    (unlike `yaml.add_representer`).
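
    Examples
    --------
    A sketch of the effect on a multi-line value (hypothetical data, not
    part of this module):

    .. code-block:: python

        import yaml

        data = {"msg": "first line\\nsecond line\\n"}
        print(yaml.dump(data, Dumper=MultilineStringDumper))
        # msg: |
        #   first line
        #   second line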

    """

    def represent_scalar(self, tag: str, value: Any, style: str | None = None) -> yaml.ScalarNode:
        if style is None and tag == "tag:yaml.org,2002:str" and len(value.splitlines()) > 1:
            style = "|"
        return super().represent_scalar(tag, value, style)


class ContractError(Exception):
    """An exception that is raised when a pipeline contract is not satisfied"""

    pass


@dataclass
class ContractIR:
    """Intermediate representation of configuration contracts read from a
    pipeline yaml file.
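
    Examples
    --------
    In a pipeline file, contracts are declared under the top level
    ``contracts`` key; the task labels below are illustrative:

    .. code-block:: yaml

        contracts:
          - contract: taskA.connections.output == taskB.connections.input
            msg: "taskB must consume taskA's output"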

    """

    contract: str
    """A string of python code representing one or more conditions on configs
    in a pipeline. This code-as-string should, once evaluated, be `True` if
    the configs are fine, and `False` otherwise.
    """
    msg: str | None = None
    """An optional message to be shown to the user if a contract fails.
    """

    def to_primitives(self) -> dict[str, str]:
        """Convert to a representation used in yaml serialization."""
        accumulate = {"contract": self.contract}
        if self.msg is not None:
            accumulate["msg"] = self.msg
        return accumulate

    def __eq__(self, other: object) -> bool:
        if not isinstance(other, ContractIR):
            return False
        return self.contract == other.contract and self.msg == other.msg


@dataclass
class LabeledSubset:
    """Intermediate representation of a named subset of task labels read from
    a pipeline yaml file.
    """

    label: str
    """The label used to identify the subset of task labels.
    """
    subset: set[str]
    """A set of task labels contained in this subset.
    """
    description: str | None
    """A description of what this subset of tasks is intended to do.
    """

    @staticmethod
    def from_primitives(label: str, value: list[str] | dict) -> LabeledSubset:
        """Generate `LabeledSubset` objects given a properly formatted object
        that has been created by a yaml loader.

        Parameters
        ----------
        label : `str`
            The label that will be used to identify this labeled subset.
        value : `list` of `str` or `dict`
            Object returned from loading a labeled subset section from a yaml
            document.

        Returns
        -------
        labeledSubset : `LabeledSubset`
            A `LabeledSubset` object built from the inputs.

        Raises
        ------
        ValueError
            Raised if the value input is not properly formatted for parsing.
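
        Examples
        --------
        Both accepted forms, sketched with hypothetical task labels:

        .. code-block:: python

            LabeledSubset.from_primitives("step1", ["taskA", "taskB"])
            LabeledSubset.from_primitives(
                "step1",
                {"subset": ["taskA", "taskB"], "description": "The first step"},
            )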

        """
        if isinstance(value, MutableMapping):
            subset = value.pop("subset", None)
            if subset is None:
                raise ValueError(
                    "If a labeled subset is specified as a mapping, it must contain the key 'subset'"
                )
            description = value.pop("description", None)
        elif isinstance(value, Iterable):
            subset = value
            description = None
        else:
            raise ValueError(
                f"There was a problem parsing the labeled subset {label}; make sure the "
                "definition is either a valid yaml list, or a mapping with keys "
                "(subset, description), where subset points to a yaml list and description "
                "is a string"
            )
        return LabeledSubset(label, set(subset), description)

    def to_primitives(self) -> dict[str, list[str] | str]:
        """Convert to a representation used in yaml serialization."""
        accumulate: dict[str, list[str] | str] = {"subset": list(self.subset)}
        if self.description is not None:
            accumulate["description"] = self.description
        return accumulate


@dataclass
class ParametersIR:
    """Intermediate representation of parameters that are global to a pipeline.

    Parameters
    ----------
    mapping : `dict` [`str`, `str`]
        A mutable mapping of identifiers as keys, and shared configuration
        as values.

    Notes
    -----
    These parameters are specified under a top level key named ``parameters``
    and are declared as a yaml mapping. These entries can then be used inside
    task configuration blocks to specify configuration values. They may not be
    used in the special ``file`` or ``python`` blocks.

    Examples
    --------
    .. code-block:: yaml

        parameters:
          shared_value: 14
        tasks:
          taskA:
            class: modA
            config:
              field1: parameters.shared_value
          taskB:
            class: modB
            config:
              field2: parameters.shared_value
    """

    mapping: MutableMapping[str, Any]
    """A mutable mapping of identifiers as keys, and shared configuration
    as values.
    """

    def update(self, other: ParametersIR | None) -> None:
        if other is not None:
            self.mapping.update(other.mapping)

    def to_primitives(self) -> MutableMapping[str, str]:
        """Convert to a representation used in yaml serialization."""
        return self.mapping

    def __contains__(self, value: str) -> bool:
        return value in self.mapping

    def __getitem__(self, item: str) -> Any:
        return self.mapping[item]

    def __bool__(self) -> bool:
        return bool(self.mapping)


@dataclass
class ConfigIR:
    """Intermediate representation of configurations read from a pipeline yaml
    file.
    """

    python: str | None = None
    """A string of python code that is used to modify a configuration. This
    can also be `None` if there are no modifications to do.
    """
    dataId: dict | None = None
    """A dataId that is used to constrain these config overrides to only
    quanta with matching dataIds. This field can be `None` if there is no
    constraint. This is currently an unimplemented feature, and is placed here
    for future use.
    """
    file: list[str] = field(default_factory=list)
    """A list of paths which point to files containing config overrides to be
    applied. This value may be an empty list if there are no overrides to
    apply.
    """
    rest: dict = field(default_factory=dict)
    """This is a dictionary of key value pairs, where the keys are strings
    corresponding to qualified fields on a config to override, and the values
    are strings representing the values to apply.
    """

    def to_primitives(self) -> dict[str, str | dict | list[str]]:
        """Convert to a representation used in yaml serialization."""
        accumulate = {}
        for name in ("python", "dataId", "file"):
            # If this attribute is truthy, add it to the accumulation
            # dictionary.
            if getattr(self, name):
                accumulate[name] = getattr(self, name)
        # Add the dictionary containing the rest of the config keys to the
        # accumulated dictionary.
        accumulate.update(self.rest)
        return accumulate

    def formatted(self, parameters: ParametersIR) -> ConfigIR:
        """Return a new ConfigIR object that is formatted according to the
        specified parameters.

        Parameters
        ----------
        parameters : `ParametersIR`
            Object that contains variable mappings used in substitution.

        Returns
        -------
        config : `ConfigIR`
            A new ConfigIR object formatted with the input parameters.
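
        Examples
        --------
        A sketch of the substitution, with hypothetical values:

        .. code-block:: python

            params = ParametersIR({"shared_value": 14})
            config = ConfigIR(rest={"field1": "parameters.shared_value"})
            config.formatted(params).rest  # {"field1": 14}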

        """
        new_config = copy.deepcopy(self)
        for key, value in new_config.rest.items():
            if not isinstance(value, str):
                continue
            match = re.match("parameters[.](.*)", value)
            if match and match.group(1) in parameters:
                new_config.rest[key] = parameters[match.group(1)]
            if match and match.group(1) not in parameters:
                warnings.warn(
                    f"config {key} contains value {match.group(0)} which is formatted like a "
                    "Pipeline parameter but was not found within the Pipeline; if this was not "
                    "intentional, check for a typo",
                    stacklevel=find_outside_stacklevel("lsst.pipe.base"),
                )
        return new_config

    def maybe_merge(self, other_config: "ConfigIR") -> Generator["ConfigIR", None, None]:
        """Merge another instance of a `ConfigIR` into this instance if
        possible. This function returns a generator that yields only self
        if the configs were merged, or self and other_config if they could
        not be merged.

        Parameters
        ----------
        other_config : `ConfigIR`
            An instance of `ConfigIR` to merge into this instance.

        Returns
        -------
        Generator : `ConfigIR`
            A generator that yields either self alone, if the configs could
            be merged, or self and other_config, if they could not.
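
        Examples
        --------
        A sketch with hypothetical overrides; the two configs have no
        conflicting keys, so they merge into one:

        .. code-block:: python

            a = ConfigIR(rest={"field1": 1})
            b = ConfigIR(rest={"field2": 2})
            list(a.maybe_merge(b))  # [a], with a.rest holding both keys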

        """
        # Verify that the config blocks can be merged.
        if (
            self.dataId != other_config.dataId
            or self.python
            or other_config.python
            or self.file
            or other_config.file
        ):
            yield from (self, other_config)
            return

        # Find the keys common to both configs, and verify that no key has
        # different values in the two configs.
        key_intersection = self.rest.keys() & other_config.rest.keys()
        for key in key_intersection:
            if self.rest[key] != other_config.rest[key]:
                yield from (self, other_config)
                return
        self.rest.update(other_config.rest)

        # Combine the lists of override files to load.
        self_file_set = set(self.file)
        other_file_set = set(other_config.file)
        self.file = list(self_file_set.union(other_file_set))

        yield self

    def __eq__(self, other: object) -> bool:
        if not isinstance(other, ConfigIR):
            return False
        return all(
            getattr(self, attr) == getattr(other, attr) for attr in ("python", "dataId", "file", "rest")
        )


@dataclass
class TaskIR:
    """Intermediate representation of tasks read from a pipeline yaml file."""

    label: str
    """An identifier used to refer to a task.
    """
    klass: str
    """A string containing a fully qualified python class to be run in a
    pipeline.
    """
    config: list[ConfigIR] | None = None
    """A list of all config overrides associated with this task; may be
    `None` if there are no config overrides.
    """

    def to_primitives(self) -> dict[str, str | list[dict]]:
        """Convert to a representation used in yaml serialization."""
        accumulate: dict[str, str | list[dict]] = {"class": self.klass}
        if self.config:
            accumulate["config"] = [c.to_primitives() for c in self.config]
        return accumulate

    def add_or_update_config(self, other_config: ConfigIR) -> None:
        """Add a `ConfigIR` to this task if one is not present. If a
        `ConfigIR` is present and the dataId keys of both configs match,
        the configs are merged; otherwise a new entry is added to the config
        list. The exception to the above is that if either the last config or
        other_config has a python block, then other_config is always added,
        as python blocks can modify configs in ways that cannot be predicted.

        Parameters
        ----------
        other_config : `ConfigIR`
            A `ConfigIR` instance to add or merge into the config attribute of
            this task.
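
        Examples
        --------
        A sketch with hypothetical overrides; mergeable configs collapse
        into a single `ConfigIR`:

        .. code-block:: python

            task = TaskIR("taskA", "mod.TaskA", [ConfigIR(rest={"f1": 1})])
            task.add_or_update_config(ConfigIR(rest={"f2": 2}))
            len(task.config)  # 1, with both overrides merged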

        """
        if not self.config:
            self.config = [other_config]
            return
        self.config.extend(self.config.pop().maybe_merge(other_config))

    def __eq__(self, other: object) -> bool:
        if not isinstance(other, TaskIR):
            return False
        return all(getattr(self, attr) == getattr(other, attr) for attr in ("label", "klass", "config"))


@dataclass
class ImportIR:
    """An intermediate representation of imported pipelines"""

    location: str
    """This is the location of the pipeline to inherit. The path should be
    specified as an absolute path. Environment variables may be used in the
    path and should be specified as a python string template, with the name of
    the environment variable inside braces.
    """
    include: list[str] | None = None
    """A list of tasks that should be included when inheriting this pipeline.
    Either the include or exclude attributes may be specified, but not both.
    """
    exclude: list[str] | None = None
    """A list of tasks that should be excluded when inheriting this pipeline.
    Either the include or exclude attributes may be specified, but not both.
    """
    importContracts: bool = True
    """Boolean attribute to dictate if contracts should be inherited with the
    pipeline or not.
    """
    instrument: Literal[_Tags.KeepInstrument] | str | None = _Tags.KeepInstrument
    """Instrument to assign to the Pipeline at import. The default value of
    `_Tags.KeepInstrument` indicates that whatever instrument the pipeline is
    declared with will not be modified. Setting this value to `None` will drop
    any declared instrument prior to import.
    """

    def toPipelineIR(self) -> "PipelineIR":
        """Load in the Pipeline specified by this object, and turn it into a
        PipelineIR instance.

        Returns
        -------
        pipeline : `PipelineIR`
            A pipeline generated from the imported pipeline file.
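
        Examples
        --------
        An import section that would produce such an object might look like
        this (hypothetical location and label):

        .. code-block:: yaml

            imports:
              - location: ${PIPELINES_DIR}/base.yaml
                exclude: [taskC]
                importContracts: false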

        """
        if self.include and self.exclude:
            raise ValueError(
                "An include list and an exclude list cannot both be specified"
                " when declaring a pipeline import."
            )
        tmp_pipeline = PipelineIR.from_uri(os.path.expandvars(self.location))
        if self.instrument is not _Tags.KeepInstrument:
            tmp_pipeline.instrument = self.instrument

        included_labels = set()
        for label in tmp_pipeline.tasks:
            if (
                (self.include and label in self.include)
                or (self.exclude and label not in self.exclude)
                or (self.include is None and self.exclude is None)
            ):
                included_labels.add(label)

        # Handle labeled subsets being specified in the include or exclude
        # list, adding or removing labels.
        if self.include is not None:
            subsets_in_include = tmp_pipeline.labeled_subsets.keys() & self.include
            for label in subsets_in_include:
                included_labels.update(tmp_pipeline.labeled_subsets[label].subset)

        elif self.exclude is not None:
            subsets_in_exclude = tmp_pipeline.labeled_subsets.keys() & self.exclude
            for label in subsets_in_exclude:
                included_labels.difference_update(tmp_pipeline.labeled_subsets[label].subset)

        tmp_pipeline = tmp_pipeline.subset_from_labels(included_labels)

        if not self.importContracts:
            tmp_pipeline.contracts = []

        return tmp_pipeline

    def __eq__(self, other: object) -> bool:
        if not isinstance(other, ImportIR):
            return False
        return all(
            getattr(self, attr) == getattr(other, attr)
            for attr in ("location", "include", "exclude", "importContracts")
        )


class PipelineIR:
    """Intermediate representation of a pipeline definition.

    Parameters
    ----------
    loaded_yaml : `dict`
        A dictionary which matches the structure that would be produced by a
        yaml reader which parses a pipeline definition document.

    Raises
    ------
    ValueError
        Raised if:

        - a pipeline is declared without a description;
        - no tasks are declared in a pipeline, and no pipelines are to be
          inherited;
        - more than one instrument is specified;
        - more than one inherited pipeline shares a label.
    """

    def __init__(self, loaded_yaml: dict[str, Any]):
        # Check required fields are present.
        if "description" not in loaded_yaml:
            raise ValueError("A pipeline must be declared with a description")
        if "tasks" not in loaded_yaml and len({"imports", "inherits"} - loaded_yaml.keys()) == 2:
            raise ValueError("A pipeline must be declared with one or more tasks")

        # The steps below must happen in this order.

        # Process pipeline description.
        self.description = loaded_yaml.pop("description")

        # Process tasks.
        self._read_tasks(loaded_yaml)

        # Process instrument keys.
        inst = loaded_yaml.pop("instrument", None)
        if isinstance(inst, list):
            raise ValueError("Only one top level instrument can be defined in a pipeline")
        self.instrument: str | None = inst

        # Process any contracts.
        self._read_contracts(loaded_yaml)

        # Process any defined parameters.
        self._read_parameters(loaded_yaml)

        # Process any named label subsets.
        self._read_labeled_subsets(loaded_yaml)

        # Process any inherited pipelines.
        self._read_imports(loaded_yaml)

        # Verify named subsets; must be done after inheriting.
        self._verify_labeled_subsets()

    def _read_contracts(self, loaded_yaml: dict[str, Any]) -> None:
        """Process the contracts portion of the loaded yaml document

        Parameters
        ----------
        loaded_yaml : `dict`
            A dictionary which matches the structure that would be produced by
            a yaml reader which parses a pipeline definition document
        """
        loaded_contracts = loaded_yaml.pop("contracts", [])
        if isinstance(loaded_contracts, str):
            loaded_contracts = [loaded_contracts]
        self.contracts: list[ContractIR] = []
        for contract in loaded_contracts:
            if isinstance(contract, dict):
                self.contracts.append(ContractIR(**contract))
            if isinstance(contract, str):
                self.contracts.append(ContractIR(contract=contract))

    def _read_parameters(self, loaded_yaml: dict[str, Any]) -> None:
        """Process the parameters portion of the loaded yaml document

        Parameters
        ----------
        loaded_yaml : `dict`
            A dictionary which matches the structure that would be produced by
            a yaml reader which parses a pipeline definition document
        """
        loaded_parameters = loaded_yaml.pop("parameters", {})
        if not isinstance(loaded_parameters, dict):
            raise ValueError("The parameters section must be a yaml mapping")
        self.parameters = ParametersIR(loaded_parameters)

    def _read_labeled_subsets(self, loaded_yaml: dict[str, Any]) -> None:
        """Process the subsets portion of the loaded yaml document

        Parameters
        ----------
        loaded_yaml : `dict`
            A dictionary which matches the structure that would be produced
            by a yaml reader which parses a pipeline definition document
        """
        loaded_subsets = loaded_yaml.pop("subsets", {})
        self.labeled_subsets: dict[str, LabeledSubset] = {}
        if not loaded_subsets and "subset" in loaded_yaml:
            raise ValueError("Top level key should be subsets and not subset, add an s")
        for key, value in loaded_subsets.items():
            self.labeled_subsets[key] = LabeledSubset.from_primitives(key, value)

    def _verify_labeled_subsets(self) -> None:
        """Verify that all the labels in each named subset exist within the
        pipeline.
        """
        # Verify that all labels defined in a labeled subset are in the
        # Pipeline.
        for labeled_subset in self.labeled_subsets.values():
            if not labeled_subset.subset.issubset(self.tasks.keys()):
                raise ValueError(
                    f"Labels {labeled_subset.subset - self.tasks.keys()} were not found in the "
                    "declared pipeline"
                )
        # Verify subset labels are not already task labels.
        label_intersection = self.labeled_subsets.keys() & self.tasks.keys()
        if label_intersection:
            raise ValueError(f"Labeled subsets cannot use the same label as a task: {label_intersection}")

    def _read_imports(self, loaded_yaml: dict[str, Any]) -> None:
        """Process the imports (formerly inherits) portion of the loaded yaml
        document

        Parameters
        ----------
        loaded_yaml : `dict`
            A dictionary which matches the structure that would be produced by
            a yaml reader which parses a pipeline definition document
        """

        def process_args(argument: str | dict) -> dict:
            if isinstance(argument, str):
                return {"location": argument}
            elif isinstance(argument, dict):
                if "exclude" in argument and isinstance(argument["exclude"], str):
                    argument["exclude"] = [argument["exclude"]]
                if "include" in argument and isinstance(argument["include"], str):
                    argument["include"] = [argument["include"]]
                if "instrument" in argument and argument["instrument"] == "None":
                    argument["instrument"] = None
            return argument

        if not {"inherits", "imports"} - loaded_yaml.keys():
            raise ValueError("Cannot define both inherits and imports sections, use imports")
        tmp_import = loaded_yaml.pop("inherits", None)
        if tmp_import is None:
            tmp_import = loaded_yaml.pop("imports", None)
        else:
            raise ValueError("The 'inherits' key is not supported. Please use the key 'imports' instead")
        if tmp_import is None:
            self.imports: list[ImportIR] = []
        elif isinstance(tmp_import, list):
            self.imports = [ImportIR(**process_args(args)) for args in tmp_import]
        else:
            self.imports = [ImportIR(**process_args(tmp_import))]

        self.merge_pipelines([fragment.toPipelineIR() for fragment in self.imports])

    def merge_pipelines(self, pipelines: Iterable[PipelineIR]) -> None:
        """Merge one or more other `PipelineIR` objects into this object.

        Parameters
        ----------
        pipelines : `~collections.abc.Iterable` of `PipelineIR` objects
            An `~collections.abc.Iterable` that contains one or more
            `PipelineIR` objects to merge into this object.

        Raises
        ------
        ValueError
            Raised if there is a conflict in instrument specifications.
            Raised if a task label appears in more than one of the input
            `PipelineIR` objects which are to be merged.
            Raised if a labeled subset appears in more than one of the input
            `PipelineIR` objects which are to be merged, or clashes with a
            subset existing in this object.
        """
        # Integrate any imported pipelines.
        accumulate_tasks: dict[str, TaskIR] = {}
        accumulate_labeled_subsets: dict[str, LabeledSubset] = {}
        accumulated_parameters = ParametersIR({})

        for tmp_IR in pipelines:
            if self.instrument is None:
                self.instrument = tmp_IR.instrument
            elif self.instrument != tmp_IR.instrument and tmp_IR.instrument is not None:
                msg = (
                    "Only one instrument can be declared in a pipeline or its imports. "
                    f"Top level pipeline defines {self.instrument} but pipeline to merge "
                    f"defines {tmp_IR.instrument}."
                )
                raise ValueError(msg)
            if duplicate_labels := accumulate_tasks.keys() & tmp_IR.tasks.keys():
                msg = (
                    "Task labels in the imported pipelines must be unique. "
                    f"These labels appear multiple times: {duplicate_labels}"
                )
                raise ValueError(msg)
            accumulate_tasks.update(tmp_IR.tasks)
            self.contracts.extend(tmp_IR.contracts)
            # Verify that tmp_IR's named subset labels are unique among the
            # existing labeled subsets and the existing task labels.
            overlapping_subsets = accumulate_labeled_subsets.keys() & tmp_IR.labeled_subsets.keys()
            task_subset_overlap = (
                accumulate_labeled_subsets.keys() | tmp_IR.labeled_subsets.keys()
            ) & accumulate_tasks.keys()
            if overlapping_subsets or task_subset_overlap:
                raise ValueError(
                    "Labeled subset names must be unique amongst imports in both labels and "
                    f"named subsets. Duplicate: {overlapping_subsets | task_subset_overlap}"
                )
            accumulate_labeled_subsets.update(tmp_IR.labeled_subsets)
            accumulated_parameters.update(tmp_IR.parameters)

        # Verify that any accumulated labeled subsets don't clash with a
        # label from this pipeline.
        if accumulate_labeled_subsets.keys() & self.tasks.keys():
            raise ValueError(
                "Labeled subset names must be unique amongst imports in both labels and named subsets"
            )
        # Merge in the named subsets for self so this document can override
        # any that have been declared.
        accumulate_labeled_subsets.update(self.labeled_subsets)
        self.labeled_subsets = accumulate_labeled_subsets

        # Merge the dict of label:TaskIR objects, preserving any configs in
        # the imported pipeline if the labels point to the same class.
        for label, task in self.tasks.items():
            if label not in accumulate_tasks:
                accumulate_tasks[label] = task
            elif accumulate_tasks[label].klass == task.klass:
                if task.config is not None:
                    for config in task.config:
                        accumulate_tasks[label].add_or_update_config(config)
            else:
                accumulate_tasks[label] = task
        self.tasks: dict[str, TaskIR] = accumulate_tasks
        accumulated_parameters.update(self.parameters)
        self.parameters = accumulated_parameters

    def _read_tasks(self, loaded_yaml: dict[str, Any]) -> None:
        """Process the tasks portion of the loaded yaml document

        Parameters
        ----------
        loaded_yaml : `dict`
            A dictionary which matches the structure that would be produced by
            a yaml reader which parses a pipeline definition document
        """
        self.tasks = {}
        tmp_tasks = loaded_yaml.pop("tasks", None)
        if tmp_tasks is None:
            tmp_tasks = {}

        if "parameters" in tmp_tasks:
            raise ValueError("parameters is a reserved word and cannot be used as a task label")

        for label, definition in tmp_tasks.items():
            if isinstance(definition, str):
                definition = {"class": definition}
            config = definition.get("config", None)
            if config is None:
                task_config_ir = None
            else:
                if isinstance(config, dict):
                    config = [config]
                task_config_ir = []
                for c in config:
                    file = c.pop("file", None)
                    if file is None:
                        file = []
                    elif not isinstance(file, list):
                        file = [file]
                    task_config_ir.append(
                        ConfigIR(
                            python=c.pop("python", None), dataId=c.pop("dataId", None), file=file, rest=c
                        )
                    )
            self.tasks[label] = TaskIR(label, definition["class"], task_config_ir)

    def _remove_contracts(self, label: str) -> None:
        """Remove any contracts that contain the given label

        String comparison used in this way is not the most elegant and may
        have issues, but it is the only feasible way when users can specify
        contracts with generic strings.
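
        Examples
        --------
        With ``label="taskA"`` the regex removes a contract such as
        ``"taskA.threshold < taskB.threshold"`` but keeps
        ``"mytaskA.threshold > 0"`` (hypothetical contract strings):

        .. code-block:: python

            re.match(r".*([^A-Za-z0-9_]|^)taskA[.]", "taskA.threshold < 1")    # matches
            re.match(r".*([^A-Za-z0-9_]|^)taskA[.]", "mytaskA.threshold > 0")  # None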

        """
        new_contracts = []
        for contract in self.contracts:
            # Match the label when it is not preceded by an identifier
            # character (or is at the start of the string) and is followed
            # by a dot.
            if re.match(f".*([^A-Za-z0-9_]|^){label}[.]", contract.contract):
                continue
            new_contracts.append(contract)
        self.contracts = new_contracts

    def subset_from_labels(self, labelSpecifier: set[str]) -> PipelineIR:
        """Subset a pipelineIR to contain only labels specified in
        labelSpecifier.

        Parameters
        ----------
        labelSpecifier : `set` of `str`
            Set containing labels that describe how to subset a pipeline.

        Returns
        -------
        pipeline : `PipelineIR`
            A new pipelineIR object that is a subset of the old pipelineIR.

        Raises
        ------
        ValueError
            Raised if there is an issue with the specified labels.

        Notes
        -----
        This method attempts to prune any contracts that contain labels which
        are not in the declared subset of labels. This pruning is done using
        string-based matching due to the nature of contracts, and may prune
        more than it should. Any labeled subsets defined that no longer have
        all members of the subset present in the pipeline will be removed from
        the resulting pipeline.
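
        Examples
        --------
        A sketch assuming ``pipeline_ir`` is a `PipelineIR` declaring tasks
        ``taskA`` and ``taskB``:

        .. code-block:: python

            subset = pipeline_ir.subset_from_labels({"taskA"})
            list(subset.tasks)  # ["taskA"]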

        """
        pipeline = copy.deepcopy(self)

        # Update the label specifier to expand any named subsets.
        toRemove = set()
        toAdd = set()
        for label in labelSpecifier:
            if label in pipeline.labeled_subsets:
                toRemove.add(label)
                toAdd.update(pipeline.labeled_subsets[label].subset)
        labelSpecifier.difference_update(toRemove)
        labelSpecifier.update(toAdd)
        # Verify that all the labels are in the pipeline.
        if not labelSpecifier.issubset(pipeline.tasks.keys() | pipeline.labeled_subsets):
            difference = labelSpecifier.difference(pipeline.tasks.keys())
            raise ValueError(
                "Not all supplied labels (specified or named subsets) are in the pipeline "
                f"definition, extra labels: {difference}"
            )
        # Copy needed so as to not modify while iterating.
        pipeline_labels = set(pipeline.tasks.keys())
        # Remove the labels from the pipelineIR, and any contracts that
        # contain those labels (see the docstring on _remove_contracts for
        # why this may cause issues).
        for label in pipeline_labels:
            if label not in labelSpecifier:
                pipeline.tasks.pop(label)
                pipeline._remove_contracts(label)

        # Create a copy of the object to iterate over.
        labeled_subsets = copy.copy(pipeline.labeled_subsets)
        # Remove any labeled subsets that no longer have a complete set of
        # members.
        for label, labeled_subset in labeled_subsets.items():
            if labeled_subset.subset - pipeline.tasks.keys():
                pipeline.labeled_subsets.pop(label)

        return pipeline

    @classmethod
    def from_string(cls, pipeline_string: str) -> PipelineIR:
        """Create a `PipelineIR` object from a string formatted like a
        pipeline document.

        Parameters
        ----------
        pipeline_string : `str`
            A string that is formatted like a pipeline document.

        Returns
        -------
        pipeline : `PipelineIR`
            The loaded pipeline.
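
        Examples
        --------
        A minimal, hypothetical pipeline document in yaml flow style:

        .. code-block:: python

            pipeline = PipelineIR.from_string(
                "{description: demo, tasks: {taskA: {class: mod.TaskA}}}"
            )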

        """
        loaded_yaml = yaml.load(pipeline_string, Loader=PipelineYamlLoader)
        return cls(loaded_yaml)

    @classmethod
    def from_uri(cls, uri: ResourcePathExpression) -> PipelineIR:
        """Create a `PipelineIR` object from the document specified by the
        input uri.

        Parameters
        ----------
        uri : convertible to `~lsst.resources.ResourcePath`
            Location of the document to use in creating a `PipelineIR` object.

        Returns
        -------
        pipelineIR : `PipelineIR`
            The loaded pipeline.
        """
        loaded_uri = ResourcePath(uri)
        with loaded_uri.open("r") as buffer:
            loaded_yaml = yaml.load(buffer, Loader=PipelineYamlLoader)
            return cls(loaded_yaml)

    def write_to_uri(self, uri: ResourcePathExpression) -> None:
        """Serialize this `PipelineIR` object into a yaml formatted string and
        write the output to a file at the specified uri.

        Parameters
        ----------
        uri : convertible to `~lsst.resources.ResourcePath`
            Location of the document to write a `PipelineIR` object to.
        """
        with ResourcePath(uri).open("w") as buffer:
            yaml.dump(self.to_primitives(), buffer, sort_keys=False, Dumper=MultilineStringDumper)

    def to_primitives(self) -> dict[str, Any]:
        """Convert to a representation used in yaml serialization.

        Returns
        -------
        primitives : `dict`
            Dictionary that maps directly to the serialized YAML form.
        """
        accumulate = {"description": self.description}
        if self.instrument is not None:
            accumulate["instrument"] = self.instrument
        if self.parameters:
            accumulate["parameters"] = self.parameters.to_primitives()
        accumulate["tasks"] = {m: t.to_primitives() for m, t in self.tasks.items()}
        if len(self.contracts) > 0:
            # Sort contracts in lexicographic order by the contract string,
            # in the absence of any other ordering principle.
            contracts_list = [c.to_primitives() for c in self.contracts]
            contracts_list.sort(key=lambda x: x["contract"])
            accumulate["contracts"] = contracts_list
        if self.labeled_subsets:
            accumulate["subsets"] = {k: v.to_primitives() for k, v in self.labeled_subsets.items()}
        return accumulate

    def __str__(self) -> str:
        """Instance formatting as how it would look in yaml representation."""
        return yaml.dump(self.to_primitives(), sort_keys=False, Dumper=MultilineStringDumper)

    def __repr__(self) -> str:
        """Instance formatting as how it would look in yaml representation."""
        return str(self)

    def __eq__(self, other: object) -> bool:
        if not isinstance(other, PipelineIR):
            return False
        # Special case contracts because it is a list, but order is not
        # important.
        return (
            all(
                getattr(self, attr) == getattr(other, attr)
                for attr in ("tasks", "instrument", "labeled_subsets", "parameters")
            )
            and len(self.contracts) == len(other.contracts)
            and all(c in self.contracts for c in other.contracts)
        )