Coverage for python/lsst/pipe/base/pipelineIR.py: 20%

407 statements  

« prev     ^ index     » next       coverage.py v6.4.4, created at 2022-08-27 02:39 -0700

1# This file is part of pipe_base. 

2# 

3# Developed for the LSST Data Management System. 

4# This product includes software developed by the LSST Project 

5# (http://www.lsst.org). 

6# See the COPYRIGHT file at the top-level directory of this distribution 

7# for details of code ownership. 

8# 

9# This program is free software: you can redistribute it and/or modify 

10# it under the terms of the GNU General Public License as published by 

11# the Free Software Foundation, either version 3 of the License, or 

12# (at your option) any later version. 

13# 

14# This program is distributed in the hope that it will be useful, 

15# but WITHOUT ANY WARRANTY; without even the implied warranty of 

16# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the 

17# GNU General Public License for more details. 

18# 

19# You should have received a copy of the GNU General Public License 

20# along with this program. If not, see <http://www.gnu.org/licenses/>. 

21from __future__ import annotations 

22 

__all__ = (
    "ConfigIR",
    "ContractError",
    "ContractIR",
    "ImportIR",
    "LabeledSubset",
    "ParametersIR",
    "PipelineIR",
    "TaskIR",
)

24 

25import copy 

26import enum 

27import os 

28import re 

29import warnings 

30from collections import Counter 

31from collections.abc import Iterable as abcIterable 

32from dataclasses import dataclass, field 

33from typing import Any, Dict, Generator, List, Literal, Mapping, MutableMapping, Optional, Set, Union 

34 

35import yaml 

36from lsst.resources import ResourcePath, ResourcePathExpression 

37 

38 

class _Tags(enum.Enum):
    # Sentinel used by ImportIR.instrument to distinguish "leave the declared
    # instrument alone" from an explicit None (which drops the instrument).
    KeepInstrument = enum.auto()

41 

42 

class PipelineYamlLoader(yaml.SafeLoader):
    """A specialization of yaml's SafeLoader that rejects documents containing
    multiple instances of the same key at a given scope, raising a `KeyError`
    instead of silently keeping the last value.
    """

    def construct_mapping(self, node: yaml.Node, deep: bool = False) -> Mapping[str, Any]:
        # Delegate to the base class first so all of its own validation runs.
        # Checking key uniqueness up front would save that work on failure,
        # but if the node were wrong due to a parse error the resulting
        # exception would be much harder to understand.
        mapping = super().construct_mapping(node, deep)
        # Count how many times each key appears at this scope.
        key_counts = Counter(key_node.value for key_node, _ in node.value)
        repeated = {key for key, count in key_counts.items() if count > 1}
        if repeated:
            raise KeyError(
                f"Pipeline files must not have duplicated keys, {repeated} appeared multiple times"
            )
        return mapping

65 

66 

class MultilineStringDumper(yaml.Dumper):
    """Custom YAML dumper that serializes multi-line strings with the '|'
    block style rather than escaped newlines and heavy quoting.

    Basic approach is taken from
    https://stackoverflow.com/questions/8640959/how-can-i-control-what-scalar-form-pyyaml-uses-for-my-data,
    but is written as a Dumper subclass to make its effects non-global (vs
    `yaml.add_representer`).
    """

    def represent_scalar(self, tag: str, value: Any, style: Optional[str] = None) -> yaml.ScalarNode:
        # Only override the style for plain strings that actually span
        # multiple lines and have no explicit style already chosen.
        use_block = style is None and tag == "tag:yaml.org,2002:str" and len(value.splitlines()) > 1
        return super().represent_scalar(tag, value, "|" if use_block else style)

81 

82 

class ContractError(Exception):
    """An exception that is raised when a pipeline contract is not satisfied"""

87 

88 

@dataclass
class ContractIR:
    """Intermediate representation of configuration contracts read from a
    pipeline yaml file."""

    contract: str
    """A string of python code representing one or more conditions on configs
    in a pipeline. Once evaluated, this code-as-string should be True if the
    configs are fine, and False otherwise.
    """
    msg: Union[str, None] = None
    """An optional message to be shown to the user if a contract fails
    """

    def to_primitives(self) -> Dict[str, str]:
        """Convert to a representation used in yaml serialization"""
        primitives = {"contract": self.contract}
        # The message key is omitted entirely when no message was declared.
        if self.msg is not None:
            primitives["msg"] = self.msg
        return primitives

    def __eq__(self, other: object) -> bool:
        # Only another ContractIR can compare equal; anything else is False.
        if not isinstance(other, ContractIR):
            return False
        return self.contract == other.contract and self.msg == other.msg

117 

118 

@dataclass
class LabeledSubset:
    """Intermediate representation of named subset of task labels read from
    a pipeline yaml file.
    """

    label: str
    """The label used to identify the subset of task labels.
    """
    subset: Set[str]
    """A set of task labels contained in this subset.
    """
    description: Optional[str]
    """A description of what this subset of tasks is intended to do
    """

    @staticmethod
    def from_primitives(label: str, value: Union[List[str], dict]) -> LabeledSubset:
        """Generate `LabeledSubset` objects given a properly formatted object
        that has been created by a yaml loader.

        Parameters
        ----------
        label : `str`
            The label that will be used to identify this labeled subset.
        value : `list` of `str` or `dict`
            Object returned from loading a labeled subset section from a yaml
            document.

        Returns
        -------
        labeledSubset : `LabeledSubset`
            A `LabeledSubset` object build from the inputs.

        Raises
        ------
        ValueError
            Raised if the value input is not properly formatted for parsing
        """
        if isinstance(value, MutableMapping):
            subset = value.pop("subset", None)
            if subset is None:
                raise ValueError(
                    "If a labeled subset is specified as a mapping, it must contain the key 'subset'"
                )
            description = value.pop("description", None)
        elif isinstance(value, abcIterable) and not isinstance(value, str):
            # A bare str is also Iterable, but treating it as one would
            # silently build a subset of single characters; such input is
            # malformed and falls through to the error below instead.
            subset = value
            description = None
        else:
            raise ValueError(
                f"There was a problem parsing the labeled subset {label}, make sure the "
                "definition is either a valid yaml list, or a mapping with keys "
                "(subset, description) where subset points to a yaml list, and description is "
                "associated with a string"
            )
        return LabeledSubset(label, set(subset), description)

    def to_primitives(self) -> Dict[str, Union[List[str], str]]:
        """Convert to a representation used in yaml serialization"""
        accumulate: Dict[str, Union[List[str], str]] = {"subset": list(self.subset)}
        # The description key is omitted when no description was declared.
        if self.description is not None:
            accumulate["description"] = self.description
        return accumulate

183 

184 

@dataclass
class ParametersIR:
    """Intermediate representation of parameters that are global to a pipeline

    These parameters are specified under a top level key named `parameters`
    and are declared as a yaml mapping. These entries can then be used inside
    task configuration blocks to specify configuration values. They may not be
    used in the special ``file`` or ``python`` blocks.

    Example:
    parameters:
      shared_value: 14
    tasks:
      taskA:
        class: modA
        config:
          field1: parameters.shared_value
      taskB:
        class: modB
        config:
          field2: parameters.shared_value
    """

    mapping: MutableMapping[str, str]
    """A mutable mapping of identifiers as keys, and shared configuration
    as values.
    """

    def update(self, other: Optional[ParametersIR]) -> None:
        """Merge another ParametersIR's entries into this one; entries from
        ``other`` win on key collisions. A None argument is a no-op."""
        if other is None:
            return
        self.mapping.update(other.mapping)

    def to_primitives(self) -> MutableMapping[str, str]:
        """Convert to a representation used in yaml serialization"""
        return self.mapping

    def __contains__(self, value: str) -> bool:
        return value in self.mapping

    def __getitem__(self, item: str) -> Any:
        return self.mapping[item]

    def __bool__(self) -> bool:
        # A ParametersIR is truthy exactly when it holds any parameters.
        return bool(self.mapping)

229 

230 

@dataclass
class ConfigIR:
    """Intermediate representation of configurations read from a pipeline yaml
    file.
    """

    python: Union[str, None] = None
    """A string of python code that is used to modify a configuration. This can
    also be None if there are no modifications to do.
    """
    dataId: Union[dict, None] = None
    """A dataId that is used to constrain these config overrides to only quanta
    with matching dataIds. This field can be None if there is no constraint.
    This is currently an unimplemented feature, and is placed here for future
    use.
    """
    file: List[str] = field(default_factory=list)
    """A list of paths which points to a file containing config overrides to be
    applied. This value may be an empty list if there are no overrides to
    apply.
    """
    rest: dict = field(default_factory=dict)
    """This is a dictionary of key value pairs, where the keys are strings
    corresponding to qualified fields on a config to override, and the values
    are strings representing the values to apply.
    """

    def to_primitives(self) -> Dict[str, Union[str, dict, List[str]]]:
        """Convert to a representation used in yaml serialization"""
        accumulate = {}
        for name in ("python", "dataId", "file"):
            # If this attribute is truthy add it to the accumulation
            # dictionary
            if getattr(self, name):
                accumulate[name] = getattr(self, name)
        # Add the dictionary containing the rest of the config keys to the
        # accumulated dictionary
        accumulate.update(self.rest)
        return accumulate

    def formatted(self, parameters: ParametersIR) -> ConfigIR:
        """Returns a new ConfigIR object that is formatted according to the
        specified parameters

        Parameters
        ----------
        parameters : ParametersIR
            Object that contains variable mappings used in substitution.

        Returns
        -------
        config : ConfigIR
            A new ConfigIR object formatted with the input parameters
        """
        new_config = copy.deepcopy(self)
        for key, value in new_config.rest.items():
            if not isinstance(value, str):
                continue
            match = re.match(r"parameters[.](.*)", value)
            if match is None:
                continue
            if match.group(1) in parameters:
                # Substitute the declared parameter value for the reference.
                new_config.rest[key] = parameters[match.group(1)]
            else:
                # Looks like a parameter reference but no such parameter was
                # declared; warn rather than fail, as the string might be a
                # literal config value.
                warnings.warn(
                    f"config {key} contains value {match.group(0)} which is formatted like a "
                    "Pipeline parameter but was not found within the Pipeline, if this was not "
                    "intentional, check for a typo"
                )
        return new_config

    def maybe_merge(self, other_config: "ConfigIR") -> Generator["ConfigIR", None, None]:
        """Merges another instance of a `ConfigIR` into this instance if
        possible. This function returns a generator that is either self
        if the configs were merged, or self, and other_config if that could
        not be merged.

        Parameters
        ----------
        other_config : `ConfigIR`
            An instance of `ConfigIR` to merge into this instance.

        Returns
        -------
        Generator : `ConfigIR`
            A generator containing either self, or self and other_config if
            the configs could be merged or not respectively.
        """
        # Config blocks can only be merged when neither carries python code or
        # override files and both target the same dataId; python code can
        # change configs in unpredictable ways, so order must be preserved.
        if (
            self.dataId != other_config.dataId
            or self.python
            or other_config.python
            or self.file
            or other_config.file
        ):
            yield from (self, other_config)
            return

        # Find the keys declared by both configs and verify that none of them
        # carry conflicting values.
        shared_keys = self.rest.keys() & other_config.rest.keys()
        for key in shared_keys:
            if self.rest[key] != other_config.rest[key]:
                yield from (self, other_config)
                return
        self.rest.update(other_config.rest)

        # Combine the lists of override files to load
        self_file_set = set(self.file)
        other_file_set = set(other_config.file)
        self.file = list(self_file_set.union(other_file_set))

        yield self

    def __eq__(self, other: object) -> bool:
        # Equal only to another ConfigIR whose every field matches.
        if not isinstance(other, ConfigIR):
            return False
        return all(
            getattr(self, attr) == getattr(other, attr) for attr in ("python", "dataId", "file", "rest")
        )

353 

354 

@dataclass
class TaskIR:
    """Intermediate representation of tasks read from a pipeline yaml file."""

    label: str
    """An identifier used to refer to a task.
    """
    klass: str
    """A string containing a fully qualified python class to be run in a
    pipeline.
    """
    config: Union[List[ConfigIR], None] = None
    """List of all configs overrides associated with this task, and may be
    `None` if there are no config overrides.
    """

    def to_primitives(self) -> Dict[str, Union[str, List[dict]]]:
        """Convert to a representation used in yaml serialization"""
        primitives: Dict[str, Union[str, List[dict]]] = {"class": self.klass}
        # The config key is only emitted when overrides are present.
        if self.config:
            primitives["config"] = [conf.to_primitives() for conf in self.config]
        return primitives

    def add_or_update_config(self, other_config: ConfigIR) -> None:
        """Add a `ConfigIR` to this task, merging where possible.

        If no configs are present yet, ``other_config`` simply becomes the
        config list. Otherwise the last config in the list attempts to merge
        ``other_config`` into itself; if the two cannot be merged (differing
        dataIds, or either has python/file blocks) both are kept, preserving
        order.

        Parameters
        ----------
        other_config : `ConfigIR`
            A `ConfigIR` instance to add or merge into the config attribute of
            this task.
        """
        if not self.config:
            self.config = [other_config]
        else:
            last = self.config.pop()
            self.config.extend(last.maybe_merge(other_config))

    def __eq__(self, other: object) -> bool:
        # Equal only to another TaskIR whose label, class, and configs match.
        if not isinstance(other, TaskIR):
            return False
        return (self.label, self.klass, self.config) == (other.label, other.klass, other.config)

404 

405 

@dataclass
class ImportIR:
    """An intermediate representation of imported pipelines"""

    location: str
    """This is the location of the pipeline to inherit. The path should be
    specified as an absolute path. Environment variables may be used in the
    path and should be specified as a python string template, with the name of
    the environment variable inside braces.
    """
    include: Union[List[str], None] = None
    """List of tasks that should be included when inheriting this pipeline.
    Either the include or exclude attributes may be specified, but not both.
    """
    exclude: Union[List[str], None] = None
    """List of tasks that should be excluded when inheriting this pipeline.
    Either the include or exclude attributes may be specified, but not both.
    """
    importContracts: bool = True
    """Boolean attribute to dictate if contracts should be inherited with the
    pipeline or not.
    """
    instrument: Union[Literal[_Tags.KeepInstrument], str, None] = _Tags.KeepInstrument
    """Instrument to assign to the Pipeline at import. The default value of
    `_Tags.KeepInstrument`` indicates that whatever instrument the pipeline is
    declared with will not be modified. Setting this value to None will drop
    any declared instrument prior to import.
    """

    def toPipelineIR(self) -> "PipelineIR":
        """Load in the Pipeline specified by this object, and turn it into a
        PipelineIR instance.

        Returns
        -------
        pipeline : `PipelineIR`
            A pipeline generated from the imported pipeline file
        """
        if self.include and self.exclude:
            raise ValueError(
                "Both an include and an exclude list cant be specified when declaring a pipeline import"
            )
        # Environment variables in the location are expanded before loading.
        tmp_pipeline = PipelineIR.from_uri(os.path.expandvars(self.location))
        if self.instrument is not _Tags.KeepInstrument:
            # Either a concrete instrument or None (drop) was requested.
            tmp_pipeline.instrument = self.instrument

        # A task label survives filtering if it is explicitly included, not
        # explicitly excluded, or if no filtering at all was requested.
        included_labels = set()
        for label in tmp_pipeline.tasks:
            if (
                (self.include and label in self.include)
                or (self.exclude and label not in self.exclude)
                or (self.include is None and self.exclude is None)
            ):
                included_labels.add(label)

        # Handle labeled subsets being specified in the include or exclude
        # list, adding or removing labels.
        if self.include is not None:
            subsets_in_include = tmp_pipeline.labeled_subsets.keys() & self.include
            for label in subsets_in_include:
                included_labels.update(tmp_pipeline.labeled_subsets[label].subset)

        elif self.exclude is not None:
            subsets_in_exclude = tmp_pipeline.labeled_subsets.keys() & self.exclude
            for label in subsets_in_exclude:
                included_labels.difference_update(tmp_pipeline.labeled_subsets[label].subset)

        # Restrict the loaded pipeline to the surviving labels; this also
        # prunes contracts and incomplete labeled subsets.
        tmp_pipeline = tmp_pipeline.subset_from_labels(included_labels)

        if not self.importContracts:
            tmp_pipeline.contracts = []

        return tmp_pipeline

    def __eq__(self, other: object) -> bool:
        # NOTE(review): ``instrument`` is absent from this comparison, so two
        # imports differing only in instrument compare equal — confirm whether
        # that is intended.
        if not isinstance(other, ImportIR):
            return False
        elif all(
            getattr(self, attr) == getattr(other, attr)
            for attr in ("location", "include", "exclude", "importContracts")
        ):
            return True
        else:
            return False

490 

491 

492class PipelineIR: 

493 """Intermediate representation of a pipeline definition 

494 

495 Parameters 

496 ---------- 

497 loaded_yaml : `dict` 

498 A dictionary which matches the structure that would be produced by a 

499 yaml reader which parses a pipeline definition document 

500 

501 Raises 

502 ------ 

503 ValueError 

504 Raised if: 

505 

506 - a pipeline is declared without a description; 

507 - no tasks are declared in a pipeline, and no pipelines are to be 

508 inherited; 

509 - more than one instrument is specified; 

510 - more than one inherited pipeline share a label. 

511 """ 

512 

    def __init__(self, loaded_yaml: Dict[str, Any]):
        # Check required fields are present
        if "description" not in loaded_yaml:
            raise ValueError("A pipeline must be declared with a description")
        # A pipeline without tasks is only valid when it imports (or, via the
        # deprecated key, inherits) another pipeline.
        if "tasks" not in loaded_yaml and len({"imports", "inherits"} - loaded_yaml.keys()) == 2:
            raise ValueError("A pipeline must be declared with one or more tasks")

        # These steps below must happen in this call order

        # Process pipeline description
        self.description = loaded_yaml.pop("description")

        # Process tasks
        self._read_tasks(loaded_yaml)

        # Process instrument keys
        inst = loaded_yaml.pop("instrument", None)
        if isinstance(inst, list):
            raise ValueError("Only one top level instrument can be defined in a pipeline")
        self.instrument: Optional[str] = inst

        # Process any contracts
        self._read_contracts(loaded_yaml)

        # Process any defined parameters
        self._read_parameters(loaded_yaml)

        # Process any named label subsets
        self._read_labeled_subsets(loaded_yaml)

        # Process any inherited pipelines; this merges imported tasks,
        # contracts, subsets, and parameters into this document's.
        self._read_imports(loaded_yaml)

        # verify named subsets, must be done after inheriting
        self._verify_labeled_subsets()

548 

549 def _read_contracts(self, loaded_yaml: Dict[str, Any]) -> None: 

550 """Process the contracts portion of the loaded yaml document 

551 

552 Parameters 

553 --------- 

554 loaded_yaml : `dict` 

555 A dictionary which matches the structure that would be produced by 

556 a yaml reader which parses a pipeline definition document 

557 """ 

558 loaded_contracts = loaded_yaml.pop("contracts", []) 

559 if isinstance(loaded_contracts, str): 

560 loaded_contracts = [loaded_contracts] 

561 self.contracts: List[ContractIR] = [] 

562 for contract in loaded_contracts: 

563 if isinstance(contract, dict): 

564 self.contracts.append(ContractIR(**contract)) 

565 if isinstance(contract, str): 

566 self.contracts.append(ContractIR(contract=contract)) 

567 

568 def _read_parameters(self, loaded_yaml: Dict[str, Any]) -> None: 

569 """Process the parameters portion of the loaded yaml document 

570 

571 Parameters 

572 --------- 

573 loaded_yaml : `dict` 

574 A dictionary which matches the structure that would be produced by 

575 a yaml reader which parses a pipeline definition document 

576 """ 

577 loaded_parameters = loaded_yaml.pop("parameters", {}) 

578 if not isinstance(loaded_parameters, dict): 

579 raise ValueError("The parameters section must be a yaml mapping") 

580 self.parameters = ParametersIR(loaded_parameters) 

581 

582 def _read_labeled_subsets(self, loaded_yaml: Dict[str, Any]) -> None: 

583 """Process the subsets portion of the loaded yaml document 

584 

585 Parameters 

586 ---------- 

587 loaded_yaml: `MutableMapping` 

588 A dictionary which matches the structure that would be produced 

589 by a yaml reader which parses a pipeline definition document 

590 """ 

591 loaded_subsets = loaded_yaml.pop("subsets", {}) 

592 self.labeled_subsets: Dict[str, LabeledSubset] = {} 

593 if not loaded_subsets and "subset" in loaded_yaml: 

594 raise ValueError("Top level key should be subsets and not subset, add an s") 

595 for key, value in loaded_subsets.items(): 

596 self.labeled_subsets[key] = LabeledSubset.from_primitives(key, value) 

597 

598 def _verify_labeled_subsets(self) -> None: 

599 """Verifies that all the labels in each named subset exist within the 

600 pipeline. 

601 """ 

602 # Verify that all labels defined in a labeled subset are in the 

603 # Pipeline 

604 for labeled_subset in self.labeled_subsets.values(): 

605 if not labeled_subset.subset.issubset(self.tasks.keys()): 

606 raise ValueError( 

607 f"Labels {labeled_subset.subset - self.tasks.keys()} were not found in the " 

608 "declared pipeline" 

609 ) 

610 # Verify subset labels are not already task labels 

611 label_intersection = self.labeled_subsets.keys() & self.tasks.keys() 

612 if label_intersection: 

613 raise ValueError(f"Labeled subsets can not use the same label as a task: {label_intersection}") 

614 

    def _read_imports(self, loaded_yaml: Dict[str, Any]) -> None:
        """Process the inherits portion of the loaded yaml document

        Parameters
        ----------
        loaded_yaml : `dict`
            A dictionary which matches the structure that would be produced by
            a yaml reader which parses a pipeline definition document
        """

        def process_args(argument: Union[str, dict]) -> dict:
            # Normalize one import declaration into the keyword form accepted
            # by the ImportIR constructor: a bare string is a location;
            # scalar include/exclude become single-element lists; the string
            # "None" for instrument means "drop the instrument".
            if isinstance(argument, str):
                return {"location": argument}
            elif isinstance(argument, dict):
                if "exclude" in argument and isinstance(argument["exclude"], str):
                    argument["exclude"] = [argument["exclude"]]
                if "include" in argument and isinstance(argument["include"], str):
                    argument["include"] = [argument["include"]]
                if "instrument" in argument and argument["instrument"] == "None":
                    argument["instrument"] = None
                return argument

        # Both keys present at once is ambiguous and rejected.
        if not {"inherits", "imports"} - loaded_yaml.keys():
            raise ValueError("Cannot define both inherits and imports sections, use imports")
        tmp_import = loaded_yaml.pop("inherits", None)
        if tmp_import is None:
            tmp_import = loaded_yaml.pop("imports", None)
        else:
            warnings.warn(
                "The 'inherits' key is deprecated, and will be "
                "removed around June 2021. Please use the key "
                "'imports' instead"
            )
        if tmp_import is None:
            self.imports: List[ImportIR] = []
        elif isinstance(tmp_import, list):
            self.imports = [ImportIR(**process_args(args)) for args in tmp_import]
        else:
            self.imports = [ImportIR(**process_args(tmp_import))]

        # integrate any imported pipelines
        accumulate_tasks: Dict[str, TaskIR] = {}
        accumulate_labeled_subsets: Dict[str, LabeledSubset] = {}
        accumulated_parameters = ParametersIR({})
        for other_pipeline in self.imports:
            tmp_IR = other_pipeline.toPipelineIR()
            # The first import with an instrument wins; any later import
            # declaring a *different* instrument is an error.
            if self.instrument is None:
                self.instrument = tmp_IR.instrument
            elif self.instrument != tmp_IR.instrument and tmp_IR.instrument is not None:
                msg = (
                    "Only one instrument can be declared in a pipeline or its imports. "
                    f"Top level pipeline defines {self.instrument} but {other_pipeline.location} "
                    f"defines {tmp_IR.instrument}."
                )
                raise ValueError(msg)
            # Task labels may not collide across imported pipelines.
            if duplicate_labels := accumulate_tasks.keys() & tmp_IR.tasks.keys():
                msg = (
                    "Task labels in the imported pipelines must be unique. "
                    f"These labels appear multiple times: {duplicate_labels}"
                )
                raise ValueError(msg)
            accumulate_tasks.update(tmp_IR.tasks)
            self.contracts.extend(tmp_IR.contracts)
            # verify that tmp_IR has unique labels for named subset among
            # existing labeled subsets, and with existing task labels.
            overlapping_subsets = accumulate_labeled_subsets.keys() & tmp_IR.labeled_subsets.keys()
            task_subset_overlap = (
                accumulate_labeled_subsets.keys() | tmp_IR.labeled_subsets.keys()
            ) & accumulate_tasks.keys()
            if overlapping_subsets or task_subset_overlap:
                raise ValueError(
                    "Labeled subset names must be unique amongst imports in both labels and "
                    f" named Subsets. Duplicate: {overlapping_subsets | task_subset_overlap}"
                )
            accumulate_labeled_subsets.update(tmp_IR.labeled_subsets)
            accumulated_parameters.update(tmp_IR.parameters)

        # verify that any accumulated labeled subsets dont clash with a label
        # from this pipeline
        if accumulate_labeled_subsets.keys() & self.tasks.keys():
            raise ValueError(
                "Labeled subset names must be unique amongst imports in both labels and named Subsets"
            )
        # merge in the named subsets for self so this document can override any
        # that have been declared
        accumulate_labeled_subsets.update(self.labeled_subsets)
        self.labeled_subsets = accumulate_labeled_subsets

        # merge the dict of label:TaskIR objects, preserving any configs in the
        # imported pipeline if the labels point to the same class
        for label, task in self.tasks.items():
            if label not in accumulate_tasks:
                accumulate_tasks[label] = task
            elif accumulate_tasks[label].klass == task.klass:
                # Same class: layer this document's config overrides on top of
                # the imported task's.
                if task.config is not None:
                    for config in task.config:
                        accumulate_tasks[label].add_or_update_config(config)
            else:
                # Different class: this document's declaration replaces the
                # imported one entirely.
                accumulate_tasks[label] = task
        self.tasks: Dict[str, TaskIR] = accumulate_tasks
        # This document's parameters override any imported ones.
        accumulated_parameters.update(self.parameters)
        self.parameters = accumulated_parameters

717 

718 def _read_tasks(self, loaded_yaml: Dict[str, Any]) -> None: 

719 """Process the tasks portion of the loaded yaml document 

720 

721 Parameters 

722 --------- 

723 loaded_yaml : `dict` 

724 A dictionary which matches the structure that would be produced by 

725 a yaml reader which parses a pipeline definition document 

726 """ 

727 self.tasks = {} 

728 tmp_tasks = loaded_yaml.pop("tasks", None) 

729 if tmp_tasks is None: 

730 tmp_tasks = {} 

731 

732 if "parameters" in tmp_tasks: 

733 raise ValueError("parameters is a reserved word and cannot be used as a task label") 

734 

735 for label, definition in tmp_tasks.items(): 

736 if isinstance(definition, str): 

737 definition = {"class": definition} 

738 config = definition.get("config", None) 

739 if config is None: 

740 task_config_ir = None 

741 else: 

742 if isinstance(config, dict): 

743 config = [config] 

744 task_config_ir = [] 

745 for c in config: 

746 file = c.pop("file", None) 

747 if file is None: 

748 file = [] 

749 elif not isinstance(file, list): 

750 file = [file] 

751 task_config_ir.append( 

752 ConfigIR( 

753 python=c.pop("python", None), dataId=c.pop("dataId", None), file=file, rest=c 

754 ) 

755 ) 

756 self.tasks[label] = TaskIR(label, definition["class"], task_config_ir) 

757 

758 def _remove_contracts(self, label: str) -> None: 

759 """Remove any contracts that contain the given label 

760 

761 String comparison used in this way is not the most elegant and may 

762 have issues, but it is the only feasible way when users can specify 

763 contracts with generic strings. 

764 """ 

765 new_contracts = [] 

766 for contract in self.contracts: 

767 # match a label that is not preceded by an ASCII identifier, or 

768 # is the start of a line and is followed by a dot 

769 if re.match(f".*([^A-Za-z0-9_]|^){label}[.]", contract.contract): 

770 continue 

771 new_contracts.append(contract) 

772 self.contracts = new_contracts 

773 

    def subset_from_labels(self, labelSpecifier: Set[str]) -> PipelineIR:
        """Subset a pipelineIR to contain only labels specified in
        labelSpecifier.

        Parameters
        ----------
        labelSpecifier : `set` of `str`
            Set containing labels that describes how to subset a pipeline.

        Returns
        -------
        pipeline : `PipelineIR`
            A new pipelineIR object that is a subset of the old pipelineIR

        Raises
        ------
        ValueError
            Raised if there is an issue with specified labels

        Notes
        -----
        This method attempts to prune any contracts that contain labels which
        are not in the declared subset of labels. This pruning is done using a
        string based matching due to the nature of contracts and may prune more
        than it should. Any labeled subsets defined that no longer have all
        members of the subset present in the pipeline will be removed from the
        resulting pipeline.
        """

        # The returned pipeline shares nothing with self.
        pipeline = copy.deepcopy(self)

        # update the label specifier to expand any named subsets
        # NOTE(review): this mutates the caller's ``labelSpecifier`` set in
        # place — confirm callers do not rely on it being unchanged.
        toRemove = set()
        toAdd = set()
        for label in labelSpecifier:
            if label in pipeline.labeled_subsets:
                toRemove.add(label)
                toAdd.update(pipeline.labeled_subsets[label].subset)
        labelSpecifier.difference_update(toRemove)
        labelSpecifier.update(toAdd)
        # verify all the labels are in the pipeline
        if not labelSpecifier.issubset(pipeline.tasks.keys() | pipeline.labeled_subsets):
            difference = labelSpecifier.difference(pipeline.tasks.keys())
            raise ValueError(
                "Not all supplied labels (specified or named subsets) are in the pipeline "
                f"definition, extra labels: {difference}"
            )
        # copy needed so as to not modify while iterating
        pipeline_labels = set(pipeline.tasks.keys())
        # Remove the labels from the pipelineIR, and any contracts that contain
        # those labels (see docstring on _remove_contracts for why this may
        # cause issues)
        for label in pipeline_labels:
            if label not in labelSpecifier:
                pipeline.tasks.pop(label)
                pipeline._remove_contracts(label)

        # create a copy of the object to iterate over
        labeled_subsets = copy.copy(pipeline.labeled_subsets)
        # remove any labeled subsets that no longer have a complete set
        for label, labeled_subset in labeled_subsets.items():
            if labeled_subset.subset - pipeline.tasks.keys():
                pipeline.labeled_subsets.pop(label)

        return pipeline

839 

840 @classmethod 

841 def from_string(cls, pipeline_string: str) -> PipelineIR: 

842 """Create a `PipelineIR` object from a string formatted like a pipeline 

843 document 

844 

845 Parameters 

846 ---------- 

847 pipeline_string : `str` 

848 A string that is formatted according like a pipeline document 

849 """ 

850 loaded_yaml = yaml.load(pipeline_string, Loader=PipelineYamlLoader) 

851 return cls(loaded_yaml) 

852 

853 @classmethod 

854 def from_uri(cls, uri: ResourcePathExpression) -> PipelineIR: 

855 """Create a `PipelineIR` object from the document specified by the 

856 input uri. 

857 

858 Parameters 

859 ---------- 

860 uri: convertible to `ResourcePath` 

861 Location of document to use in creating a `PipelineIR` object. 

862 

863 Returns 

864 ------- 

865 pipelineIR : `PipelineIR` 

866 The loaded pipeline 

867 """ 

868 loaded_uri = ResourcePath(uri) 

869 with loaded_uri.open("r") as buffer: 

870 loaded_yaml = yaml.load(buffer, Loader=PipelineYamlLoader) 

871 return cls(loaded_yaml) 

872 

873 def write_to_uri( 

874 self, 

875 uri: ResourcePathExpression, 

876 ) -> None: 

877 """Serialize this `PipelineIR` object into a yaml formatted string and 

878 write the output to a file at the specified uri. 

879 

880 Parameters 

881 ---------- 

882 uri: convertible to `ResourcePath` 

883 Location of document to write a `PipelineIR` object. 

884 """ 

885 with ResourcePath(uri).open("w") as buffer: 

886 yaml.dump(self.to_primitives(), buffer, sort_keys=False, Dumper=MultilineStringDumper) 

887 

888 def to_primitives(self) -> Dict[str, Any]: 

889 """Convert to a representation used in yaml serialization""" 

890 accumulate = {"description": self.description} 

891 if self.instrument is not None: 

892 accumulate["instrument"] = self.instrument 

893 if self.parameters: 

894 accumulate["parameters"] = self._sort_by_str(self.parameters.to_primitives()) 

895 accumulate["tasks"] = {m: t.to_primitives() for m, t in self.tasks.items()} 

896 if len(self.contracts) > 0: 

897 # sort contracts lexicographical order by the contract string in 

898 # absence of any other ordering principle 

899 contracts_list = [c.to_primitives() for c in self.contracts] 

900 contracts_list.sort(key=lambda x: x["contract"]) 

901 accumulate["contracts"] = contracts_list 

902 if self.labeled_subsets: 

903 accumulate["subsets"] = self._sort_by_str( 

904 {k: v.to_primitives() for k, v in self.labeled_subsets.items()} 

905 ) 

906 return accumulate 

907 

908 def reorder_tasks(self, task_labels: List[str]) -> None: 

909 """Changes the order tasks are stored internally. Useful for 

910 determining the order things will appear in the serialized (or printed) 

911 form. 

912 

913 Parameters 

914 ---------- 

915 task_labels : `list` of `str` 

916 A list corresponding to all the labels in the pipeline inserted in 

917 the order the tasks are to be stored. 

918 

919 Raises 

920 ------ 

921 KeyError 

922 Raised if labels are supplied that are not in the pipeline, or if 

923 not all labels in the pipeline were supplied in task_labels input. 

924 """ 

925 # verify that all labels are in the input 

926 _tmp_set = set(task_labels) 

927 if remainder := (self.tasks.keys() - _tmp_set): 

928 raise KeyError(f"Label(s) {remainder} are missing from the task label list") 

929 if extra := (_tmp_set - self.tasks.keys()): 

930 raise KeyError(f"Extra label(s) {extra} were in the input and are not in the pipeline") 

931 

932 newTasks = {key: self.tasks[key] for key in task_labels} 

933 self.tasks = newTasks 

934 

935 @staticmethod 

936 def _sort_by_str(arg: Mapping[str, Any]) -> Mapping[str, Any]: 

937 keys = sorted(arg.keys()) 

938 return {key: arg[key] for key in keys} 

939 

940 def __str__(self) -> str: 

941 """Instance formatting as how it would look in yaml representation""" 

942 return yaml.dump(self.to_primitives(), sort_keys=False, Dumper=MultilineStringDumper) 

943 

944 def __repr__(self) -> str: 

945 """Instance formatting as how it would look in yaml representation""" 

946 return str(self) 

947 

948 def __eq__(self, other: object) -> bool: 

949 if not isinstance(other, PipelineIR): 

950 return False 

951 # special case contracts because it is a list, but order is not 

952 # important 

953 elif ( 

954 all( 

955 getattr(self, attr) == getattr(other, attr) 

956 for attr in ("tasks", "instrument", "labeled_subsets", "parameters") 

957 ) 

958 and len(self.contracts) == len(other.contracts) 

959 and all(c in self.contracts for c in other.contracts) 

960 ): 

961 return True 

962 else: 

963 return False