# This file is part of pipe_base.
#
# Developed for the LSST Data Management System.
# This product includes software developed by the LSST Project
# (http://www.lsst.org).
# See the COPYRIGHT file at the top-level directory of this distribution
# for details of code ownership.
#
# This program is free software: you can redistribute it and/or modify
# it under the terms of the GNU General Public License as published by
# the Free Software Foundation, either version 3 of the License, or
# (at your option) any later version.
#
# This program is distributed in the hope that it will be useful,
# but WITHOUT ANY WARRANTY; without even the implied warranty of
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
# GNU General Public License for more details.
#
# You should have received a copy of the GNU General Public License
# along with this program. If not, see <http://www.gnu.org/licenses/>.
from __future__ import annotations

__all__ = ("ConfigIR", "ContractError", "ContractIR", "ImportIR", "PipelineIR", "TaskIR", "LabeledSubset")

import copy
import enum
import os
import re
import warnings
from collections import Counter
from collections.abc import Iterable as abcIterable
from dataclasses import dataclass, field
from typing import Any, Dict, Generator, List, Literal, Mapping, MutableMapping, Optional, Set, Union

import yaml
from deprecated.sphinx import deprecated
from lsst.resources import ResourcePath, ResourcePathExpression


class _Tags(enum.Enum):
    KeepInstrument = enum.auto()


class PipelineYamlLoader(yaml.SafeLoader):
    """A specialized version of yaml's SafeLoader that raises an exception
    if it finds multiple instances of the same key at a given scope in a
    pipeline file.
    """

    def construct_mapping(self, node: yaml.Node, deep: bool = False) -> Mapping[str, Any]:
        # Call super first so that it performs all of its other checks on
        # this node. Checking the uniqueness of keys first would save super
        # some work in the case of a failure, but if the node were the wrong
        # node due to a parsing error, the resulting exception would be
        # difficult to understand.
        mapping = super().construct_mapping(node, deep)
        # Check if there are any duplicate keys
        all_keys = Counter(key_node.value for key_node, _ in node.value)
        duplicates = {k for k, i in all_keys.items() if i != 1}
        if duplicates:
            raise KeyError(
                f"Pipeline files must not have duplicated keys, {duplicates} appeared multiple times"
            )
        return mapping
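
# Illustrative sketch of the duplicate-key check (doctest-style comment, not
# part of the library): a repeated key at the same scope raises a KeyError,
# where the stock SafeLoader would silently keep only one value.
#
#     >>> yaml.load("a: 1\nb: 2", Loader=PipelineYamlLoader)
#     {'a': 1, 'b': 2}
#     >>> yaml.load("a: 1\na: 2", Loader=PipelineYamlLoader)
#     Traceback (most recent call last):
#         ...
#     KeyError: ...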


class MultilineStringDumper(yaml.Dumper):
    """Custom YAML dumper that emits multi-line strings in the '|' block
    style instead of hard-to-read escaped newlines and quoting.

    The basic approach is taken from
    https://stackoverflow.com/questions/8640959/how-can-i-control-what-scalar-form-pyyaml-uses-for-my-data,
    but it is written as a Dumper subclass to keep its effects non-global
    (unlike `yaml.add_representer`).
    """

    def represent_scalar(self, tag: str, value: Any, style: Optional[str] = None) -> yaml.ScalarNode:
        if style is None and tag == "tag:yaml.org,2002:str" and len(value.splitlines()) > 1:
            style = "|"
        return super().represent_scalar(tag, value, style)
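
# Illustrative sketch (output shape assumed from PyYAML's documented block
# style): multi-line strings are emitted with '|' instead of quoted escapes.
#
#     >>> print(yaml.dump({"msg": "line one\nline two\n"}, Dumper=MultilineStringDumper))
#     msg: |
#       line one
#       line two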


class ContractError(Exception):
    """An exception that is raised when a pipeline contract is not satisfied."""

    pass


@dataclass
class ContractIR:
    """Intermediate representation of configuration contracts read from a
    pipeline yaml file.
    """

    contract: str
    """A string of python code representing one or more conditions on configs
    in a pipeline. This code-as-string should, once evaluated, be True if the
    configs are fine, and False otherwise.
    """
    msg: Union[str, None] = None
    """An optional message to be shown to the user if a contract fails.
    """

    def to_primitives(self) -> Dict[str, str]:
        """Convert to a representation used in yaml serialization."""
        accumulate = {"contract": self.contract}
        if self.msg is not None:
            accumulate["msg"] = self.msg
        return accumulate

    def __eq__(self, other: object) -> bool:
        if not isinstance(other, ContractIR):
            return False
        return self.contract == other.contract and self.msg == other.msg
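
# Illustrative sketch (hypothetical task labels and config fields); the
# contract string itself is evaluated elsewhere, when a Pipeline is built.
#
#     >>> c = ContractIR(contract="taskA.field1 == taskB.field2", msg="fields must match")
#     >>> c.to_primitives()
#     {'contract': 'taskA.field1 == taskB.field2', 'msg': 'fields must match'}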


@dataclass
class LabeledSubset:
    """Intermediate representation of a named subset of task labels read from
    a pipeline yaml file.
    """

    label: str
    """The label used to identify the subset of task labels.
    """
    subset: Set[str]
    """A set of task labels contained in this subset.
    """
    description: Optional[str]
    """A description of what this subset of tasks is intended to do.
    """

    @staticmethod
    def from_primitives(label: str, value: Union[List[str], dict]) -> LabeledSubset:
        """Generate `LabeledSubset` objects given a properly formatted object
        that has been created by a yaml loader.

        Parameters
        ----------
        label : `str`
            The label that will be used to identify this labeled subset.
        value : `list` of `str` or `dict`
            Object returned from loading a labeled subset section from a yaml
            document.

        Returns
        -------
        labeledSubset : `LabeledSubset`
            A `LabeledSubset` object built from the inputs.

        Raises
        ------
        ValueError
            Raised if the value input is not properly formatted for parsing.
        """
        if isinstance(value, MutableMapping):
            subset = value.pop("subset", None)
            if subset is None:
                raise ValueError(
                    "If a labeled subset is specified as a mapping, it must contain the key 'subset'"
                )
            description = value.pop("description", None)
        elif isinstance(value, abcIterable):
            subset = value
            description = None
        else:
            raise ValueError(
                f"There was a problem parsing the labeled subset {label}; make sure the "
                "definition is either a valid yaml list, or a mapping with keys "
                "(subset, description), where subset points to a yaml list and "
                "description is associated with a string"
            )
        return LabeledSubset(label, set(subset), description)

    def to_primitives(self) -> Dict[str, Union[List[str], str]]:
        """Convert to a representation used in yaml serialization."""
        accumulate: Dict[str, Union[List[str], str]] = {"subset": list(self.subset)}
        if self.description is not None:
            accumulate["description"] = self.description
        return accumulate
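
# Illustrative sketch of the two accepted input shapes (hypothetical labels):
# a plain yaml list, or a mapping with "subset" and optional "description".
#
#     >>> LabeledSubset.from_primitives("step1", ["taskA", "taskB"]).description is None
#     True
#     >>> ls = LabeledSubset.from_primitives(
#     ...     "step1", {"subset": ["taskA", "taskB"], "description": "first step"}
#     ... )
#     >>> sorted(ls.subset), ls.description
#     (['taskA', 'taskB'], 'first step')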


@dataclass
class ParametersIR:
    """Intermediate representation of parameters that are global to a
    pipeline.

    These parameters are specified under a top level key named `parameters`
    and are declared as a yaml mapping. These entries can then be used inside
    task configuration blocks to specify configuration values. They may not be
    used in the special ``file`` or ``python`` blocks.

    Example:

        parameters:
          shared_value: 14
        tasks:
          taskA:
            class: modA
            config:
              field1: parameters.shared_value
          taskB:
            class: modB
            config:
              field2: parameters.shared_value
    """

    mapping: MutableMapping[str, str]
    """A mutable mapping of identifiers as keys, and shared configuration
    as values.
    """

    def update(self, other: Optional[ParametersIR]) -> None:
        if other is not None:
            self.mapping.update(other.mapping)

    def to_primitives(self) -> MutableMapping[str, str]:
        """Convert to a representation used in yaml serialization."""
        return self.mapping

    def __contains__(self, value: str) -> bool:
        return value in self.mapping

    def __getitem__(self, item: str) -> Any:
        return self.mapping[item]

    def __bool__(self) -> bool:
        return bool(self.mapping)
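
# Illustrative sketch of the mapping protocol exposed above (hypothetical
# parameter name and value):
#
#     >>> params = ParametersIR({"shared_value": "14"})
#     >>> "shared_value" in params, params["shared_value"], bool(ParametersIR({}))
#     (True, '14', False)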


@dataclass
class ConfigIR:
    """Intermediate representation of configurations read from a pipeline yaml
    file.
    """

    python: Union[str, None] = None
    """A string of python code that is used to modify a configuration. This
    can also be None if there are no modifications to do.
    """
    dataId: Union[dict, None] = None
    """A dataId that is used to constrain these config overrides to only
    quanta with matching dataIds. This field can be None if there is no
    constraint. This is currently an unimplemented feature, and is placed
    here for future use.
    """
    file: List[str] = field(default_factory=list)
    """A list of paths to files containing config overrides to be applied.
    This value may be an empty list if there are no overrides to apply.
    """
    rest: dict = field(default_factory=dict)
    """This is a dictionary of key value pairs, where the keys are strings
    corresponding to qualified fields on a config to override, and the values
    are strings representing the values to apply.
    """

    def to_primitives(self) -> Dict[str, Union[str, dict, List[str]]]:
        """Convert to a representation used in yaml serialization."""
        accumulate = {}
        for name in ("python", "dataId", "file"):
            # If this attribute is truthy, add it to the accumulation
            # dictionary.
            if getattr(self, name):
                accumulate[name] = getattr(self, name)
        # Add the dictionary containing the rest of the config keys to the
        # accumulated dictionary.
        accumulate.update(self.rest)
        return accumulate

    def formatted(self, parameters: ParametersIR) -> ConfigIR:
        """Return a new ConfigIR object that is formatted according to the
        specified parameters.

        Parameters
        ----------
        parameters : `ParametersIR`
            Object that contains variable mappings used in substitution.

        Returns
        -------
        config : `ConfigIR`
            A new ConfigIR object formatted with the input parameters.
        """
        new_config = copy.deepcopy(self)
        for key, value in new_config.rest.items():
            if not isinstance(value, str):
                continue
            match = re.match("parameters[.](.*)", value)
            if match and match.group(1) in parameters:
                new_config.rest[key] = parameters[match.group(1)]
            if match and match.group(1) not in parameters:
                warnings.warn(
                    f"config {key} contains value {match.group(0)} which is formatted like a "
                    "Pipeline parameter but was not found within the Pipeline; if this was not "
                    "intentional, check for a typo"
                )
        return new_config
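
    # Illustrative sketch (hypothetical field/parameter names): a value
    # spelled "parameters.<name>" is replaced by the parameter's value.
    #
    #     >>> config = ConfigIR(rest={"field1": "parameters.shared_value"})
    #     >>> config.formatted(ParametersIR({"shared_value": "14"})).rest
    #     {'field1': '14'}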

    def maybe_merge(self, other_config: "ConfigIR") -> Generator["ConfigIR", None, None]:
        """Merge another instance of a `ConfigIR` into this instance if
        possible. This function returns a generator that yields just self
        if the configs were merged, or self and other_config if they could
        not be merged.

        Parameters
        ----------
        other_config : `ConfigIR`
            An instance of `ConfigIR` to merge into this instance.

        Returns
        -------
        generator : generator of `ConfigIR`
            A generator yielding either self, or self and other_config,
            depending on whether the configs could be merged.
        """
        # Verify that the config blocks can be merged
        if (
            self.dataId != other_config.dataId
            or self.python
            or other_config.python
            or self.file
            or other_config.file
        ):
            yield from (self, other_config)
            return

        # Check the keys shared by both configs, and verify that no shared
        # key maps to different values.
        shared_keys = self.rest.keys() & other_config.rest.keys()
        for key in shared_keys:
            if self.rest[key] != other_config.rest[key]:
                yield from (self, other_config)
                return
        self.rest.update(other_config.rest)

        # Combine the lists of override files to load
        self_file_set = set(self.file)
        other_file_set = set(other_config.file)
        self.file = list(self_file_set.union(other_file_set))

        yield self
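
    # Illustrative sketch (hypothetical field names): configs whose simple
    # overrides do not conflict collapse into one; a conflicting value would
    # instead yield both configs unchanged.
    #
    #     >>> a = ConfigIR(rest={"field1": "1"})
    #     >>> merged = list(a.maybe_merge(ConfigIR(rest={"field2": "2"})))
    #     >>> len(merged), merged[0].rest == {"field1": "1", "field2": "2"}
    #     (1, True)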

    def __eq__(self, other: object) -> bool:
        if not isinstance(other, ConfigIR):
            return False
        return all(
            getattr(self, attr) == getattr(other, attr) for attr in ("python", "dataId", "file", "rest")
        )


@dataclass
class TaskIR:
    """Intermediate representation of tasks read from a pipeline yaml file."""

    label: str
    """An identifier used to refer to a task.
    """
    klass: str
    """A string containing a fully qualified python class to be run in a
    pipeline.
    """
    config: Union[List[ConfigIR], None] = None
    """List of all config overrides associated with this task; may be
    `None` if there are no config overrides.
    """

    def to_primitives(self) -> Dict[str, Union[str, List[dict]]]:
        """Convert to a representation used in yaml serialization."""
        accumulate: Dict[str, Union[str, List[dict]]] = {"class": self.klass}
        if self.config:
            accumulate["config"] = [c.to_primitives() for c in self.config]
        return accumulate

    def add_or_update_config(self, other_config: ConfigIR) -> None:
        """Add a `ConfigIR` to this task if one is not present. Merge configs
        if there is a `ConfigIR` present and the dataId keys of both configs
        match, otherwise add a new entry to the config list. The exception to
        the above is that if either the last config or other_config has a
        python block, then other_config is always added, as python blocks can
        modify configs in ways that cannot be predicted.

        Parameters
        ----------
        other_config : `ConfigIR`
            A `ConfigIR` instance to add or merge into the config attribute of
            this task.
        """
        if not self.config:
            self.config = [other_config]
            return
        self.config.extend(self.config.pop().maybe_merge(other_config))
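
    # Illustrative sketch (hypothetical label, class, and field names):
    # successive mergeable configs collapse into a single ConfigIR entry.
    #
    #     >>> task = TaskIR("taskA", "mod.TaskA")
    #     >>> task.add_or_update_config(ConfigIR(rest={"field1": "1"}))
    #     >>> task.add_or_update_config(ConfigIR(rest={"field2": "2"}))
    #     >>> len(task.config), task.config[0].rest
    #     (1, {'field1': '1', 'field2': '2'})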

    def __eq__(self, other: object) -> bool:
        if not isinstance(other, TaskIR):
            return False
        return all(getattr(self, attr) == getattr(other, attr) for attr in ("label", "klass", "config"))


@dataclass
class ImportIR:
    """An intermediate representation of imported pipelines."""

    location: str
    """This is the location of the pipeline to inherit. The path should be
    specified as an absolute path. Environment variables may be used in the
    path and should be specified as a python string template, with the name of
    the environment variable inside braces.
    """
    include: Union[List[str], None] = None
    """List of tasks that should be included when inheriting this pipeline.
    Either the include or exclude attributes may be specified, but not both.
    """
    exclude: Union[List[str], None] = None
    """List of tasks that should be excluded when inheriting this pipeline.
    Either the include or exclude attributes may be specified, but not both.
    """
    importContracts: bool = True
    """Boolean attribute to dictate whether contracts should be inherited
    with the pipeline or not.
    """
    instrument: Union[Literal[_Tags.KeepInstrument], str, None] = _Tags.KeepInstrument
    """Instrument to assign to the Pipeline at import. The default value of
    ``_Tags.KeepInstrument`` indicates that whatever instrument the pipeline
    is declared with will not be modified. Setting this value to None will
    drop any declared instrument prior to import.
    """

    def toPipelineIR(self) -> "PipelineIR":
        """Load in the Pipeline specified by this object, and turn it into a
        PipelineIR instance.

        Returns
        -------
        pipeline : `PipelineIR`
            A pipeline generated from the imported pipeline file.
        """
        if self.include and self.exclude:
            raise ValueError(
                "An include list and an exclude list cannot both be specified when declaring a "
                "pipeline import"
            )
        tmp_pipeline = PipelineIR.from_uri(os.path.expandvars(self.location))
        if self.instrument is not _Tags.KeepInstrument:
            tmp_pipeline.instrument = self.instrument

        included_labels = set()
        for label in tmp_pipeline.tasks:
            if (
                (self.include and label in self.include)
                or (self.exclude and label not in self.exclude)
                or (self.include is None and self.exclude is None)
            ):
                included_labels.add(label)

        # Handle labeled subsets being specified in the include or exclude
        # list, adding or removing labels.
        if self.include is not None:
            subsets_in_include = tmp_pipeline.labeled_subsets.keys() & self.include
            for label in subsets_in_include:
                included_labels.update(tmp_pipeline.labeled_subsets[label].subset)

        elif self.exclude is not None:
            subsets_in_exclude = tmp_pipeline.labeled_subsets.keys() & self.exclude
            for label in subsets_in_exclude:
                included_labels.difference_update(tmp_pipeline.labeled_subsets[label].subset)

        tmp_pipeline = tmp_pipeline.subset_from_labels(included_labels)

        if not self.importContracts:
            tmp_pipeline.contracts = []

        return tmp_pipeline
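
    # Illustrative sketch of a typical import declaration; the location and
    # label below are hypothetical, and the call is shown commented out since
    # it reads the referenced file from disk.
    #
    #     >>> imp = ImportIR(location="${PIPE_DIR}/pipeline.yaml", exclude=["taskB"])
    #     >>> # pipeline_ir = imp.toPipelineIR()  # loads, then drops taskB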

    def __eq__(self, other: object) -> bool:
        if not isinstance(other, ImportIR):
            return False
        return all(
            getattr(self, attr) == getattr(other, attr)
            for attr in ("location", "include", "exclude", "importContracts")
        )


class PipelineIR:
    """Intermediate representation of a pipeline definition.

    Parameters
    ----------
    loaded_yaml : `dict`
        A dictionary which matches the structure that would be produced by a
        yaml reader which parses a pipeline definition document.

    Raises
    ------
    ValueError
        Raised if:
        - a pipeline is declared without a description;
        - no tasks are declared in a pipeline, and no pipelines are to be
          inherited;
        - more than one instrument is specified;
        - more than one inherited pipeline shares a label.
    """

    def __init__(self, loaded_yaml: Dict[str, Any]):
        # Check required fields are present
        if "description" not in loaded_yaml:
            raise ValueError("A pipeline must be declared with a description")
        # i.e. neither an "imports" nor an "inherits" section is present
        if "tasks" not in loaded_yaml and len({"imports", "inherits"} - loaded_yaml.keys()) == 2:
            raise ValueError("A pipeline must be declared with one or more tasks")

        # The steps below must happen in this call order

        # Process pipeline description
        self.description = loaded_yaml.pop("description")

        # Process tasks
        self._read_tasks(loaded_yaml)

        # Process instrument keys
        inst = loaded_yaml.pop("instrument", None)
        if isinstance(inst, list):
            raise ValueError("Only one top level instrument can be defined in a pipeline")
        self.instrument: Optional[str] = inst

        # Process any contracts
        self._read_contracts(loaded_yaml)

        # Process any defined parameters
        self._read_parameters(loaded_yaml)

        # Process any named label subsets
        self._read_labeled_subsets(loaded_yaml)

        # Process any inherited pipelines
        self._read_imports(loaded_yaml)

        # Verify named subsets; must be done after inheriting
        self._verify_labeled_subsets()

    def _read_contracts(self, loaded_yaml: Dict[str, Any]) -> None:
        """Process the contracts portion of the loaded yaml document.

        Parameters
        ----------
        loaded_yaml : `dict`
            A dictionary which matches the structure that would be produced by
            a yaml reader which parses a pipeline definition document.
        """
        loaded_contracts = loaded_yaml.pop("contracts", [])
        if isinstance(loaded_contracts, str):
            loaded_contracts = [loaded_contracts]
        self.contracts: List[ContractIR] = []
        for contract in loaded_contracts:
            if isinstance(contract, dict):
                self.contracts.append(ContractIR(**contract))
            if isinstance(contract, str):
                self.contracts.append(ContractIR(contract=contract))

    def _read_parameters(self, loaded_yaml: Dict[str, Any]) -> None:
        """Process the parameters portion of the loaded yaml document.

        Parameters
        ----------
        loaded_yaml : `dict`
            A dictionary which matches the structure that would be produced by
            a yaml reader which parses a pipeline definition document.
        """
        loaded_parameters = loaded_yaml.pop("parameters", {})
        if not isinstance(loaded_parameters, dict):
            raise ValueError("The parameters section must be a yaml mapping")
        self.parameters = ParametersIR(loaded_parameters)

    def _read_labeled_subsets(self, loaded_yaml: Dict[str, Any]) -> None:
        """Process the subsets portion of the loaded yaml document.

        Parameters
        ----------
        loaded_yaml : `MutableMapping`
            A dictionary which matches the structure that would be produced
            by a yaml reader which parses a pipeline definition document.
        """
        loaded_subsets = loaded_yaml.pop("subsets", {})
        self.labeled_subsets: Dict[str, LabeledSubset] = {}
        if not loaded_subsets and "subset" in loaded_yaml:
            raise ValueError("Top level key should be subsets and not subset, add an s")
        for key, value in loaded_subsets.items():
            self.labeled_subsets[key] = LabeledSubset.from_primitives(key, value)

    def _verify_labeled_subsets(self) -> None:
        """Verify that all the labels in each named subset exist within the
        pipeline.
        """
        # Verify that all labels defined in a labeled subset are in the
        # Pipeline
        for labeled_subset in self.labeled_subsets.values():
            if not labeled_subset.subset.issubset(self.tasks.keys()):
                raise ValueError(
                    f"Labels {labeled_subset.subset - self.tasks.keys()} were not found in the "
                    "declared pipeline"
                )
        # Verify subset labels are not already task labels
        label_intersection = self.labeled_subsets.keys() & self.tasks.keys()
        if label_intersection:
            raise ValueError(f"Labeled subsets can not use the same label as a task: {label_intersection}")

    def _read_imports(self, loaded_yaml: Dict[str, Any]) -> None:
        """Process the inherits portion of the loaded yaml document.

        Parameters
        ----------
        loaded_yaml : `dict`
            A dictionary which matches the structure that would be produced by
            a yaml reader which parses a pipeline definition document.
        """

        def process_args(argument: Union[str, dict]) -> dict:
            if isinstance(argument, str):
                return {"location": argument}
            elif isinstance(argument, dict):
                if "exclude" in argument and isinstance(argument["exclude"], str):
                    argument["exclude"] = [argument["exclude"]]
                if "include" in argument and isinstance(argument["include"], str):
                    argument["include"] = [argument["include"]]
                if "instrument" in argument and argument["instrument"] == "None":
                    argument["instrument"] = None
                return argument

        if not {"inherits", "imports"} - loaded_yaml.keys():
            raise ValueError("Cannot define both inherits and imports sections; use imports")
        tmp_import = loaded_yaml.pop("inherits", None)
        if tmp_import is None:
            tmp_import = loaded_yaml.pop("imports", None)
        else:
            warnings.warn(
                "The 'inherits' key is deprecated, and will be "
                "removed around June 2021. Please use the key "
                "'imports' instead"
            )
        if tmp_import is None:
            self.imports: List[ImportIR] = []
        elif isinstance(tmp_import, list):
            self.imports = [ImportIR(**process_args(args)) for args in tmp_import]
        else:
            self.imports = [ImportIR(**process_args(tmp_import))]

        # Integrate any imported pipelines
        accumulate_tasks: Dict[str, TaskIR] = {}
        accumulate_labeled_subsets: Dict[str, LabeledSubset] = {}
        accumulated_parameters = ParametersIR({})
        for other_pipeline in self.imports:
            tmp_IR = other_pipeline.toPipelineIR()
            if self.instrument is None:
                self.instrument = tmp_IR.instrument
            elif self.instrument != tmp_IR.instrument and tmp_IR.instrument is not None:
                msg = (
                    "Only one instrument can be declared in a pipeline or its imports. "
                    f"Top level pipeline defines {self.instrument} but {other_pipeline.location} "
                    f"defines {tmp_IR.instrument}."
                )
                raise ValueError(msg)
            if duplicate_labels := accumulate_tasks.keys() & tmp_IR.tasks.keys():
                msg = (
                    "Task labels in the imported pipelines must be unique. "
                    f"These labels appear multiple times: {duplicate_labels}"
                )
                raise ValueError(msg)
            accumulate_tasks.update(tmp_IR.tasks)
            self.contracts.extend(tmp_IR.contracts)
            # Verify that tmp_IR's named subset labels are unique among the
            # existing labeled subsets and the existing task labels.
            overlapping_subsets = accumulate_labeled_subsets.keys() & tmp_IR.labeled_subsets.keys()
            task_subset_overlap = (
                accumulate_labeled_subsets.keys() | tmp_IR.labeled_subsets.keys()
            ) & accumulate_tasks.keys()
            if overlapping_subsets or task_subset_overlap:
                raise ValueError(
                    "Labeled subset names must be unique amongst imports in both labels and "
                    f"named subsets. Duplicate: {overlapping_subsets | task_subset_overlap}"
                )
            accumulate_labeled_subsets.update(tmp_IR.labeled_subsets)
            accumulated_parameters.update(tmp_IR.parameters)

        # Verify that any accumulated labeled subsets don't clash with a
        # label from this pipeline
        if accumulate_labeled_subsets.keys() & self.tasks.keys():
            raise ValueError(
                "Labeled subset names must be unique amongst imports in both labels and named subsets"
            )
        # Merge in the named subsets for self so this document can override
        # any that have been declared
        accumulate_labeled_subsets.update(self.labeled_subsets)
        self.labeled_subsets = accumulate_labeled_subsets

        # Merge the dict of label:TaskIR objects, preserving any configs in
        # the imported pipeline if the labels point to the same class
        for label, task in self.tasks.items():
            if label not in accumulate_tasks:
                accumulate_tasks[label] = task
            elif accumulate_tasks[label].klass == task.klass:
                if task.config is not None:
                    for config in task.config:
                        accumulate_tasks[label].add_or_update_config(config)
            else:
                accumulate_tasks[label] = task
        self.tasks: Dict[str, TaskIR] = accumulate_tasks
        accumulated_parameters.update(self.parameters)
        self.parameters = accumulated_parameters

    def _read_tasks(self, loaded_yaml: Dict[str, Any]) -> None:
        """Process the tasks portion of the loaded yaml document.

        Parameters
        ----------
        loaded_yaml : `dict`
            A dictionary which matches the structure that would be produced by
            a yaml reader which parses a pipeline definition document.
        """
        self.tasks = {}
        tmp_tasks = loaded_yaml.pop("tasks", None)
        if tmp_tasks is None:
            tmp_tasks = {}

        if "parameters" in tmp_tasks:
            raise ValueError("parameters is a reserved word and cannot be used as a task label")

        for label, definition in tmp_tasks.items():
            if isinstance(definition, str):
                definition = {"class": definition}
            config = definition.get("config", None)
            if config is None:
                task_config_ir = None
            else:
                if isinstance(config, dict):
                    config = [config]
                task_config_ir = []
                for c in config:
                    file = c.pop("file", None)
                    if file is None:
                        file = []
                    elif not isinstance(file, list):
                        file = [file]
                    task_config_ir.append(
                        ConfigIR(
                            python=c.pop("python", None), dataId=c.pop("dataId", None), file=file, rest=c
                        )
                    )
            self.tasks[label] = TaskIR(label, definition["class"], task_config_ir)

    def _remove_contracts(self, label: str) -> None:
        """Remove any contracts that contain the given label.

        String comparison used in this way is not the most elegant and may
        have issues, but it is the only feasible way when users can specify
        contracts with generic strings.
        """
        new_contracts = []
        for contract in self.contracts:
            # Match a label that is either at the start of the string or not
            # preceded by an ASCII identifier character, and that is followed
            # by a dot.
            if re.match(f".*([^A-Za-z0-9_]|^){label}[.]", contract.contract):
                continue
            new_contracts.append(contract)
        self.contracts = new_contracts

    def subset_from_labels(self, labelSpecifier: Set[str]) -> PipelineIR:
        """Subset a pipelineIR to contain only labels specified in
        labelSpecifier.

        Parameters
        ----------
        labelSpecifier : `set` of `str`
            Set containing labels that describe how to subset a pipeline.

        Returns
        -------
        pipeline : `PipelineIR`
            A new pipelineIR object that is a subset of the old pipelineIR.

        Raises
        ------
        ValueError
            Raised if there is an issue with specified labels.

        Notes
        -----
        This method attempts to prune any contracts that contain labels which
        are not in the declared subset of labels. This pruning is done using
        string-based matching due to the nature of contracts, and may prune
        more than it should. Any labeled subsets defined that no longer have
        all members of the subset present in the pipeline will be removed
        from the resulting pipeline.
        """

        pipeline = copy.deepcopy(self)

        # Update the label specifier to expand any named subsets
        toRemove = set()
        toAdd = set()
        for label in labelSpecifier:
            if label in pipeline.labeled_subsets:
                toRemove.add(label)
                toAdd.update(pipeline.labeled_subsets[label].subset)
        labelSpecifier.difference_update(toRemove)
        labelSpecifier.update(toAdd)
        # Verify all the labels are in the pipeline
        if not labelSpecifier.issubset(pipeline.tasks.keys() | pipeline.labeled_subsets):
            difference = labelSpecifier.difference(pipeline.tasks.keys())
            raise ValueError(
                "Not all supplied labels (specified or named subsets) are in the pipeline "
                f"definition, extra labels: {difference}"
            )
        # Copy needed so as to not modify while iterating
        pipeline_labels = set(pipeline.tasks.keys())
        # Remove the labels from the pipelineIR, and any contracts that
        # contain those labels (see docstring on _remove_contracts for why
        # this may cause issues)
        for label in pipeline_labels:
            if label not in labelSpecifier:
                pipeline.tasks.pop(label)
                pipeline._remove_contracts(label)

        # Create a copy of the object to iterate over
        labeled_subsets = copy.copy(pipeline.labeled_subsets)
        # Remove any labeled subsets that no longer have a complete set of
        # members
        for label, labeled_subset in labeled_subsets.items():
            if labeled_subset.subset - pipeline.tasks.keys():
                pipeline.labeled_subsets.pop(label)

        return pipeline
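
    # Illustrative sketch (hypothetical labels and classes): subsetting keeps
    # only the requested tasks and prunes contracts mentioning dropped ones.
    #
    #     >>> p = PipelineIR.from_string(
    #     ...     "description: demo\ntasks:\n  taskA: mod.A\n  taskB: mod.B"
    #     ... )
    #     >>> sorted(p.subset_from_labels({"taskA"}).tasks.keys())
    #     ['taskA']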

    @classmethod
    def from_string(cls, pipeline_string: str) -> PipelineIR:
        """Create a `PipelineIR` object from a string formatted like a
        pipeline document.

        Parameters
        ----------
        pipeline_string : `str`
            A string formatted like a pipeline document.
        """
        loaded_yaml = yaml.load(pipeline_string, Loader=PipelineYamlLoader)
        return cls(loaded_yaml)
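
    # Illustrative sketch of the minimal document from_string accepts (the
    # task class path is hypothetical): a description plus at least one task.
    #
    #     >>> doc = "description: demo\ntasks:\n  taskA: mod.TaskA"
    #     >>> PipelineIR.from_string(doc).tasks["taskA"].klass
    #     'mod.TaskA'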

    @classmethod
    @deprecated(
        reason="This has been replaced with `from_uri`. Will be removed after v23.",
        version="v21.0",
        category=FutureWarning,
    )
    def from_file(cls, filename: str) -> PipelineIR:
        """Create a `PipelineIR` object from the document specified by the
        input path.

        Parameters
        ----------
        filename : `str`
            Location of document to use in creating a `PipelineIR` object.

        Returns
        -------
        pipelineIR : `PipelineIR`
            The loaded pipeline.

        Notes
        -----
        This method is deprecated; please use from_uri.
        """
        return cls.from_uri(filename)

    @classmethod
    def from_uri(cls, uri: ResourcePathExpression) -> PipelineIR:
        """Create a `PipelineIR` object from the document specified by the
        input uri.

        Parameters
        ----------
        uri : convertible to `ResourcePath`
            Location of document to use in creating a `PipelineIR` object.

        Returns
        -------
        pipelineIR : `PipelineIR`
            The loaded pipeline.
        """
        loaded_uri = ResourcePath(uri)
        with loaded_uri.open("r") as buffer:
            loaded_yaml = yaml.load(buffer, Loader=PipelineYamlLoader)
            return cls(loaded_yaml)

    @deprecated(
        reason="This has been replaced with `write_to_uri`. Will be removed after v23.",
        version="v21.0",
        category=FutureWarning,
    )  # type: ignore
    def to_file(self, filename: str):
        """Serialize this `PipelineIR` object into a yaml formatted string and
        write the output to a file at the specified path.

        Parameters
        ----------
        filename : `str`
            Location of document to write a `PipelineIR` object.
        """
        self.write_to_uri(filename)

    def write_to_uri(
        self,
        uri: ResourcePathExpression,
    ) -> None:
        """Serialize this `PipelineIR` object into a yaml formatted string and
        write the output to a file at the specified uri.

        Parameters
        ----------
        uri : convertible to `ResourcePath`
            Location of document to write a `PipelineIR` object.
        """
        with ResourcePath(uri).open("w") as buffer:
            yaml.dump(self.to_primitives(), buffer, sort_keys=False, Dumper=MultilineStringDumper)

    def to_primitives(self) -> Dict[str, Any]:
        """Convert to a representation used in yaml serialization."""
        accumulate = {"description": self.description}
        if self.instrument is not None:
            accumulate["instrument"] = self.instrument
        if self.parameters:
            accumulate["parameters"] = self._sort_by_str(self.parameters.to_primitives())
        accumulate["tasks"] = {m: t.to_primitives() for m, t in self.tasks.items()}
        if len(self.contracts) > 0:
            # Sort contracts into lexicographical order by the contract
            # string, in the absence of any other ordering principle.
            contracts_list = [c.to_primitives() for c in self.contracts]
            contracts_list.sort(key=lambda x: x["contract"])
            accumulate["contracts"] = contracts_list
        if self.labeled_subsets:
            accumulate["subsets"] = self._sort_by_str(
                {k: v.to_primitives() for k, v in self.labeled_subsets.items()}
            )
        return accumulate

    def reorder_tasks(self, task_labels: List[str]) -> None:
        """Change the order in which tasks are stored internally. Useful for
        determining the order in which tasks will appear in the serialized
        (or printed) form.

        Parameters
        ----------
        task_labels : `list` of `str`
            A list containing all the labels in the pipeline, in the order in
            which the tasks are to be stored.

        Raises
        ------
        KeyError
            Raised if labels are supplied that are not in the pipeline, or if
            not all labels in the pipeline were supplied in the task_labels
            input.
        """
        # Verify that all labels are in the input
        _tmp_set = set(task_labels)
        if remainder := (self.tasks.keys() - _tmp_set):
            raise KeyError(f"Label(s) {remainder} are missing from the task label list")
        if extra := (_tmp_set - self.tasks.keys()):
            raise KeyError(f"Extra label(s) {extra} were in the input and are not in the pipeline")

        newTasks = {key: self.tasks[key] for key in task_labels}
        self.tasks = newTasks
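
    # Illustrative sketch (hypothetical labels): reordering changes only the
    # internal storage order, which controls serialized output order.
    #
    #     >>> p = PipelineIR.from_string("description: demo\ntasks:\n  taskA: mod.A\n  taskB: mod.B")
    #     >>> p.reorder_tasks(["taskB", "taskA"])
    #     >>> list(p.tasks.keys())
    #     ['taskB', 'taskA']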

    @staticmethod
    def _sort_by_str(arg: Mapping[str, Any]) -> Mapping[str, Any]:
        keys = sorted(arg.keys())
        return {key: arg[key] for key in keys}

    def __str__(self) -> str:
        """Format the instance as it would look in its yaml representation."""
        return yaml.dump(self.to_primitives(), sort_keys=False, Dumper=MultilineStringDumper)

    def __repr__(self) -> str:
        """Format the instance as it would look in its yaml representation."""
        return str(self)

    def __eq__(self, other: object) -> bool:
        if not isinstance(other, PipelineIR):
            return False
        # Special-case contracts because it is a list whose order is not
        # important.
        return (
            all(
                getattr(self, attr) == getattr(other, attr)
                for attr in ("tasks", "instrument", "labeled_subsets", "parameters")
            )
            and len(self.contracts) == len(other.contracts)
            and all(c in self.contracts for c in other.contracts)
        )