Coverage for python/lsst/pipe/base/pipelineIR.py: 19%

397 statements  

1# This file is part of pipe_base. 

2# 

3# Developed for the LSST Data Management System. 

4# This product includes software developed by the LSST Project 

5# (http://www.lsst.org). 

6# See the COPYRIGHT file at the top-level directory of this distribution 

7# for details of code ownership. 

8# 

9# This program is free software: you can redistribute it and/or modify 

10# it under the terms of the GNU General Public License as published by 

11# the Free Software Foundation, either version 3 of the License, or 

12# (at your option) any later version. 

13# 

14# This program is distributed in the hope that it will be useful, 

15# but WITHOUT ANY WARRANTY; without even the implied warranty of 

16# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the 

17# GNU General Public License for more details. 

18# 

19# You should have received a copy of the GNU General Public License 

20# along with this program. If not, see <http://www.gnu.org/licenses/>. 

21from __future__ import annotations 

22 

23__all__ = ("ConfigIR", "ContractError", "ContractIR", "ImportIR", "PipelineIR", "TaskIR", "LabeledSubset") 

24 

25import copy 

26import enum 

27import os 

28import re 

29import warnings 

30from collections import Counter 

31from collections.abc import Iterable as abcIterable 

32from dataclasses import dataclass, field 

33from typing import Any, Dict, Generator, List, Literal, Mapping, MutableMapping, Optional, Set, Union 

34 

35import yaml 

36from deprecated.sphinx import deprecated 

37from lsst.resources import ResourcePath, ResourcePathExpression 

38 

39 

40class _Tags(enum.Enum): 

41 KeepInstrument = enum.auto() 

42 

43 

44class PipelineYamlLoader(yaml.SafeLoader): 

45 """This is a specialized version of yaml's SafeLoader. It checks and raises 

46 an exception if it finds that there are multiple instances of the same key 

47 found inside a pipeline file at a given scope. 

48 """ 

49 

50 def construct_mapping(self, node: yaml.Node, deep: bool = False) -> Mapping[str, Any]: 

51 # Do the call to super first so that it can do all the other forms of 

52 # checking on this node. Checking the uniqueness of keys first would 

53 # save the work that super does in the case of a failure, but it might 

54 # fail in the case that the node was the incorrect node due to a 

55 # parsing error, and the resulting exception would be difficult to 

56 # understand. 

57 mapping = super().construct_mapping(node, deep) 

58 # Check if there are any duplicate keys 

59 all_keys = Counter(key_node.value for key_node, _ in node.value) 

60 duplicates = {k for k, i in all_keys.items() if i != 1} 

61 if duplicates: 

62 raise KeyError( 

63 f"Pipeline files must not have duplicated keys, {duplicates} appeared multiple times" 

64 ) 

65 return mapping 

66 

67 
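A rough usage sketch (not part of the original module): loading a document that repeats a key at the same scope with this loader raises a `KeyError`; the YAML text below is hypothetical.

    import yaml

    duplicated = "description: demo\ndescription: again\n"
    try:
        yaml.load(duplicated, Loader=PipelineYamlLoader)
    except KeyError as err:
        print(err)  # reports that 'description' appeared multiple times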

68class MultilineStringDumper(yaml.Dumper): 

69 """Custom YAML dumper that makes multi-line strings use the '|' 

70 continuation style instead of unreadable newlines and tons of quotes. 

71 

72 Basic approach is taken from 

73 https://stackoverflow.com/questions/8640959/how-can-i-control-what-scalar-form-pyyaml-uses-for-my-data, 

74 but is written as a Dumper subclass to make its effects non-global (vs 

75 `yaml.add_representer`). 

76 """ 

77 

78 def represent_scalar(self, tag: str, value: Any, style: Optional[str] = None) -> yaml.ScalarNode: 

79 if style is None and tag == "tag:yaml.org,2002:str" and len(value.splitlines()) > 1: 

80 style = "|" 

81 return super().represent_scalar(tag, value, style) 

82 
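A minimal sketch (not part of the original module) of the dumper's effect: multi-line strings are written in '|' block style.

    import yaml

    data = {"description": "first line\nsecond line\n"}
    print(yaml.dump(data, Dumper=MultilineStringDumper))
    # description: |
    #   first line
    #   second line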

83 

84class ContractError(Exception): 

85 """An exception that is raised when a pipeline contract is not satisfied""" 

86 

87 pass 

88 

89 

90@dataclass 

91class ContractIR: 

92 """Intermediate representation of configuration contracts read from a 

93 pipeline yaml file.""" 

94 

95 contract: str 

96 """A string of python code representing one or more conditions on configs 

97 in a pipeline. This code-as-string should evaluate to True 

98 if the configs are fine, and False otherwise. 

99 """ 

100 msg: Union[str, None] = None 

101 """An optional message to be shown to the user if a contract fails 

102 """ 

103 

104 def to_primitives(self) -> Dict[str, str]: 

105 """Convert to a representation used in yaml serialization""" 

106 accumulate = {"contract": self.contract} 

107 if self.msg is not None: 

108 accumulate["msg"] = self.msg 

109 return accumulate 

110 

111 def __eq__(self, other: object) -> bool: 

112 if not isinstance(other, ContractIR): 

113 return False 

114 elif self.contract == other.contract and self.msg == other.msg: 

115 return True 

116 else: 

117 return False 

118 

119 
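An illustrative sketch (not part of the original module) of a contract; the task labels and config field names are hypothetical.

    example = ContractIR(
        contract="taskA.doWrite == taskB.doRead",
        msg="taskB expects the dataset that taskA is configured to write",
    )
    print(example.to_primitives())
    # {'contract': 'taskA.doWrite == taskB.doRead', 'msg': 'taskB expects ...'}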

120@dataclass 

121class LabeledSubset: 

122 """Intermediate representation of named subset of task labels read from 

123 a pipeline yaml file. 

124 """ 

125 

126 label: str 

127 """The label used to identify the subset of task labels. 

128 """ 

129 subset: Set[str] 

130 """A set of task labels contained in this subset. 

131 """ 

132 description: Optional[str] 

133 """A description of what this subset of tasks is intended to do 

134 """ 

135 

136 @staticmethod 

137 def from_primitives(label: str, value: Union[List[str], dict]) -> LabeledSubset: 

138 """Generate `LabeledSubset` objects given a properly formatted object 

139 that has been created by a yaml loader. 

140 

141 Parameters 

142 ---------- 

143 label : `str` 

144 The label that will be used to identify this labeled subset. 

145 value : `list` of `str` or `dict` 

146 Object returned from loading a labeled subset section from a yaml 

147 document. 

148 

149 Returns 

150 ------- 

151 labeledSubset : `LabeledSubset` 

152 A `LabeledSubset` object built from the inputs. 

153 

154 Raises 

155 ------ 

156 ValueError 

157 Raised if the value input is not properly formatted for parsing 

158 """ 

159 if isinstance(value, MutableMapping): 

160 subset = value.pop("subset", None) 

161 if subset is None: 

162 raise ValueError( 

163 "If a labeled subset is specified as a mapping, it must contain the key 'subset'" 

164 ) 

165 description = value.pop("description", None) 

166 elif isinstance(value, abcIterable): 

167 subset = value 

168 description = None 

169 else: 

170 raise ValueError( 

171 f"There was a problem parsing the labeled subset {label}, make sure the " 

172 "definition is either a valid yaml list, or a mapping with keys " 

173 "(subset, description) where subset points to a yaml list, and description is " 

174 "associated with a string" 

175 ) 

176 return LabeledSubset(label, set(subset), description) 

177 

178 def to_primitives(self) -> Dict[str, Union[List[str], str]]: 

179 """Convert to a representation used in yaml serialization""" 

180 accumulate: Dict[str, Union[List[str], str]] = {"subset": list(self.subset)} 

181 if self.description is not None: 

182 accumulate["description"] = self.description 

183 return accumulate 

184 

185 
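An illustrative sketch (not part of the original module) of the two accepted forms for a labeled subset; the task labels are hypothetical.

    as_list = LabeledSubset.from_primitives("nightly", ["isr", "calibrate"])
    as_mapping = LabeledSubset.from_primitives(
        "nightly",
        {"subset": ["isr", "calibrate"], "description": "per-night processing"},
    )
    print(as_list.to_primitives())     # {'subset': [...]} (set order not guaranteed)
    print(as_mapping.to_primitives())  # same, plus the 'description' entry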

186@dataclass 

187class ParametersIR: 

188 """Intermediate representation of parameters that are global to a pipeline 

189 

190 These parameters are specified under a top level key named `parameters` 

191 and are declared as a yaml mapping. These entries can then be used inside 

192 task configuration blocks to specify configuration values. They may not be 

193 used in the special ``file`` or ``python`` blocks. 

194 

195 Example: 

196 parameters: 

197 shared_value: 14 

198 tasks: 

199 taskA: 

200 class: modA 

201 config: 

202 field1: parameters.shared_value 

203 taskB: 

204 class: modB 

205 config: 

206 field2: parameters.shared_value 

207 """ 

208 

209 mapping: MutableMapping[str, str] 

210 """A mutable mapping of identifiers as keys, and shared configuration 

211 as values. 

212 """ 

213 

214 def update(self, other: Optional[ParametersIR]) -> None: 

215 if other is not None: 

216 self.mapping.update(other.mapping) 

217 

218 def to_primitives(self) -> MutableMapping[str, str]: 

219 """Convert to a representation used in yaml serialization""" 

220 return self.mapping 

221 

222 def __contains__(self, value: str) -> bool: 

223 return value in self.mapping 

224 

225 def __getitem__(self, item: str) -> Any: 

226 return self.mapping[item] 

227 

228 def __bool__(self) -> bool: 

229 return bool(self.mapping) 

230 

231 
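A small sketch (not part of the original module): parameters behave as a mapping from identifiers to shared configuration values; the identifier is hypothetical.

    params = ParametersIR({"shared_value": 14})
    assert "shared_value" in params
    assert params["shared_value"] == 14
    assert bool(params)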

232@dataclass 

233class ConfigIR: 

234 """Intermediate representation of configurations read from a pipeline yaml 

235 file. 

236 """ 

237 

238 python: Union[str, None] = None 

239 """A string of python code that is used to modify a configuration. This can 

240 also be None if there are no modifications to do. 

241 """ 

242 dataId: Union[dict, None] = None 

243 """A dataId that is used to constrain these config overrides to only quanta 

244 with matching dataIds. This field can be None if there is no constraint. 

245 This is currently an unimplemented feature, and is placed here for future 

246 use. 

247 """ 

248 file: List[str] = field(default_factory=list) 

249 """A list of paths which points to a file containing config overrides to be 

250 applied. This value may be an empty list if there are no overrides to 

251 apply. 

252 """ 

253 rest: dict = field(default_factory=dict) 

254 """This is a dictionary of key value pairs, where the keys are strings 

255 corresponding to qualified fields on a config to override, and the values 

256 are strings representing the values to apply. 

257 """ 

258 

259 def to_primitives(self) -> Dict[str, Union[str, dict, List[str]]]: 

260 """Convert to a representation used in yaml serialization""" 

261 accumulate = {} 

262 for name in ("python", "dataId", "file"): 

263 # if this attribute is truthy add it to the accumulation 

264 # dictionary 

265 if getattr(self, name): 

266 accumulate[name] = getattr(self, name) 

267 # Add the dictionary containing the rest of the config keys to the 

268 # accumulated dictionary 

269 accumulate.update(self.rest) 

270 return accumulate 

271 

272 def formatted(self, parameters: ParametersIR) -> ConfigIR: 

273 """Returns a new ConfigIR object that is formatted according to the 

274 specified parameters 

275 

276 Parameters 

277 ---------- 

278 parameters : ParametersIR 

279 Object that contains variable mappings used in substitution. 

280 

281 Returns 

282 ------- 

283 config : ConfigIR 

284 A new ConfigIR object formatted with the input parameters 

285 """ 

286 new_config = copy.deepcopy(self) 

287 for key, value in new_config.rest.items(): 

288 if not isinstance(value, str): 

289 continue 

290 match = re.match("parameters[.](.*)", value) 

291 if match and match.group(1) in parameters: 

292 new_config.rest[key] = parameters[match.group(1)] 

293 if match and match.group(1) not in parameters: 

294 warnings.warn( 

295 f"config {key} contains value {match.group(0)} which is formatted like a " 

296 "Pipeline parameter but was not found within the Pipeline, if this was not " 

297 "intentional, check for a typo" 

298 ) 

299 return new_config 

300 

301 def maybe_merge(self, other_config: "ConfigIR") -> Generator["ConfigIR", None, None]: 

302 """Merges another instance of a `ConfigIR` into this instance if 

303 possible. This function returns a generator that is either self 

304 if the configs were merged, or self, and other_config if that could 

305 not be merged. 

306 

307 Parameters 

308 ---------- 

309 other_config : `ConfigIR` 

310 An instance of `ConfigIR` to merge into this instance. 

311 

312 Returns 

313 ------- 

314 Generator : `ConfigIR` 

315 A generator yielding self if the configs could be merged, or self 

316 and other_config if they could not. 

317 """ 

318 # Verify that the config blocks can be merged 

319 if ( 

320 self.dataId != other_config.dataId 

321 or self.python 

322 or other_config.python 

323 or self.file 

324 or other_config.file 

325 ): 

326 yield from (self, other_config) 

327 return 

328 

329 # Check that any keys shared between the two configs do not have 

330 # conflicting values 

331 key_union = self.rest.keys() & other_config.rest.keys() 

332 for key in key_union: 

333 if self.rest[key] != other_config.rest[key]: 

334 yield from (self, other_config) 

335 return 

336 self.rest.update(other_config.rest) 

337 

338 # Combine the lists of override files to load 

339 self_file_set = set(self.file) 

340 other_file_set = set(other_config.file) 

341 self.file = list(self_file_set.union(other_file_set)) 

342 

343 yield self 

344 

345 def __eq__(self, other: object) -> bool: 

346 if not isinstance(other, ConfigIR): 

347 return False 

348 elif all( 

349 getattr(self, attr) == getattr(other, attr) for attr in ("python", "dataId", "file", "rest") 

350 ): 

351 return True 

352 else: 

353 return False 

354 

355 
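An illustrative sketch (not part of the original module) of parameter substitution through `formatted`; the field name and parameter are hypothetical.

    overrides = ConfigIR(rest={"field1": "parameters.shared_value"})
    params = ParametersIR({"shared_value": 14})
    print(overrides.formatted(params).rest)  # {'field1': 14}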

356@dataclass 

357class TaskIR: 

358 """Intermediate representation of tasks read from a pipeline yaml file.""" 

359 

360 label: str 

361 """An identifier used to refer to a task. 

362 """ 

363 klass: str 

364 """A string containing a fully qualified python class to be run in a 

365 pipeline. 

366 """ 

367 config: Union[List[ConfigIR], None] = None 

368 """List of all configs overrides associated with this task, and may be 

369 `None` if there are no config overrides. 

370 """ 

371 

372 def to_primitives(self) -> Dict[str, Union[str, List[dict]]]: 

373 """Convert to a representation used in yaml serialization""" 

374 accumulate: Dict[str, Union[str, List[dict]]] = {"class": self.klass} 

375 if self.config: 

376 accumulate["config"] = [c.to_primitives() for c in self.config] 

377 return accumulate 

378 

379 def add_or_update_config(self, other_config: ConfigIR) -> None: 

380 """Adds a `ConfigIR` to this task if one is not present. Merges configs 

381 if there is a `ConfigIR` present and the dataId keys of both configs 

382 match, otherwise adds a new entry to the config list. The exception to 

383 the above is that if either the last config or other_config has a 

384 python block, then other_config is always added, as python blocks can 

385 modify configs in ways that cannot be predicted. 

386 

387 Parameters 

388 ---------- 

389 other_config : `ConfigIR` 

390 A `ConfigIR` instance to add or merge into the config attribute of 

391 this task. 

392 """ 

393 if not self.config: 

394 self.config = [other_config] 

395 return 

396 self.config.extend(self.config.pop().maybe_merge(other_config)) 

397 

398 def __eq__(self, other: object) -> bool: 

399 if not isinstance(other, TaskIR): 

400 return False 

401 elif all(getattr(self, attr) == getattr(other, attr) for attr in ("label", "klass", "config")): 

402 return True 

403 else: 

404 return False 

405 

406 
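An illustrative sketch (not part of the original module): two simple config blocks (no dataId, python, or file entries) are merged into one; the class path and field names are hypothetical.

    task = TaskIR("taskA", "hypothetical.module.TaskA", [ConfigIR(rest={"field1": 1})])
    task.add_or_update_config(ConfigIR(rest={"field2": 2}))
    print(task.to_primitives()["config"])  # [{'field1': 1, 'field2': 2}]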

407@dataclass 

408class ImportIR: 

409 """An intermediate representation of imported pipelines""" 

410 

411 location: str 

412 """This is the location of the pipeline to inherit. The path should be 

413 specified as an absolute path. Environment variables may be used in the 

414 path and should be specified as a python string template, with the name of 

415 the environment variable inside braces. 

416 """ 

417 include: Union[List[str], None] = None 

418 """List of tasks that should be included when inheriting this pipeline. 

419 Either the include or exclude attributes may be specified, but not both. 

420 """ 

421 exclude: Union[List[str], None] = None 

422 """List of tasks that should be excluded when inheriting this pipeline. 

423 Either the include or exclude attributes may be specified, but not both. 

424 """ 

425 importContracts: bool = True 

426 """Boolean attribute to dictate if contracts should be inherited with the 

427 pipeline or not. 

428 """ 

429 instrument: Union[Literal[_Tags.KeepInstrument], str, None] = _Tags.KeepInstrument 

430 """Instrument to assign to the Pipeline at import. The default value of 

431 ``_Tags.KeepInstrument`` indicates that whatever instrument the pipeline is 

432 declared with will not be modified. Setting this value to None will drop 

433 any declared instrument prior to import. 

434 """ 

435 

436 def toPipelineIR(self) -> "PipelineIR": 

437 """Load in the Pipeline specified by this object, and turn it into a 

438 PipelineIR instance. 

439 

440 Returns 

441 ------- 

442 pipeline : `PipelineIR` 

443 A pipeline generated from the imported pipeline file 

444 """ 

445 if self.include and self.exclude: 

446 raise ValueError( 

447 "Both an include and an exclude list cant be specified when declaring a pipeline import" 

448 ) 

449 tmp_pipeline = PipelineIR.from_uri(os.path.expandvars(self.location)) 

450 if self.instrument is not _Tags.KeepInstrument: 

451 tmp_pipeline.instrument = self.instrument 

452 

453 included_labels = set() 

454 for label in tmp_pipeline.tasks: 

455 if ( 

456 (self.include and label in self.include) 

457 or (self.exclude and label not in self.exclude) 

458 or (self.include is None and self.exclude is None) 

459 ): 

460 included_labels.add(label) 

461 

462 # Handle labeled subsets being specified in the include or exclude 

463 # list, adding or removing labels. 

464 if self.include is not None: 

465 subsets_in_include = tmp_pipeline.labeled_subsets.keys() & self.include 

466 for label in subsets_in_include: 

467 included_labels.update(tmp_pipeline.labeled_subsets[label].subset) 

468 

469 elif self.exclude is not None: 

470 subsets_in_exclude = tmp_pipeline.labeled_subsets.keys() & self.exclude 

471 for label in subsets_in_exclude: 

472 included_labels.difference_update(tmp_pipeline.labeled_subsets[label].subset) 

473 

474 tmp_pipeline = tmp_pipeline.subset_from_labels(included_labels) 

475 

476 if not self.importContracts: 

477 tmp_pipeline.contracts = [] 

478 

479 return tmp_pipeline 

480 

481 def __eq__(self, other: object) -> bool: 

482 if not isinstance(other, ImportIR): 

483 return False 

484 elif all( 

485 getattr(self, attr) == getattr(other, attr) 

486 for attr in ("location", "include", "exclude", "importContracts") 

487 ): 

488 return True 

489 else: 

490 return False 

491 
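An illustrative sketch (not part of the original module) of an import declaration as it could be built in code; the location and label are hypothetical, and environment variables in the path are expanded when the import is loaded.

    other = ImportIR(
        location="${PIPELINE_DIR}/other_pipeline.yaml",
        exclude=["taskC"],
        importContracts=False,
    )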

492 

493class PipelineIR: 

494 """Intermediate representation of a pipeline definition 

495 

496 Parameters 

497 ---------- 

498 loaded_yaml : `dict` 

499 A dictionary which matches the structure that would be produced by a 

500 yaml reader which parses a pipeline definition document 

501 

502 Raises 

503 ------ 

504 ValueError : 

505 - If a pipeline is declared without a description 

506 - If no tasks are declared in a pipeline, and no pipelines are to be 

507 inherited 

508 - If more than one instrument is specified 

509 - If more than one inherited pipeline share a label 

510 """ 

511 

512 def __init__(self, loaded_yaml: Dict[str, Any]): 

513 # Check required fields are present 

514 if "description" not in loaded_yaml: 

515 raise ValueError("A pipeline must be declared with a description") 

516 if "tasks" not in loaded_yaml and len({"imports", "inherits"} - loaded_yaml.keys()) == 2: 

517 raise ValueError("A pipeline must be declared with one or more tasks") 

518 

519 # These steps below must happen in this call order 

520 

521 # Process pipeline description 

522 self.description = loaded_yaml.pop("description") 

523 

524 # Process tasks 

525 self._read_tasks(loaded_yaml) 

526 

527 # Process instrument keys 

528 inst = loaded_yaml.pop("instrument", None) 

529 if isinstance(inst, list): 

530 raise ValueError("Only one top level instrument can be defined in a pipeline") 

531 self.instrument: Optional[str] = inst 

532 

533 # Process any contracts 

534 self._read_contracts(loaded_yaml) 

535 

536 # Process any defined parameters 

537 self._read_parameters(loaded_yaml) 

538 

539 # Process any named label subsets 

540 self._read_labeled_subsets(loaded_yaml) 

541 

542 # Process any inherited pipelines 

543 self._read_imports(loaded_yaml) 

544 

545 # verify named subsets, must be done after inheriting 

546 self._verify_labeled_subsets() 

547 

548 def _read_contracts(self, loaded_yaml: Dict[str, Any]) -> None: 

549 """Process the contracts portion of the loaded yaml document 

550 

551 Parameters 

552 ---------- 

553 loaded_yaml : `dict` 

554 A dictionary which matches the structure that would be produced by 

555 a yaml reader which parses a pipeline definition document 

556 """ 

557 loaded_contracts = loaded_yaml.pop("contracts", []) 

558 if isinstance(loaded_contracts, str): 

559 loaded_contracts = [loaded_contracts] 

560 self.contracts: List[ContractIR] = [] 

561 for contract in loaded_contracts: 

562 if isinstance(contract, dict): 

563 self.contracts.append(ContractIR(**contract)) 

564 if isinstance(contract, str): 

565 self.contracts.append(ContractIR(contract=contract)) 

566 

567 def _read_parameters(self, loaded_yaml: Dict[str, Any]) -> None: 

568 """Process the parameters portion of the loaded yaml document 

569 

570 Parameters 

571 ---------- 

572 loaded_yaml : `dict` 

573 A dictionary which matches the structure that would be produced by 

574 a yaml reader which parses a pipeline definition document 

575 """ 

576 loaded_parameters = loaded_yaml.pop("parameters", {}) 

577 if not isinstance(loaded_parameters, dict): 

578 raise ValueError("The parameters section must be a yaml mapping") 

579 self.parameters = ParametersIR(loaded_parameters) 

580 

581 def _read_labeled_subsets(self, loaded_yaml: Dict[str, Any]) -> None: 

582 """Process the subsets portion of the loaded yaml document 

583 

584 Parameters 

585 ---------- 

586 loaded_yaml: `MutableMapping` 

587 A dictionary which matches the structure that would be produced 

588 by a yaml reader which parses a pipeline definition document 

589 """ 

590 loaded_subsets = loaded_yaml.pop("subsets", {}) 

591 self.labeled_subsets: Dict[str, LabeledSubset] = {} 

592 if not loaded_subsets and "subset" in loaded_yaml: 

593 raise ValueError("Top level key should be subsets and not subset, add an s") 

594 for key, value in loaded_subsets.items(): 

595 self.labeled_subsets[key] = LabeledSubset.from_primitives(key, value) 

596 

597 def _verify_labeled_subsets(self) -> None: 

598 """Verifies that all the labels in each named subset exist within the 

599 pipeline. 

600 """ 

601 # Verify that all labels defined in a labeled subset are in the 

602 # Pipeline 

603 for labeled_subset in self.labeled_subsets.values(): 

604 if not labeled_subset.subset.issubset(self.tasks.keys()): 

605 raise ValueError( 

606 f"Labels {labeled_subset.subset - self.tasks.keys()} were not found in the " 

607 "declared pipeline" 

608 ) 

609 # Verify subset labels are not already task labels 

610 label_intersection = self.labeled_subsets.keys() & self.tasks.keys() 

611 if label_intersection: 

612 raise ValueError(f"Labeled subsets can not use the same label as a task: {label_intersection}") 

613 

614 def _read_imports(self, loaded_yaml: Dict[str, Any]) -> None: 

615 """Process the inherits portion of the loaded yaml document 

616 

617 Parameters 

618 ---------- 

619 loaded_yaml : `dict` 

620 A dictionary which matches the structure that would be produced by 

621 a yaml reader which parses a pipeline definition document 

622 """ 

623 

624 def process_args(argument: Union[str, dict]) -> dict: 

625 if isinstance(argument, str): 

626 return {"location": argument} 

627 elif isinstance(argument, dict): 

628 if "exclude" in argument and isinstance(argument["exclude"], str): 

629 argument["exclude"] = [argument["exclude"]] 

630 if "include" in argument and isinstance(argument["include"], str): 

631 argument["include"] = [argument["include"]] 

632 if "instrument" in argument and argument["instrument"] == "None": 

633 argument["instrument"] = None 

634 return argument 

635 

636 if not {"inherits", "imports"} - loaded_yaml.keys(): 

637 raise ValueError("Cannot define both inherits and imports sections, use imports") 

638 tmp_import = loaded_yaml.pop("inherits", None) 

639 if tmp_import is None: 

640 tmp_import = loaded_yaml.pop("imports", None) 

641 else: 

642 warnings.warn( 

643 "The 'inherits' key is deprecated, and will be " 

644 "removed around June 2021. Please use the key " 

645 "'imports' instead" 

646 ) 

647 if tmp_import is None: 

648 self.imports: List[ImportIR] = [] 

649 elif isinstance(tmp_import, list): 

650 self.imports = [ImportIR(**process_args(args)) for args in tmp_import] 

651 else: 

652 self.imports = [ImportIR(**process_args(tmp_import))] 

653 

654 # integrate any imported pipelines 

655 accumulate_tasks: Dict[str, TaskIR] = {} 

656 accumulate_labeled_subsets: Dict[str, LabeledSubset] = {} 

657 accumulated_parameters = ParametersIR({}) 

658 for other_pipeline in self.imports: 

659 tmp_IR = other_pipeline.toPipelineIR() 

660 if self.instrument is None: 

661 self.instrument = tmp_IR.instrument 

662 elif self.instrument != tmp_IR.instrument and tmp_IR.instrument is not None: 

663 msg = ( 

664 "Only one instrument can be declared in a pipeline or its imports. " 

665 f"Top level pipeline defines {self.instrument} but {other_pipeline.location} " 

666 f"defines {tmp_IR.instrument}." 

667 ) 

668 raise ValueError(msg) 

669 if duplicate_labels := accumulate_tasks.keys() & tmp_IR.tasks.keys(): 

670 msg = ( 

671 "Task labels in the imported pipelines must be unique. " 

672 f"These labels appear multiple times: {duplicate_labels}" 

673 ) 

674 raise ValueError(msg) 

675 accumulate_tasks.update(tmp_IR.tasks) 

676 self.contracts.extend(tmp_IR.contracts) 

677 # verify that the named subset labels in tmp_IR are unique among 

678 # existing labeled subsets, and among existing task labels. 

679 overlapping_subsets = accumulate_labeled_subsets.keys() & tmp_IR.labeled_subsets.keys() 

680 task_subset_overlap = ( 

681 accumulate_labeled_subsets.keys() | tmp_IR.labeled_subsets.keys() 

682 ) & accumulate_tasks.keys() 

683 if overlapping_subsets or task_subset_overlap: 

684 raise ValueError( 

685 "Labeled subset names must be unique amongst imports in both labels and " 

686 f" named Subsets. Duplicate: {overlapping_subsets | task_subset_overlap}" 

687 ) 

688 accumulate_labeled_subsets.update(tmp_IR.labeled_subsets) 

689 accumulated_parameters.update(tmp_IR.parameters) 

690 

691 # verify that any accumulated labeled subsets don't clash with a label 

692 # from this pipeline 

693 if accumulate_labeled_subsets.keys() & self.tasks.keys(): 

694 raise ValueError( 

695 "Labeled subset names must be unique amongst imports in both labels and named Subsets" 

696 ) 

697 # merge in the named subsets for self so this document can override any 

698 # that have been declared 

699 accumulate_labeled_subsets.update(self.labeled_subsets) 

700 self.labeled_subsets = accumulate_labeled_subsets 

701 

702 # merge the dict of label:TaskIR objects, preserving any configs in the 

703 # imported pipeline if the labels point to the same class 

704 for label, task in self.tasks.items(): 

705 if label not in accumulate_tasks: 

706 accumulate_tasks[label] = task 

707 elif accumulate_tasks[label].klass == task.klass: 

708 if task.config is not None: 

709 for config in task.config: 

710 accumulate_tasks[label].add_or_update_config(config) 

711 else: 

712 accumulate_tasks[label] = task 

713 self.tasks: Dict[str, TaskIR] = accumulate_tasks 

714 accumulated_parameters.update(self.parameters) 

715 self.parameters = accumulated_parameters 

716 

717 def _read_tasks(self, loaded_yaml: Dict[str, Any]) -> None: 

718 """Process the tasks portion of the loaded yaml document 

719 

720 Parameters 

721 ---------- 

722 loaded_yaml : `dict` 

723 A dictionary which matches the structure that would be produced by 

724 a yaml reader which parses a pipeline definition document 

725 """ 

726 self.tasks = {} 

727 tmp_tasks = loaded_yaml.pop("tasks", None) 

728 if tmp_tasks is None: 

729 tmp_tasks = {} 

730 

731 if "parameters" in tmp_tasks: 

732 raise ValueError("parameters is a reserved word and cannot be used as a task label") 

733 

734 for label, definition in tmp_tasks.items(): 

735 if isinstance(definition, str): 

736 definition = {"class": definition} 

737 config = definition.get("config", None) 

738 if config is None: 

739 task_config_ir = None 

740 else: 

741 if isinstance(config, dict): 

742 config = [config] 

743 task_config_ir = [] 

744 for c in config: 

745 file = c.pop("file", None) 

746 if file is None: 

747 file = [] 

748 elif not isinstance(file, list): 

749 file = [file] 

750 task_config_ir.append( 

751 ConfigIR( 

752 python=c.pop("python", None), dataId=c.pop("dataId", None), file=file, rest=c 

753 ) 

754 ) 

755 self.tasks[label] = TaskIR(label, definition["class"], task_config_ir) 

756 

757 def _remove_contracts(self, label: str) -> None: 

758 """Remove any contracts that contain the given label 

759 

760 String comparison used in this way is not the most elegant and may 

761 have issues, but it is the only feasible way when users can specify 

762 contracts with generic strings. 

763 """ 

764 new_contracts = [] 

765 for contract in self.contracts: 

766 # match a label that is not preceded by an ASCII identifier, or 

767 # is at the start of a line, and is followed by a dot 

768 if re.match(f".*([^A-Za-z0-9_]|^){label}[.]", contract.contract): 

769 continue 

770 new_contracts.append(contract) 

771 self.contracts = new_contracts 

772 

773 def subset_from_labels(self, labelSpecifier: Set[str]) -> PipelineIR: 

774 """Subset a pipelineIR to contain only labels specified in 

775 labelSpecifier. 

776 

777 Parameters 

778 ---------- 

779 labelSpecifier : `set` of `str` 

780 Set containing labels that describes how to subset a pipeline. 

781 

782 Returns 

783 ------- 

784 pipeline : `PipelineIR` 

785 A new pipelineIR object that is a subset of the old pipelineIR 

786 

787 Raises 

788 ------ 

789 ValueError 

790 Raised if there is an issue with specified labels 

791 

792 Notes 

793 ----- 

794 This method attempts to prune any contracts that contain labels which 

795 are not in the declared subset of labels. This pruning is done using 

796 string-based matching due to the nature of contracts and may prune more 

797 than it should. Any labeled subsets defined that no longer have all 

798 members of the subset present in the pipeline will be removed from the 

799 resulting pipeline. 

800 """ 

801 

802 pipeline = copy.deepcopy(self) 

803 

804 # update the label specifier to expand any named subsets 

805 toRemove = set() 

806 toAdd = set() 

807 for label in labelSpecifier: 

808 if label in pipeline.labeled_subsets: 

809 toRemove.add(label) 

810 toAdd.update(pipeline.labeled_subsets[label].subset) 

811 labelSpecifier.difference_update(toRemove) 

812 labelSpecifier.update(toAdd) 

813 # verify all the labels are in the pipeline 

814 if not labelSpecifier.issubset(pipeline.tasks.keys() | pipeline.labeled_subsets): 

815 difference = labelSpecifier.difference(pipeline.tasks.keys()) 

816 raise ValueError( 

817 "Not all supplied labels (specified or named subsets) are in the pipeline " 

818 f"definition, extra labels: {difference}" 

819 ) 

820 # copy needed so as to not modify while iterating 

821 pipeline_labels = set(pipeline.tasks.keys()) 

822 # Remove the labels from the pipelineIR, and any contracts that contain 

823 # those labels (see docstring on _remove_contracts for why this may 

824 # cause issues) 

825 for label in pipeline_labels: 

826 if label not in labelSpecifier: 

827 pipeline.tasks.pop(label) 

828 pipeline._remove_contracts(label) 

829 

830 # create a copy of the object to iterate over 

831 labeled_subsets = copy.copy(pipeline.labeled_subsets) 

832 # remove any labeled subsets that no longer have a complete set 

833 for label, labeled_subset in labeled_subsets.items(): 

834 if labeled_subset.subset - pipeline.tasks.keys(): 

835 pipeline.labeled_subsets.pop(label) 

836 

837 return pipeline 

838 

839 @classmethod 

840 def from_string(cls, pipeline_string: str) -> PipelineIR: 

841 """Create a `PipelineIR` object from a string formatted like a pipeline 

842 document 

843 

844 Parameters 

845 ---------- 

846 pipeline_string : `str` 

847 A string that is formatted like a pipeline document 

848 """ 

849 loaded_yaml = yaml.load(pipeline_string, Loader=PipelineYamlLoader) 

850 return cls(loaded_yaml) 

851 
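An illustrative sketch (not part of the original module): parsing a minimal pipeline document from a string and then subsetting it; the task class paths are hypothetical.

    pipeline_ir = PipelineIR.from_string(
        "description: demo\n"
        "tasks:\n"
        "  taskA: hypothetical.module.TaskA\n"
        "  taskB: hypothetical.module.TaskB\n"
    )
    print(list(pipeline_ir.tasks))                                # ['taskA', 'taskB']
    print(list(pipeline_ir.subset_from_labels({"taskA"}).tasks))  # ['taskA']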

852 @classmethod 

853 @deprecated( 

854 reason="This has been replaced with `from_uri` and will be removed after v23", 

855 version="v21.0", 

856 category=FutureWarning, 

857 ) 

858 def from_file(cls, filename: str) -> PipelineIR: 

859 """Create a `PipelineIR` object from the document specified by the 

860 input path. 

861 

862 Parameters 

863 ---------- 

864 filename : `str` 

865 Location of document to use in creating a `PipelineIR` object. 

866 

867 Returns 

868 ------- 

869 pipelineIR : `PipelineIR` 

870 The loaded pipeline 

871 

872 Notes 

873 ----- 

874 This method is deprecated; please use `from_uri` instead. 

875 """ 

876 return cls.from_uri(filename) 

877 

878 @classmethod 

879 def from_uri(cls, uri: ResourcePathExpression) -> PipelineIR: 

880 """Create a `PipelineIR` object from the document specified by the 

881 input uri. 

882 

883 Parameters 

884 ---------- 

885 uri: convertible to `ResourcePath` 

886 Location of document to use in creating a `PipelineIR` object. 

887 

888 Returns 

889 ------- 

890 pipelineIR : `PipelineIR` 

891 The loaded pipeline 

892 """ 

893 loaded_uri = ResourcePath(uri) 

894 with loaded_uri.open("r") as buffer: 

895 # explicitly read here, there was some issue with yaml trying 

896 # to read the ResourcePath itself (I think because it only 

897 # pretends to be conformant to the io api) 

898 loaded_yaml = yaml.load(buffer.read(), Loader=PipelineYamlLoader) 

899 return cls(loaded_yaml) 

900 

901 @deprecated( 

902 reason="This has been replaced with `write_to_uri` and will be removed after v23", 

903 version="v21.0", 

904 category=FutureWarning, 

905 ) # type: ignore 

906 def to_file(self, filename: str): 

907 """Serialize this `PipelineIR` object into a yaml formatted string and 

908 write the output to a file at the specified path. 

909 

910 Parameters 

911 ---------- 

912 filename : `str` 

913 Location of document to write a `PipelineIR` object. 

914 """ 

915 self.write_to_uri(filename) 

916 

917 def write_to_uri( 

918 self, 

919 uri: ResourcePathExpression, 

920 ) -> None: 

921 """Serialize this `PipelineIR` object into a yaml formatted string and 

922 write the output to a file at the specified uri. 

923 

924 Parameters 

925 ---------- 

926 uri: convertible to `ResourcePath` 

927 Location of document to write a `PipelineIR` object. 

928 """ 

929 with ResourcePath(uri).open("w") as buffer: 

930 yaml.dump(self.to_primitives(), buffer, sort_keys=False, Dumper=MultilineStringDumper) 

931 
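An illustrative sketch (not part of the original module): a round trip through a local file; the path and task class are hypothetical.

    pipeline_ir = PipelineIR.from_string(
        "description: round-trip demo\ntasks:\n  taskA: hypothetical.module.TaskA\n"
    )
    pipeline_ir.write_to_uri("/tmp/demo_pipeline.yaml")
    assert PipelineIR.from_uri("/tmp/demo_pipeline.yaml") == pipeline_ir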

932 def to_primitives(self) -> Dict[str, Any]: 

933 """Convert to a representation used in yaml serialization""" 

934 accumulate = {"description": self.description} 

935 if self.instrument is not None: 

936 accumulate["instrument"] = self.instrument 

937 if self.parameters: 

938 accumulate["parameters"] = self._sort_by_str(self.parameters.to_primitives()) 

939 accumulate["tasks"] = {m: t.to_primitives() for m, t in self.tasks.items()} 

940 if len(self.contracts) > 0: 

941 # sort contracts lexicographical order by the contract string in 

942 # absence of any other ordering principle 

943 contracts_list = [c.to_primitives() for c in self.contracts] 

944 contracts_list.sort(key=lambda x: x["contract"]) 

945 accumulate["contracts"] = contracts_list 

946 if self.labeled_subsets: 

947 accumulate["subsets"] = self._sort_by_str( 

948 {k: v.to_primitives() for k, v in self.labeled_subsets.items()} 

949 ) 

950 return accumulate 

951 

952 def reorder_tasks(self, task_labels: List[str]) -> None: 

953 """Changes the order tasks are stored internally. Useful for 

954 determining the order things will appear in the serialized (or printed) 

955 form. 

956 

957 Parameters 

958 ---------- 

959 task_labels : `list` of `str` 

960 A list containing all the labels in the pipeline, in the order in 

961 which the tasks are to be stored. 

962 

963 Raises 

964 ------ 

965 KeyError 

966 Raised if labels are supplied that are not in the pipeline, or if 

967 not all labels in the pipeline were supplied in task_labels input. 

968 """ 

969 # verify that all labels are in the input 

970 _tmp_set = set(task_labels) 

971 if remainder := (self.tasks.keys() - _tmp_set): 

972 raise KeyError(f"Label(s) {remainder} are missing from the task label list") 

973 if extra := (_tmp_set - self.tasks.keys()): 

974 raise KeyError(f"Extra label(s) {extra} were in the input and are not in the pipeline") 

975 

976 newTasks = {key: self.tasks[key] for key in task_labels} 

977 self.tasks = newTasks 

978 

979 @staticmethod 

980 def _sort_by_str(arg: Mapping[str, Any]) -> Mapping[str, Any]: 

981 keys = sorted(arg.keys()) 

982 return {key: arg[key] for key in keys} 

983 

984 def __str__(self) -> str: 

985 """Instance formatting as how it would look in yaml representation""" 

986 return yaml.dump(self.to_primitives(), sort_keys=False, Dumper=MultilineStringDumper) 

987 

988 def __repr__(self) -> str: 

989 """Instance formatting as how it would look in yaml representation""" 

990 return str(self) 

991 

992 def __eq__(self, other: object) -> bool: 

993 if not isinstance(other, PipelineIR): 

994 return False 

995 # special case contracts because it is a list, but order is not 

996 # important 

997 elif ( 

998 all( 

999 getattr(self, attr) == getattr(other, attr) 

1000 for attr in ("tasks", "instrument", "labeled_subsets", "parameters") 

1001 ) 

1002 and len(self.contracts) == len(other.contracts) 

1003 and all(c in self.contracts for c in other.contracts) 

1004 ): 

1005 return True 

1006 else: 

1007 return False