# This file is part of pipe_base.
#
# Developed for the LSST Data Management System.
# This product includes software developed by the LSST Project
# (http://www.lsst.org).
# See the COPYRIGHT file at the top-level directory of this distribution
# for details of code ownership.
#
# This program is free software: you can redistribute it and/or modify
# it under the terms of the GNU General Public License as published by
# the Free Software Foundation, either version 3 of the License, or
# (at your option) any later version.
#
# This program is distributed in the hope that it will be useful,
# but WITHOUT ANY WARRANTY; without even the implied warranty of
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
# GNU General Public License for more details.
#
# You should have received a copy of the GNU General Public License
# along with this program. If not, see <http://www.gnu.org/licenses/>.
from __future__ import annotations

__all__ = ("ConfigIR", "ContractError", "ContractIR", "ImportIR", "PipelineIR", "TaskIR", "LabeledSubset")

from collections import Counter
from collections.abc import Iterable as abcIterable
from dataclasses import dataclass, field
from typing import Any, List, Set, Union, Generator, MutableMapping, Optional, Dict, Type

import copy
import re
import os
import yaml
import warnings


class KeepInstrument:
    pass


class PipelineYamlLoader(yaml.SafeLoader):
    """This is a specialized version of yaml's SafeLoader. It raises an
    exception if it finds multiple instances of the same key at a given
    scope inside a pipeline file.
    """
    def construct_mapping(self, node, deep=False):
        # Do the call to super first so that it can do all the other forms of
        # checking on this node. Checking the uniqueness of keys first would
        # save the work that super does in the case of a failure, but if the
        # node were the wrong kind of node due to a parsing error, the
        # resulting exception would be difficult to understand.
        mapping = super().construct_mapping(node, deep)
        # Check if there are any duplicate keys
        all_keys = Counter(key_node.value for key_node, _ in node.value)
        duplicates = {k for k, i in all_keys.items() if i != 1}
        if duplicates:
            raise KeyError("Pipeline files must not have duplicated keys, "
                           f"{duplicates} appeared multiple times")
        return mapping


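# A minimal sketch of the duplicate-key check above, kept as a comment so
# module import is unaffected; the document text and label are hypothetical.
#
#     import yaml
#     document = """
#     tasks:
#       taskA: module.TaskA
#       taskA: module.TaskA
#     """
#     yaml.load(document, Loader=PipelineYamlLoader)  # raises KeyError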

class ContractError(Exception):
    """An exception that is raised when a pipeline contract is not satisfied.
    """
    pass


@dataclass
class ContractIR:
    """Intermediate representation of contracts read from a pipeline yaml
    file.
    """
    contract: str
    """A string of python code representing one or more conditions on configs
    in a pipeline. This code-as-string should, once evaluated, be True if the
    configs are fine, and False otherwise.
    """
    msg: Union[str, None] = None
    """An optional message to be shown to the user if a contract fails.
    """

    def to_primitives(self) -> dict:
        """Convert to a representation used in yaml serialization.
        """
        accumulate = {"contract": self.contract}
        if self.msg is not None:
            accumulate['msg'] = self.msg
        return accumulate

    def __eq__(self, other: "ContractIR"):
        if not isinstance(other, ContractIR):
            return False
        elif self.contract == other.contract and self.msg == other.msg:
            return True
        else:
            return False


@dataclass
class LabeledSubset:
    """Intermediate representation of a named subset of task labels read from
    a pipeline yaml file.
    """
    label: str
    """The label used to identify the subset of task labels.
    """
    subset: Set[str]
    """A set of task labels contained in this subset.
    """
    description: Optional[str]
    """A description of what this subset of tasks is intended to do.
    """

    @staticmethod
    def from_primatives(label: str, value: Union[List[str], dict]) -> LabeledSubset:
        """Generate `LabeledSubset` objects given a properly formatted object
        that has been created by a yaml loader.

        Parameters
        ----------
        label : `str`
            The label that will be used to identify this labeled subset.
        value : `list` of `str` or `dict`
            Object returned from loading a labeled subset section from a yaml
            document.

        Returns
        -------
        labeledSubset : `LabeledSubset`
            A `LabeledSubset` object built from the inputs.

        Raises
        ------
        ValueError
            Raised if the value input is not properly formatted for parsing.
        """
        if isinstance(value, MutableMapping):
            subset = value.pop("subset", None)
            if subset is None:
                raise ValueError("If a labeled subset is specified as a mapping, it must contain the key "
                                 "'subset'")
            description = value.pop("description", None)
        elif isinstance(value, abcIterable):
            subset = value
            description = None
        else:
            raise ValueError(f"There was a problem parsing the labeled subset {label}, make sure the "
                             "definition is either a valid yaml list, or a mapping with keys "
                             "(subset, description) where subset points to a yaml list, and description is "
                             "associated with a string")
        return LabeledSubset(label, set(subset), description)

    def to_primitives(self) -> dict:
        """Convert to a representation used in yaml serialization.
        """
        accumulate: Dict[str, Any] = {"subset": list(self.subset)}
        if self.description is not None:
            accumulate["description"] = self.description
        return accumulate


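# Usage sketch for ``from_primatives`` (labels here are hypothetical): both
# forms below parse to the same subset; only the mapping form can carry a
# description.
#
#     LabeledSubset.from_primatives("step1", ["taskA", "taskB"])
#     LabeledSubset.from_primatives(
#         "step1", {"subset": ["taskA", "taskB"], "description": "first step"})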

@dataclass
class ParametersIR:
    """Intermediate representation of parameters that are global to a pipeline.

    These parameters are specified under a top level key named `parameters`
    and are declared as a yaml mapping. These entries can then be used inside
    task configuration blocks to specify configuration values. They may not be
    used in the special ``file`` or ``python`` blocks.

    Example:
    parameters:
      shared_value: 14
    tasks:
      taskA:
        class: modA
        config:
          field1: parameters.shared_value
      taskB:
        class: modB
        config:
          field2: parameters.shared_value
    """
    mapping: MutableMapping[str, str]
    """A mutable mapping of identifiers as keys, and shared configuration
    as values.
    """
    def update(self, other: Optional[ParametersIR]):
        if other is not None:
            self.mapping.update(other.mapping)

    def to_primitives(self) -> MutableMapping[str, str]:
        """Convert to a representation used in yaml serialization.
        """
        return self.mapping

    def __contains__(self, value: str) -> bool:
        return value in self.mapping

    def __getitem__(self, item: str) -> Any:
        return self.mapping[item]

    def __bool__(self) -> bool:
        return bool(self.mapping)


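# A small sketch of the mapping protocol ParametersIR exposes (the parameter
# name and value are hypothetical):
#
#     params = ParametersIR({"shared_value": 14})
#     "shared_value" in params    # True
#     params["shared_value"]      # 14
#     bool(ParametersIR({}))      # False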

@dataclass
class ConfigIR:
    """Intermediate representation of configurations read from a pipeline yaml
    file.
    """
    python: Union[str, None] = None
    """A string of python code that is used to modify a configuration. This can
    also be None if there are no modifications to do.
    """
    dataId: Union[dict, None] = None
    """A dataId that is used to constrain these config overrides to only quanta
    with matching dataIds. This field can be None if there is no constraint.
    This is currently an unimplemented feature, and is placed here for future
    use.
    """
    file: List[str] = field(default_factory=list)
    """A list of paths which point to files containing config overrides to be
    applied. This value may be an empty list if there are no overrides to
    apply.
    """
    rest: dict = field(default_factory=dict)
    """This is a dictionary of key value pairs, where the keys are strings
    corresponding to qualified fields on a config to override, and the values
    are strings representing the values to apply.
    """

    def to_primitives(self) -> dict:
        """Convert to a representation used in yaml serialization.
        """
        accumulate = {}
        for name in ("python", "dataId", "file"):
            # if this attribute is truthy add it to the accumulation
            # dictionary
            if getattr(self, name):
                accumulate[name] = getattr(self, name)
        # Add the dictionary containing the rest of the config keys to the
        # accumulated dictionary
        accumulate.update(self.rest)
        return accumulate

    def formatted(self, parameters: ParametersIR) -> ConfigIR:
        """Returns a new ConfigIR object that is formatted according to the
        specified parameters.

        Parameters
        ----------
        parameters : ParametersIR
            Object that contains variable mappings used in substitution.

        Returns
        -------
        config : ConfigIR
            A new ConfigIR object formatted with the input parameters.
        """
        new_config = copy.deepcopy(self)
        for key, value in new_config.rest.items():
            if not isinstance(value, str):
                continue
            match = re.match("parameters[.](.*)", value)
            if match and match.group(1) in parameters:
                new_config.rest[key] = parameters[match.group(1)]
            if match and match.group(1) not in parameters:
                warnings.warn(f"config {key} contains value {match.group(0)} which is formatted like a "
                              "Pipeline parameter but was not found within the Pipeline, if this was not "
                              "intentional, check for a typo")
        return new_config

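    # Substitution sketch for ``formatted`` (field and parameter names are
    # hypothetical): values in ``rest`` of the form "parameters.<name>" are
    # replaced when <name> is declared; unknown names warn and stay as-is.
    #
    #     params = ParametersIR({"shared_value": 14})
    #     config = ConfigIR(rest={"field1": "parameters.shared_value"})
    #     config.formatted(params).rest["field1"]  # 14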

    def maybe_merge(self, other_config: "ConfigIR") -> Generator["ConfigIR", None, None]:
        """Merges another instance of a `ConfigIR` into this instance if
        possible. This function returns a generator that yields either just
        self if the configs were merged, or self and other_config if they
        could not be merged.

        Parameters
        ----------
        other_config : `ConfigIR`
            An instance of `ConfigIR` to merge into this instance.

        Returns
        -------
        Generator : `ConfigIR`
            A generator yielding either self if the configs were merged, or
            self and other_config if they were not.
        """
        # Verify that the config blocks can be merged
        if self.dataId != other_config.dataId or self.python or other_config.python or\
                self.file or other_config.file:
            yield from (self, other_config)
            return

        # collect the keys shared by both configs, and verify no shared key
        # has different values in the two configs
        key_union = self.rest.keys() & other_config.rest.keys()
        for key in key_union:
            if self.rest[key] != other_config.rest[key]:
                yield from (self, other_config)
                return
        self.rest.update(other_config.rest)

        # Combine the lists of override files to load
        self_file_set = set(self.file)
        other_file_set = set(other_config.file)
        self.file = list(self_file_set.union(other_file_set))

        yield self

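    # Merge sketch (values are hypothetical): blocks with no python or file
    # entries and no conflicting dataIds or keys collapse into one; a python
    # block prevents merging.
    #
    #     a = ConfigIR(rest={"field1": "1"})
    #     b = ConfigIR(rest={"field2": "2"})
    #     list(a.maybe_merge(b))  # [a], with a.rest now holding both keys
    #
    #     c = ConfigIR(python="config.field1 = 1")
    #     list(a.maybe_merge(c))  # [a, c]; not merged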

    def __eq__(self, other: "ConfigIR"):
        if not isinstance(other, ConfigIR):
            return False
        elif all(getattr(self, attr) == getattr(other, attr) for attr in
                 ("python", "dataId", "file", "rest")):
            return True
        else:
            return False


@dataclass
class TaskIR:
    """Intermediate representation of tasks read from a pipeline yaml file.
    """
    label: str
    """An identifier used to refer to a task.
    """
    klass: str
    """A string containing a fully qualified python class to be run in a
    pipeline.
    """
    config: Union[List[ConfigIR], None] = None
    """List of all config overrides associated with this task; may be
    `None` if there are no config overrides.
    """

    def to_primitives(self) -> dict:
        """Convert to a representation used in yaml serialization.
        """
        accumulate = {'class': self.klass}
        if self.config:
            accumulate['config'] = [c.to_primitives() for c in self.config]
        return accumulate

    def add_or_update_config(self, other_config: ConfigIR):
        """Adds a `ConfigIR` to this task if one is not present. Merges configs
        if there is a `ConfigIR` present and the dataId keys of both configs
        match, otherwise adds a new entry to the config list. The exception to
        the above is that if either the last config or other_config has a
        python block, then other_config is always added, as python blocks can
        modify configs in ways that cannot be predicted.

        Parameters
        ----------
        other_config : `ConfigIR`
            A `ConfigIR` instance to add or merge into the config attribute of
            this task.
        """
        if not self.config:
            self.config = [other_config]
            return
        self.config.extend(self.config.pop().maybe_merge(other_config))

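    # Sketch of the add-or-merge behavior (values are hypothetical): the new
    # config is merged into the last entry when maybe_merge allows it,
    # otherwise it is appended.
    #
    #     task = TaskIR("taskA", "module.TaskA")
    #     task.add_or_update_config(ConfigIR(rest={"field1": "1"}))
    #     task.add_or_update_config(ConfigIR(rest={"field2": "2"}))
    #     # task.config is a single merged ConfigIR holding both fields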

    def __eq__(self, other: "TaskIR"):
        if not isinstance(other, TaskIR):
            return False
        elif all(getattr(self, attr) == getattr(other, attr) for attr in
                 ("label", "klass", "config")):
            return True
        else:
            return False


@dataclass
class ImportIR:
    """An intermediate representation of imported pipelines.
    """
    location: str
    """This is the location of the pipeline to inherit. The path should be
    specified as an absolute path. Environment variables may be used in the
    path and should be specified as a python string template, with the name of
    the environment variable inside braces.
    """
    include: Union[List[str], None] = None
    """List of tasks that should be included when inheriting this pipeline.
    Either the include or exclude attributes may be specified, but not both.
    """
    exclude: Union[List[str], None] = None
    """List of tasks that should be excluded when inheriting this pipeline.
    Either the include or exclude attributes may be specified, but not both.
    """
    importContracts: bool = True
    """Boolean attribute to dictate if contracts should be inherited with the
    pipeline or not.
    """
    instrument: Union[Type[KeepInstrument], str, None] = KeepInstrument
    """Instrument to assign to the Pipeline at import. The default value of
    `KeepInstrument` indicates that whatever instrument the pipeline is
    declared with will not be modified. Setting this value to None will drop
    any declared instrument prior to import.
    """

    def toPipelineIR(self) -> "PipelineIR":
        """Load in the Pipeline specified by this object, and turn it into a
        PipelineIR instance.

        Returns
        -------
        pipeline : `PipelineIR`
            A pipeline generated from the imported pipeline file.
        """
        if self.include and self.exclude:
            raise ValueError("Both an include and an exclude list cannot be specified"
                             " when declaring a pipeline import")
        tmp_pipeline = PipelineIR.from_file(os.path.expandvars(self.location))
        if self.instrument is not KeepInstrument:
            tmp_pipeline.instrument = self.instrument

        included_labels = set()
        for label in tmp_pipeline.tasks:
            if (self.include and label in self.include) or (self.exclude and label not in self.exclude)\
                    or (self.include is None and self.exclude is None):
                included_labels.add(label)

        # Handle labeled subsets being specified in the include or exclude
        # list, adding or removing labels.
        if self.include is not None:
            subsets_in_include = tmp_pipeline.labeled_subsets.keys() & self.include
            for label in subsets_in_include:
                included_labels.update(tmp_pipeline.labeled_subsets[label].subset)

        elif self.exclude is not None:
            subsets_in_exclude = tmp_pipeline.labeled_subsets.keys() & self.exclude
            for label in subsets_in_exclude:
                included_labels.difference_update(tmp_pipeline.labeled_subsets[label].subset)

        tmp_pipeline = tmp_pipeline.subset_from_labels(included_labels)

        if not self.importContracts:
            tmp_pipeline.contracts = []

        return tmp_pipeline

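    # Import sketch (the location and labels are hypothetical): load a
    # pipeline file, keep only the listed task labels, and drop its contracts.
    #
    #     imported = ImportIR(
    #         location="${SOME_PACKAGE_DIR}/pipelines/example.yaml",
    #         include=["taskA", "taskB"],
    #         importContracts=False,
    #     ).toPipelineIR()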

    def __eq__(self, other: "ImportIR"):
        if not isinstance(other, ImportIR):
            return False
        elif all(getattr(self, attr) == getattr(other, attr) for attr in
                 ("location", "include", "exclude", "importContracts")):
            return True
        else:
            return False


class PipelineIR:
    """Intermediate representation of a pipeline definition.

    Parameters
    ----------
    loaded_yaml : `dict`
        A dictionary which matches the structure that would be produced by a
        yaml reader which parses a pipeline definition document.

    Raises
    ------
    ValueError
        - If a pipeline is declared without a description
        - If no tasks are declared in a pipeline, and no pipelines are to be
          inherited
        - If more than one instrument is specified
        - If more than one inherited pipeline shares a label
    """
    def __init__(self, loaded_yaml):
        # Check required fields are present
        if "description" not in loaded_yaml:
            raise ValueError("A pipeline must be declared with a description")
        if "tasks" not in loaded_yaml and len({"imports", "inherits"} - loaded_yaml.keys()) == 2:
            raise ValueError("A pipeline must be declared with one or more tasks")

        # These steps below must happen in this call order

        # Process pipeline description
        self.description = loaded_yaml.pop("description")

        # Process tasks
        self._read_tasks(loaded_yaml)

        # Process instrument keys
        inst = loaded_yaml.pop("instrument", None)
        if isinstance(inst, list):
            raise ValueError("Only one top level instrument can be defined in a pipeline")
        self.instrument = inst

        # Process any contracts
        self._read_contracts(loaded_yaml)

        # Process any defined parameters
        self._read_parameters(loaded_yaml)

        # Process any named label subsets
        self._read_labeled_subsets(loaded_yaml)

        # Process any inherited pipelines
        self._read_imports(loaded_yaml)

        # verify named subsets, must be done after inheriting
        self._verify_labeled_subsets()

    def _read_contracts(self, loaded_yaml):
        """Process the contracts portion of the loaded yaml document

        Parameters
        ----------
        loaded_yaml : `dict`
            A dictionary which matches the structure that would be produced by
            a yaml reader which parses a pipeline definition document
        """
        loaded_contracts = loaded_yaml.pop("contracts", [])
        if isinstance(loaded_contracts, str):
            loaded_contracts = [loaded_contracts]
        self.contracts = []
        for contract in loaded_contracts:
            if isinstance(contract, dict):
                self.contracts.append(ContractIR(**contract))
            if isinstance(contract, str):
                self.contracts.append(ContractIR(contract=contract))

    def _read_parameters(self, loaded_yaml):
        """Process the parameters portion of the loaded yaml document

        Parameters
        ----------
        loaded_yaml : `dict`
            A dictionary which matches the structure that would be produced by
            a yaml reader which parses a pipeline definition document
        """
        loaded_parameters = loaded_yaml.pop("parameters", {})
        if not isinstance(loaded_parameters, dict):
            raise ValueError("The parameters section must be a yaml mapping")
        self.parameters = ParametersIR(loaded_parameters)

    def _read_labeled_subsets(self, loaded_yaml: dict):
        """Process the subsets portion of the loaded yaml document

        Parameters
        ----------
        loaded_yaml : `MutableMapping`
            A dictionary which matches the structure that would be produced
            by a yaml reader which parses a pipeline definition document
        """
        loaded_subsets = loaded_yaml.pop("subsets", {})
        self.labeled_subsets = {}
        if not loaded_subsets and "subset" in loaded_yaml:
            raise ValueError("Top level key should be subsets and not subset, add an s")
        for key, value in loaded_subsets.items():
            self.labeled_subsets[key] = LabeledSubset.from_primatives(key, value)

    def _verify_labeled_subsets(self):
        """Verifies that all the labels in each named subset exist within the
        pipeline.
        """
        # Verify that all labels defined in a labeled subset are in the
        # Pipeline
        for labeled_subset in self.labeled_subsets.values():
            if not labeled_subset.subset.issubset(self.tasks.keys()):
                raise ValueError(f"Labels {labeled_subset.subset - self.tasks.keys()} were not found in the "
                                 "declared pipeline")
        # Verify subset labels are not already task labels
        label_intersection = self.labeled_subsets.keys() & self.tasks.keys()
        if label_intersection:
            raise ValueError(f"Labeled subsets can not use the same label as a task: {label_intersection}")

    def _read_imports(self, loaded_yaml):
        """Process the imports (or deprecated inherits) portion of the loaded
        yaml document

        Parameters
        ----------
        loaded_yaml : `dict`
            A dictionary which matches the structure that would be produced by
            a yaml reader which parses a pipeline definition document
        """
        def process_args(argument: Union[str, dict]) -> dict:
            if isinstance(argument, str):
                return {"location": argument}
            elif isinstance(argument, dict):
                if "exclude" in argument and isinstance(argument["exclude"], str):
                    argument["exclude"] = [argument["exclude"]]
                if "include" in argument and isinstance(argument["include"], str):
                    argument["include"] = [argument["include"]]
                if "instrument" in argument and argument["instrument"] == "None":
                    argument["instrument"] = None
                return argument
        if not {"inherits", "imports"} - loaded_yaml.keys():
            raise ValueError("Cannot define both inherits and imports sections, use imports")
        tmp_import = loaded_yaml.pop("inherits", None)
        if tmp_import is None:
            tmp_import = loaded_yaml.pop("imports", None)
        else:
            warnings.warn("The 'inherits' key is deprecated, and will be "
                          "removed around June 2021. Please use the key "
                          "'imports' instead")
        if tmp_import is None:
            self.imports = []
        elif isinstance(tmp_import, list):
            self.imports = [ImportIR(**process_args(args)) for args in tmp_import]
        else:
            self.imports = [ImportIR(**process_args(tmp_import))]

        # integrate any imported pipelines
        accumulate_tasks = {}
        accumulate_labeled_subsets = {}
        accumulated_parameters = ParametersIR({})
        for other_pipeline in self.imports:
            tmp_IR = other_pipeline.toPipelineIR()
            if self.instrument is None:
                self.instrument = tmp_IR.instrument
            elif self.instrument != tmp_IR.instrument and tmp_IR.instrument is not None:
                raise ValueError("Only one instrument can be declared in a pipeline or its imports")
            if accumulate_tasks.keys() & tmp_IR.tasks.keys():
                raise ValueError("Task labels in the imported pipelines must "
                                 "be unique")
            accumulate_tasks.update(tmp_IR.tasks)
            self.contracts.extend(tmp_IR.contracts)
            # verify that tmp_IR has unique labels for its named subsets among
            # existing labeled subsets, and with existing task labels.
            overlapping_subsets = accumulate_labeled_subsets.keys() & tmp_IR.labeled_subsets.keys()
            task_subset_overlap = ((accumulate_labeled_subsets.keys() | tmp_IR.labeled_subsets.keys())
                                   & accumulate_tasks.keys())
            if overlapping_subsets or task_subset_overlap:
                raise ValueError("Labeled subset names must be unique amongst imports in both labels and "
                                 f"named subsets. Duplicate: {overlapping_subsets | task_subset_overlap}")
            accumulate_labeled_subsets.update(tmp_IR.labeled_subsets)
            accumulated_parameters.update(tmp_IR.parameters)

        # verify that any accumulated labeled subsets don't clash with a label
        # from this pipeline
        if accumulate_labeled_subsets.keys() & self.tasks.keys():
            raise ValueError("Labeled subset names must be unique amongst imports in both labels and "
                             "named subsets")
        # merge in the named subsets for self so this document can override any
        # that have been declared
        accumulate_labeled_subsets.update(self.labeled_subsets)
        self.labeled_subsets = accumulate_labeled_subsets

        # merge the dict of label:TaskIR objects, preserving any configs in the
        # imported pipeline if the labels point to the same class
        for label, task in self.tasks.items():
            if label not in accumulate_tasks:
                accumulate_tasks[label] = task
            elif accumulate_tasks[label].klass == task.klass:
                if task.config is not None:
                    for config in task.config:
                        accumulate_tasks[label].add_or_update_config(config)
            else:
                accumulate_tasks[label] = task
        self.tasks = accumulate_tasks
        self.parameters.update(accumulated_parameters)

    def _read_tasks(self, loaded_yaml):
        """Process the tasks portion of the loaded yaml document

        Parameters
        ----------
        loaded_yaml : `dict`
            A dictionary which matches the structure that would be produced by
            a yaml reader which parses a pipeline definition document
        """
        self.tasks = {}
        tmp_tasks = loaded_yaml.pop("tasks", None)
        if tmp_tasks is None:
            tmp_tasks = {}

        if "parameters" in tmp_tasks:
            raise ValueError("parameters is a reserved word and cannot be used as a task label")

        for label, definition in tmp_tasks.items():
            if isinstance(definition, str):
                definition = {"class": definition}
            config = definition.get('config', None)
            if config is None:
                task_config_ir = None
            else:
                if isinstance(config, dict):
                    config = [config]
                task_config_ir = []
                for c in config:
                    file = c.pop("file", None)
                    if file is None:
                        file = []
                    elif not isinstance(file, list):
                        file = [file]
                    task_config_ir.append(ConfigIR(python=c.pop("python", None),
                                                   dataId=c.pop("dataId", None),
                                                   file=file,
                                                   rest=c))
            self.tasks[label] = TaskIR(label, definition["class"], task_config_ir)

    def _remove_contracts(self, label: str):
        """Remove any contracts that contain the given label

        String comparison used in this way is not the most elegant and may
        have issues, but it is the only feasible way when users can specify
        contracts with generic strings.
        """
        new_contracts = []
        for contract in self.contracts:
            # match a label that is not preceded by an ASCII identifier
            # character (or is at the start of the string) and is followed
            # by a dot
            if re.match(f".*([^A-Za-z0-9_]|^){label}[.]", contract.contract):
                continue
            new_contracts.append(contract)
        self.contracts = new_contracts

    def subset_from_labels(self, labelSpecifier: Set[str]) -> PipelineIR:
        """Subset a pipelineIR to contain only labels specified in
        labelSpecifier.

        Parameters
        ----------
        labelSpecifier : `set` of `str`
            Set containing labels that describe how to subset a pipeline.

        Returns
        -------
        pipeline : `PipelineIR`
            A new pipelineIR object that is a subset of the old pipelineIR.

        Raises
        ------
        ValueError
            Raised if there is an issue with specified labels.

        Notes
        -----
        This method attempts to prune any contracts that contain labels which
        are not in the declared subset of labels. This pruning is done using
        string-based matching due to the nature of contracts and may prune more
        than it should. Any labeled subsets defined that no longer have all
        members of the subset present in the pipeline will be removed from the
        resulting pipeline.
        """

        pipeline = copy.deepcopy(self)

        # update the label specifier to expand any named subsets
        toRemove = set()
        toAdd = set()
        for label in labelSpecifier:
            if label in pipeline.labeled_subsets:
                toRemove.add(label)
                toAdd.update(pipeline.labeled_subsets[label].subset)
        labelSpecifier.difference_update(toRemove)
        labelSpecifier.update(toAdd)
        # verify all the labels are in the pipeline
        if not labelSpecifier.issubset(pipeline.tasks.keys()
                                       | pipeline.labeled_subsets):
            difference = labelSpecifier.difference(pipeline.tasks.keys())
            raise ValueError("Not all supplied labels (specified or named subsets) are in the pipeline "
                             f"definition, extra labels: {difference}")
        # copy needed so as to not modify while iterating
        pipeline_labels = set(pipeline.tasks.keys())
        # Remove the labels from the pipelineIR, and any contracts that contain
        # those labels (see docstring on _remove_contracts for why this may
        # cause issues)
        for label in pipeline_labels:
            if label not in labelSpecifier:
                pipeline.tasks.pop(label)
                pipeline._remove_contracts(label)

        # create a copy of the object to iterate over
        labeled_subsets = copy.copy(pipeline.labeled_subsets)
        # remove any labeled subsets that no longer have a complete set
        for label, labeled_subset in labeled_subsets.items():
            if labeled_subset.subset - pipeline.tasks.keys():
                pipeline.labeled_subsets.pop(label)

        return pipeline

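    # Subsetting sketch (labels are hypothetical): named subsets in the
    # specifier are expanded to their member task labels before pruning.
    #
    #     subset_pipeline = pipeline.subset_from_labels({"taskA", "step1"})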

    @classmethod
    def from_string(cls, pipeline_string: str):
        """Create a `PipelineIR` object from a string formatted like a pipeline
        document

        Parameters
        ----------
        pipeline_string : `str`
            A string that is formatted like a pipeline document
        """
        loaded_yaml = yaml.load(pipeline_string, Loader=PipelineYamlLoader)
        return cls(loaded_yaml)

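    # A minimal document this parser accepts (the task label and class are
    # hypothetical); a description and at least one task or import are
    # required.
    #
    #     pipeline = PipelineIR.from_string(
    #         "description: An example pipeline\n"
    #         "tasks:\n"
    #         "  taskA: module.TaskA\n"
    #     )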

    @classmethod
    def from_file(cls, filename: str):
        """Create a `PipelineIR` object from the document specified by the
        input path.

        Parameters
        ----------
        filename : `str`
            Location of document to use in creating a `PipelineIR` object.
        """
        with open(filename, 'r') as f:
            loaded_yaml = yaml.load(f, Loader=PipelineYamlLoader)
        return cls(loaded_yaml)

    def to_file(self, filename: str):
        """Serialize this `PipelineIR` object into a yaml formatted string and
        write the output to a file at the specified path.

        Parameters
        ----------
        filename : `str`
            Location at which to write the serialized `PipelineIR` document.
        """
        with open(filename, 'w') as f:
            yaml.dump(self.to_primitives(), f, sort_keys=False)

    def to_primitives(self):
        """Convert to a representation used in yaml serialization.
        """
        accumulate = {"description": self.description}
        if self.instrument is not None:
            accumulate['instrument'] = self.instrument
        if self.parameters:
            accumulate['parameters'] = self.parameters.to_primitives()
        accumulate['tasks'] = {m: t.to_primitives() for m, t in self.tasks.items()}
        if len(self.contracts) > 0:
            accumulate['contracts'] = [c.to_primitives() for c in self.contracts]
        if self.labeled_subsets:
            accumulate['subsets'] = {k: v.to_primitives() for k, v in self.labeled_subsets.items()}
        return accumulate

    def __str__(self) -> str:
        """Format the instance as it would appear in its yaml representation.
        """
        return yaml.dump(self.to_primitives(), sort_keys=False)

    def __repr__(self) -> str:
        """Format the instance as it would appear in its yaml representation.
        """
        return str(self)

    def __eq__(self, other: "PipelineIR"):
        if not isinstance(other, PipelineIR):
            return False
        elif all(getattr(self, attr) == getattr(other, attr) for attr in
                 ("contracts", "tasks", "instrument")):
            return True
        else:
            return False