# This file is part of pipe_base.
#
# Developed for the LSST Data Management System.
# This product includes software developed by the LSST Project
# (http://www.lsst.org).
# See the COPYRIGHT file at the top-level directory of this distribution
# for details of code ownership.
#
# This program is free software: you can redistribute it and/or modify
# it under the terms of the GNU General Public License as published by
# the Free Software Foundation, either version 3 of the License, or
# (at your option) any later version.
#
# This program is distributed in the hope that it will be useful,
# but WITHOUT ANY WARRANTY; without even the implied warranty of
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
# GNU General Public License for more details.
#
# You should have received a copy of the GNU General Public License
# along with this program. If not, see <http://www.gnu.org/licenses/>.
from __future__ import annotations

__all__ = (
    "ConfigIR",
    "ContractError",
    "ContractIR",
    "ImportIR",
    "LabeledSubset",
    "ParametersIR",
    "PipelineIR",
    "TaskIR",
)

import copy
import enum
import os
import re
import warnings
from collections import Counter
from collections.abc import Generator, Hashable, Iterable, MutableMapping
from dataclasses import dataclass, field
from typing import Any, Literal

import yaml
from lsst.resources import ResourcePath, ResourcePathExpression


class _Tags(enum.Enum):
    KeepInstrument = enum.auto()


class PipelineYamlLoader(yaml.SafeLoader):
53 """Specialized version of yaml's SafeLoader. 

54 

55 It checks and raises an exception if it finds that there are multiple 

56 instances of the same key found inside a pipeline file at a given scope. 
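
    Examples
    --------
    A minimal sketch of the duplicate-key check; a document such as this
    hypothetical one, with the same key twice at one scope:

    .. code-block:: yaml

        tasks: {}
        tasks: {}

    is rejected with a `KeyError` when loaded with this loader.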

57 """ 

58 

59 def construct_mapping(self, node: yaml.MappingNode, deep: bool = False) -> dict[Hashable, Any]: 

60 # do the call to super first so that it can do all the other forms of 

61 # checking on this node. If you check the uniqueness of keys first 

62 # it would save the work that super does in the case of a failure, but 

63 # it might fail in the case that the node was the incorrect node due 

64 # to a parsing error, and the resulting exception would be difficult to 

65 # understand. 

66 mapping = super().construct_mapping(node, deep) 

67 # Check if there are any duplicate keys 

68 all_keys = Counter(key_node.value for key_node, _ in node.value) 

69 duplicates = {k for k, i in all_keys.items() if i != 1} 

70 if duplicates: 

71 raise KeyError( 

72 f"Pipeline files must not have duplicated keys, {duplicates} appeared multiple times" 

73 ) 

74 return mapping 

75 

76 

77class MultilineStringDumper(yaml.Dumper): 

78 """Custom YAML dumper that makes multi-line strings use the '|' 

79 continuation style instead of unreadable newlines and tons of quotes. 

80 

81 Basic approach is taken from 

82 https://stackoverflow.com/questions/8640959/how-can-i-control-what-scalar-form-pyyaml-uses-for-my-data, 

83 but is written as a Dumper subclass to make its effects non-global (vs 

84 `yaml.add_representer`). 
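
    Examples
    --------
    A minimal sketch; dumping a mapping whose string value spans multiple
    lines, e.g. ``yaml.dump({"key": value}, Dumper=MultilineStringDumper)``,
    produces the block style:

    .. code-block:: yaml

        key: |-
          line one
          line two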

85 """ 

86 

87 def represent_scalar(self, tag: str, value: Any, style: str | None = None) -> yaml.ScalarNode: 

88 if style is None and tag == "tag:yaml.org,2002:str" and len(value.splitlines()) > 1: 

89 style = "|" 

90 return super().represent_scalar(tag, value, style) 

91 

92 

93class ContractError(Exception): 

94 """An exception that is raised when a pipeline contract is not satisfied""" 

95 

96 pass 

97 

98 

99@dataclass 

100class ContractIR: 

101 """Intermediate representation of configuration contracts read from a 

102 pipeline yaml file. 
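
    Examples
    --------
    A contract as it might appear in a pipeline document (the task labels
    and config fields here are hypothetical):

    .. code-block:: yaml

        contracts:
            - contract: taskA.field1 == taskB.field2
              msg: taskA and taskB must be configured consistently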

103 """ 

104 

105 contract: str 

106 """A string of python code representing one or more conditions on configs 

107 in a pipeline. This code-as-string should, once evaluated, should be True 

108 if the configs are fine, and False otherwise. 

109 """ 

110 msg: str | None = None 

111 """An optional message to be shown to the user if a contract fails 

112 """ 


    def to_primitives(self) -> dict[str, str]:
        """Convert to a representation used in yaml serialization."""
        accumulate = {"contract": self.contract}
        if self.msg is not None:
            accumulate["msg"] = self.msg
        return accumulate

    def __eq__(self, other: object) -> bool:
        if not isinstance(other, ContractIR):
            return False
        elif self.contract == other.contract and self.msg == other.msg:
            return True
        else:
            return False


@dataclass
class LabeledSubset:
    """Intermediate representation of a named subset of task labels read
    from a pipeline yaml file.
    """

    label: str
    """The label used to identify the subset of task labels.
    """
    subset: set[str]
    """A set of task labels contained in this subset.
    """
    description: str | None
    """A description of what this subset of tasks is intended to do.
    """

    @staticmethod
    def from_primitives(label: str, value: list[str] | dict) -> LabeledSubset:
148 """Generate `LabeledSubset` objects given a properly formatted object 

149 that as been created by a yaml loader. 

150 

151 Parameters 

152 ---------- 

153 label : `str` 

154 The label that will be used to identify this labeled subset. 

155 value : `list` of `str` or `dict` 

156 Object returned from loading a labeled subset section from a yaml 

157 document. 

158 

159 Returns 

160 ------- 

161 labeledSubset : `LabeledSubset` 

162 A `LabeledSubset` object build from the inputs. 

163 

164 Raises 

165 ------ 

166 ValueError 

167 Raised if the value input is not properly formatted for parsing 
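
        Examples
        --------
        Both of these yaml forms produce an equivalent subset (the labels
        here are hypothetical):

        .. code-block:: yaml

            subsets:
                listForm: [taskA, taskB]
                mappingForm:
                    subset: [taskA, taskB]
                    description: An example labeled subset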

168 """ 

169 if isinstance(value, MutableMapping): 

170 subset = value.pop("subset", None) 

171 if subset is None: 

172 raise ValueError( 

173 "If a labeled subset is specified as a mapping, it must contain the key 'subset'" 

174 ) 

175 description = value.pop("description", None) 

176 elif isinstance(value, Iterable): 

177 subset = value 

178 description = None 

179 else: 

180 raise ValueError( 

181 f"There was a problem parsing the labeled subset {label}, make sure the " 

182 "definition is either a valid yaml list, or a mapping with keys " 

183 "(subset, description) where subset points to a yaml list, and description is " 

184 "associated with a string" 

185 ) 

186 return LabeledSubset(label, set(subset), description) 


    def to_primitives(self) -> dict[str, list[str] | str]:
        """Convert to a representation used in yaml serialization."""
        accumulate: dict[str, list[str] | str] = {"subset": list(self.subset)}
        if self.description is not None:
            accumulate["description"] = self.description
        return accumulate


@dataclass
class ParametersIR:  # noqa: D405,D406,D407,D214 ("parameters" in code block)
    """Intermediate representation of parameters that are global to a pipeline.

    Notes
    -----
    These parameters are specified under a top level key named ``parameters``
    and are declared as a yaml mapping. These entries can then be used inside
    task configuration blocks to specify configuration values. They may not
    be used in the special ``file`` or ``python`` blocks.

    Examples
    --------
    .. code-block:: yaml

        parameters:
            shared_value: 14
        tasks:
            taskA:
                class: modA
                config:
                    field1: parameters.shared_value
            taskB:
                class: modB
                config:
                    field2: parameters.shared_value
    """

    mapping: MutableMapping[str, str]
    """A mutable mapping of identifiers as keys, and shared configuration
    as values.
    """

    def update(self, other: ParametersIR | None) -> None:
        if other is not None:
            self.mapping.update(other.mapping)

    def to_primitives(self) -> MutableMapping[str, str]:
        """Convert to a representation used in yaml serialization."""
        return self.mapping

    def __contains__(self, value: str) -> bool:
        return value in self.mapping

    def __getitem__(self, item: str) -> Any:
        return self.mapping[item]

    def __bool__(self) -> bool:
        return bool(self.mapping)


@dataclass
class ConfigIR:
    """Intermediate representation of configurations read from a pipeline
    yaml file.
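
    Examples
    --------
    A config block as it might appear under a task declaration (the fields
    and file path here are hypothetical):

    .. code-block:: yaml

        config:
            file: $SOME_DIR/overrides.py
            python: config.field1 = 2
            field2: a_value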

251 """ 

252 

253 python: str | None = None 

254 """A string of python code that is used to modify a configuration. This can 

255 also be None if there are no modifications to do. 

256 """ 

257 dataId: dict | None = None 

258 """A dataId that is used to constrain these config overrides to only quanta 

259 with matching dataIds. This field can be None if there is no constraint. 

260 This is currently an unimplemented feature, and is placed here for future 

261 use. 

262 """ 

263 file: list[str] = field(default_factory=list) 

264 """A list of paths which points to a file containing config overrides to be 

265 applied. This value may be an empty list if there are no overrides to 

266 apply. 

267 """ 

268 rest: dict = field(default_factory=dict) 

269 """This is a dictionary of key value pairs, where the keys are strings 

270 corresponding to qualified fields on a config to override, and the values 

271 are strings representing the values to apply. 

272 """ 

273 

274 def to_primitives(self) -> dict[str, str | dict | list[str]]: 

275 """Convert to a representation used in yaml serialization""" 

276 accumulate = {} 

277 for name in ("python", "dataId", "file"): 

278 # if this attribute is thruthy add it to the accumulation 

279 # dictionary 

280 if getattr(self, name): 

281 accumulate[name] = getattr(self, name) 

282 # Add the dictionary containing the rest of the config keys to the 

283 # # accumulated dictionary 

284 accumulate.update(self.rest) 

285 return accumulate 

286 

287 def formatted(self, parameters: ParametersIR) -> ConfigIR: 

288 """Return a new ConfigIR object that is formatted according to the 

289 specified parameters 

290 

291 Parameters 

292 ---------- 

293 parameters : `ParametersIR` 

294 Object that contains variable mappings used in substitution. 

295 

296 Returns 

297 ------- 

298 config : `ConfigIR` 

299 A new ConfigIR object formatted with the input parameters 

300 """ 

301 new_config = copy.deepcopy(self) 

302 for key, value in new_config.rest.items(): 

303 if not isinstance(value, str): 

304 continue 

305 match = re.match("parameters[.](.*)", value) 

306 if match and match.group(1) in parameters: 

307 new_config.rest[key] = parameters[match.group(1)] 

308 if match and match.group(1) not in parameters: 

309 warnings.warn( 

310 f"config {key} contains value {match.group(0)} which is formatted like a " 

311 "Pipeline parameter but was not found within the Pipeline, if this was not " 

312 "intentional, check for a typo" 

313 ) 

314 return new_config 

315 

316 def maybe_merge(self, other_config: "ConfigIR") -> Generator["ConfigIR", None, None]: 

317 """Merge another instance of a `ConfigIR` into this instance if 

318 possible. This function returns a generator that is either self 

319 if the configs were merged, or self, and other_config if that could 

320 not be merged. 

321 

322 Parameters 

323 ---------- 

324 other_config : `ConfigIR` 

325 An instance of `ConfigIR` to merge into this instance. 

326 

327 Returns 

328 ------- 

329 Generator : `ConfigIR` 

330 A generator containing either self, or self and other_config if 

331 the configs could be merged or not respectively. 
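
        Examples
        --------
        A minimal sketch; two simple overrides with no python or file blocks
        collapse into one (hypothetical fields):

        .. code-block:: python

            a = ConfigIR(rest={"field1": "1"})
            b = ConfigIR(rest={"field2": "2"})
            merged = list(a.maybe_merge(b))
            # [ConfigIR(..., rest={'field1': '1', 'field2': '2'})]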

332 """ 

333 # Verify that the config blocks can be merged 

334 if ( 

335 self.dataId != other_config.dataId 

336 or self.python 

337 or other_config.python 

338 or self.file 

339 or other_config.file 

340 ): 

341 yield from (self, other_config) 

342 return 

343 

344 # create a set of all keys, and verify two keys do not have different 

345 # values 

346 key_union = self.rest.keys() & other_config.rest.keys() 

347 for key in key_union: 

348 if self.rest[key] != other_config.rest[key]: 

349 yield from (self, other_config) 

350 return 

351 self.rest.update(other_config.rest) 

352 

353 # Combine the lists of override files to load 

354 self_file_set = set(self.file) 

355 other_file_set = set(other_config.file) 

356 self.file = list(self_file_set.union(other_file_set)) 

357 

358 yield self 

359 

360 def __eq__(self, other: object) -> bool: 

361 if not isinstance(other, ConfigIR): 

362 return False 

363 elif all( 

364 getattr(self, attr) == getattr(other, attr) for attr in ("python", "dataId", "file", "rest") 

365 ): 

366 return True 

367 else: 

368 return False 

369 

370 

371@dataclass 

372class TaskIR: 

373 """Intermediate representation of tasks read from a pipeline yaml file.""" 

    """

    label: str
    """An identifier used to refer to a task.
    """
    klass: str
    """A string containing a fully qualified python class to be run in a
    pipeline.
    """
    config: list[ConfigIR] | None = None
    """A list of all config overrides associated with this task. May be
    `None` if there are no config overrides.
    """

    def to_primitives(self) -> dict[str, str | list[dict]]:
        """Convert to a representation used in yaml serialization."""
        accumulate: dict[str, str | list[dict]] = {"class": self.klass}
        if self.config:
            accumulate["config"] = [c.to_primitives() for c in self.config]
        return accumulate

    def add_or_update_config(self, other_config: ConfigIR) -> None:
        """Add a `ConfigIR` to this task if one is not present. Merges
        configs if there is a `ConfigIR` present and the dataId keys of both
        configs match, otherwise adds a new entry to the config list. The
        exception to the above is that if either the last config or
        other_config has a python block, then other_config is always added,
        as python blocks can modify configs in ways that cannot be predicted.

        Parameters
        ----------
        other_config : `ConfigIR`
            A `ConfigIR` instance to add or merge into the config attribute
            of this task.
        """
        if not self.config:
            self.config = [other_config]
            return
        self.config.extend(self.config.pop().maybe_merge(other_config))

    def __eq__(self, other: object) -> bool:
        if not isinstance(other, TaskIR):
            return False
        elif all(getattr(self, attr) == getattr(other, attr) for attr in ("label", "klass", "config")):
            return True
        else:
            return False


@dataclass
class ImportIR:
    """An intermediate representation of imported pipelines.

    """

    location: str
    """This is the location of the pipeline to inherit. The path should be
    specified as an absolute path. Environment variables may be used in the
    path and should be specified as a python string template, with the name
    of the environment variable inside braces.
    """
    include: list[str] | None = None
    """List of tasks that should be included when inheriting this pipeline.
    Either the include or exclude attributes may be specified, but not both.
    """
    exclude: list[str] | None = None
    """List of tasks that should be excluded when inheriting this pipeline.
    Either the include or exclude attributes may be specified, but not both.
    """
    importContracts: bool = True
    """Boolean attribute that dictates whether contracts should be inherited
    with the pipeline or not.
    """
    instrument: Literal[_Tags.KeepInstrument] | str | None = _Tags.KeepInstrument
    """Instrument to assign to the Pipeline at import. The default value of
    `_Tags.KeepInstrument` indicates that whatever instrument the pipeline
    is declared with will not be modified. Setting this value to `None` will
    drop any declared instrument prior to import.
    """

    def toPipelineIR(self) -> "PipelineIR":
        """Load in the Pipeline specified by this object, and turn it into a
        PipelineIR instance.

        Returns
        -------
        pipeline : `PipelineIR`
            A pipeline generated from the imported pipeline file.
        """
        if self.include and self.exclude:
            raise ValueError(
                "An include list and an exclude list cannot both be specified"
                " when declaring a pipeline import."
            )
        tmp_pipeline = PipelineIR.from_uri(os.path.expandvars(self.location))
        if self.instrument is not _Tags.KeepInstrument:
            tmp_pipeline.instrument = self.instrument

        included_labels = set()
        for label in tmp_pipeline.tasks:
            if (
                (self.include and label in self.include)
                or (self.exclude and label not in self.exclude)
                or (self.include is None and self.exclude is None)
            ):
                included_labels.add(label)

        # Handle labeled subsets being specified in the include or exclude
        # list, adding or removing labels.
        if self.include is not None:
            subsets_in_include = tmp_pipeline.labeled_subsets.keys() & self.include
            for label in subsets_in_include:
                included_labels.update(tmp_pipeline.labeled_subsets[label].subset)

        elif self.exclude is not None:
            subsets_in_exclude = tmp_pipeline.labeled_subsets.keys() & self.exclude
            for label in subsets_in_exclude:
                included_labels.difference_update(tmp_pipeline.labeled_subsets[label].subset)

        tmp_pipeline = tmp_pipeline.subset_from_labels(included_labels)

        if not self.importContracts:
            tmp_pipeline.contracts = []

        return tmp_pipeline

    def __eq__(self, other: object) -> bool:
        if not isinstance(other, ImportIR):
            return False
        elif all(
            getattr(self, attr) == getattr(other, attr)
            for attr in ("location", "include", "exclude", "importContracts")
        ):
            return True
        else:
            return False


class PipelineIR:
    """Intermediate representation of a pipeline definition.

    Parameters
    ----------
    loaded_yaml : `dict`
        A dictionary which matches the structure that would be produced by a
        yaml reader that parses a pipeline definition document.

    Raises
    ------
    ValueError
        Raised if:

        - a pipeline is declared without a description;
        - no tasks are declared in a pipeline, and no pipelines are to be
          inherited;
        - more than one instrument is specified;
        - more than one inherited pipeline shares a label.
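
    Examples
    --------
    A minimal pipeline document (the task label and class here are
    hypothetical):

    .. code-block:: yaml

        description: An example pipeline
        tasks:
            isr: lsst.ip.isr.IsrTask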

528 """ 

529 

530 def __init__(self, loaded_yaml: dict[str, Any]): 

531 # Check required fields are present 

532 if "description" not in loaded_yaml: 

533 raise ValueError("A pipeline must be declared with a description") 

534 if "tasks" not in loaded_yaml and len({"imports", "inherits"} - loaded_yaml.keys()) == 2: 

535 raise ValueError("A pipeline must be declared with one or more tasks") 

536 

537 # These steps below must happen in this call order 

538 

539 # Process pipeline description 

540 self.description = loaded_yaml.pop("description") 

541 

542 # Process tasks 

543 self._read_tasks(loaded_yaml) 

544 

545 # Process instrument keys 

546 inst = loaded_yaml.pop("instrument", None) 

547 if isinstance(inst, list): 

548 raise ValueError("Only one top level instrument can be defined in a pipeline") 

549 self.instrument: str | None = inst 

550 

551 # Process any contracts 

552 self._read_contracts(loaded_yaml) 

553 

554 # Process any defined parameters 

555 self._read_parameters(loaded_yaml) 

556 

557 # Process any named label subsets 

558 self._read_labeled_subsets(loaded_yaml) 

559 

560 # Process any inherited pipelines 

561 self._read_imports(loaded_yaml) 

562 

563 # verify named subsets, must be done after inheriting 

564 self._verify_labeled_subsets() 


    def _read_contracts(self, loaded_yaml: dict[str, Any]) -> None:
        """Process the contracts portion of the loaded yaml document.

        Parameters
        ----------
        loaded_yaml : `dict`
            A dictionary which matches the structure that would be produced
            by a yaml reader that parses a pipeline definition document.
        """
        loaded_contracts = loaded_yaml.pop("contracts", [])
        if isinstance(loaded_contracts, str):
            loaded_contracts = [loaded_contracts]
        self.contracts: list[ContractIR] = []
        for contract in loaded_contracts:
            if isinstance(contract, dict):
                self.contracts.append(ContractIR(**contract))
            if isinstance(contract, str):
                self.contracts.append(ContractIR(contract=contract))

    def _read_parameters(self, loaded_yaml: dict[str, Any]) -> None:
        """Process the parameters portion of the loaded yaml document.

        Parameters
        ----------
        loaded_yaml : `dict`
            A dictionary which matches the structure that would be produced
            by a yaml reader that parses a pipeline definition document.
        """
        loaded_parameters = loaded_yaml.pop("parameters", {})
        if not isinstance(loaded_parameters, dict):
            raise ValueError("The parameters section must be a yaml mapping")
        self.parameters = ParametersIR(loaded_parameters)

    def _read_labeled_subsets(self, loaded_yaml: dict[str, Any]) -> None:
        """Process the subsets portion of the loaded yaml document.

        Parameters
        ----------
        loaded_yaml : `dict`
            A dictionary which matches the structure that would be produced
            by a yaml reader that parses a pipeline definition document.
        """
        loaded_subsets = loaded_yaml.pop("subsets", {})
        self.labeled_subsets: dict[str, LabeledSubset] = {}
        if not loaded_subsets and "subset" in loaded_yaml:
            raise ValueError("Top level key should be subsets and not subset, add an s")
        for key, value in loaded_subsets.items():
            self.labeled_subsets[key] = LabeledSubset.from_primitives(key, value)

    def _verify_labeled_subsets(self) -> None:
        """Verify that all the labels in each named subset exist within the
        pipeline.
        """
        # Verify that all labels defined in a labeled subset are in the
        # Pipeline.
        for labeled_subset in self.labeled_subsets.values():
            if not labeled_subset.subset.issubset(self.tasks.keys()):
                raise ValueError(
                    f"Labels {labeled_subset.subset - self.tasks.keys()} were not found in the "
                    "declared pipeline"
                )
        # Verify that subset labels are not already task labels.
        label_intersection = self.labeled_subsets.keys() & self.tasks.keys()
        if label_intersection:
            raise ValueError(f"Labeled subsets cannot use the same label as a task: {label_intersection}")

    def _read_imports(self, loaded_yaml: dict[str, Any]) -> None:
        """Process the imports portion of the loaded yaml document.

        Parameters
        ----------
        loaded_yaml : `dict`
            A dictionary which matches the structure that would be produced
            by a yaml reader that parses a pipeline definition document.
        """

        def process_args(argument: str | dict) -> dict:
            if isinstance(argument, str):
                return {"location": argument}
            elif isinstance(argument, dict):
                if "exclude" in argument and isinstance(argument["exclude"], str):
                    argument["exclude"] = [argument["exclude"]]
                if "include" in argument and isinstance(argument["include"], str):
                    argument["include"] = [argument["include"]]
                if "instrument" in argument and argument["instrument"] == "None":
                    argument["instrument"] = None
                return argument

        if not {"inherits", "imports"} - loaded_yaml.keys():
            raise ValueError("Cannot define both inherits and imports sections, use imports")
        tmp_import = loaded_yaml.pop("inherits", None)
        if tmp_import is None:
            tmp_import = loaded_yaml.pop("imports", None)
        else:
            raise ValueError("The 'inherits' key is not supported. Please use the key 'imports' instead")
        if tmp_import is None:
            self.imports: list[ImportIR] = []
        elif isinstance(tmp_import, list):
            self.imports = [ImportIR(**process_args(args)) for args in tmp_import]
        else:
            self.imports = [ImportIR(**process_args(tmp_import))]

        self.merge_pipelines([fragment.toPipelineIR() for fragment in self.imports])

    def merge_pipelines(self, pipelines: Iterable[PipelineIR]) -> None:
        """Merge one or more other `PipelineIR` objects into this object.

        Parameters
        ----------
        pipelines : `~collections.abc.Iterable` of `PipelineIR` objects
            An `~collections.abc.Iterable` that contains one or more
            `PipelineIR` objects to merge into this object.

        Raises
        ------
        ValueError
            Raised if there is a conflict in instrument specifications.
            Raised if a task label appears in more than one of the input
            `PipelineIR` objects which are to be merged.
            Raised if a labeled subset appears in more than one of the input
            `PipelineIR` objects which are to be merged, or clashes with a
            subset existing in this object.
        """
        # Integrate any imported pipelines.
        accumulate_tasks: dict[str, TaskIR] = {}
        accumulate_labeled_subsets: dict[str, LabeledSubset] = {}
        accumulated_parameters = ParametersIR({})

        for tmp_IR in pipelines:
            if self.instrument is None:
                self.instrument = tmp_IR.instrument
            elif self.instrument != tmp_IR.instrument and tmp_IR.instrument is not None:
                msg = (
                    "Only one instrument can be declared in a pipeline or its imports. "
                    f"Top level pipeline defines {self.instrument} but pipeline to merge "
                    f"defines {tmp_IR.instrument}."
                )
                raise ValueError(msg)
            if duplicate_labels := accumulate_tasks.keys() & tmp_IR.tasks.keys():
                msg = (
                    "Task labels in the imported pipelines must be unique. "
                    f"These labels appear multiple times: {duplicate_labels}"
                )
                raise ValueError(msg)
            accumulate_tasks.update(tmp_IR.tasks)
            self.contracts.extend(tmp_IR.contracts)
            # Verify that tmp_IR's labels for named subsets are unique among
            # the existing labeled subsets, and among the existing task
            # labels.
            overlapping_subsets = accumulate_labeled_subsets.keys() & tmp_IR.labeled_subsets.keys()
            task_subset_overlap = (
                accumulate_labeled_subsets.keys() | tmp_IR.labeled_subsets.keys()
            ) & accumulate_tasks.keys()
            if overlapping_subsets or task_subset_overlap:
                raise ValueError(
                    "Labeled subset names must be unique amongst imports in both labels and "
                    f"named subsets. Duplicate: {overlapping_subsets | task_subset_overlap}"
                )
            accumulate_labeled_subsets.update(tmp_IR.labeled_subsets)
            accumulated_parameters.update(tmp_IR.parameters)

        # Verify that the accumulated labeled subsets don't clash with a
        # label from this pipeline.
        if accumulate_labeled_subsets.keys() & self.tasks.keys():
            raise ValueError(
                "Labeled subset names must be unique amongst imports in both labels and named subsets"
            )
        # Merge in the named subsets from self so that this document can
        # override any that have been declared.
        accumulate_labeled_subsets.update(self.labeled_subsets)
        self.labeled_subsets = accumulate_labeled_subsets

        # Merge the dict of label: TaskIR objects, preserving any configs in
        # the imported pipeline if the labels point to the same class.
        for label, task in self.tasks.items():
            if label not in accumulate_tasks:
                accumulate_tasks[label] = task
            elif accumulate_tasks[label].klass == task.klass:
                if task.config is not None:
                    for config in task.config:
                        accumulate_tasks[label].add_or_update_config(config)
            else:
                accumulate_tasks[label] = task
        self.tasks: dict[str, TaskIR] = accumulate_tasks
        accumulated_parameters.update(self.parameters)
        self.parameters = accumulated_parameters

    def _read_tasks(self, loaded_yaml: dict[str, Any]) -> None:
        """Process the tasks portion of the loaded yaml document.

        Parameters
        ----------
        loaded_yaml : `dict`
            A dictionary which matches the structure that would be produced
            by a yaml reader that parses a pipeline definition document.
        """
        self.tasks = {}
        tmp_tasks = loaded_yaml.pop("tasks", None)
        if tmp_tasks is None:
            tmp_tasks = {}

        if "parameters" in tmp_tasks:
            raise ValueError("parameters is a reserved word and cannot be used as a task label")

        for label, definition in tmp_tasks.items():
            if isinstance(definition, str):
                definition = {"class": definition}
            config = definition.get("config", None)
            if config is None:
                task_config_ir = None
            else:
                if isinstance(config, dict):
                    config = [config]
                task_config_ir = []
                for c in config:
                    file = c.pop("file", None)
                    if file is None:
                        file = []
                    elif not isinstance(file, list):
                        file = [file]
                    task_config_ir.append(
                        ConfigIR(
                            python=c.pop("python", None), dataId=c.pop("dataId", None), file=file, rest=c
                        )
                    )
            self.tasks[label] = TaskIR(label, definition["class"], task_config_ir)

    def _remove_contracts(self, label: str) -> None:
        """Remove any contracts that contain the given label.

        String comparison used in this way is not the most elegant and may
        have issues, but it is the only feasible way when users can specify
        contracts with generic strings.
        """
        new_contracts = []
        for contract in self.contracts:
            # Match a label that is not preceded by an ASCII identifier
            # character (or is at the start of the string) and is followed
            # by a dot.
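            # For example, for the (hypothetical) label "isr" this matches
            # "isr.doWrite == True" and "x == isr.doWrite", but not
            # "misr.doWrite".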

            if re.match(f".*([^A-Za-z0-9_]|^){label}[.]", contract.contract):
                continue
            new_contracts.append(contract)
        self.contracts = new_contracts

    def subset_from_labels(self, labelSpecifier: set[str]) -> PipelineIR:
        """Subset a pipelineIR to contain only the labels specified in
        labelSpecifier.

        Parameters
        ----------
        labelSpecifier : `set` of `str`
            Set containing labels that describe how to subset a pipeline.

        Returns
        -------
        pipeline : `PipelineIR`
            A new pipelineIR object that is a subset of the old pipelineIR.

        Raises
        ------
        ValueError
            Raised if there is an issue with the specified labels.

        Notes
        -----
        This method attempts to prune any contracts that contain labels
        which are not in the declared subset of labels. This pruning is done
        using string-based matching due to the nature of contracts, and may
        prune more than it should. Any labeled subsets defined that no
        longer have all members of the subset present in the pipeline will
        be removed from the resulting pipeline.
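
        Examples
        --------
        A minimal sketch (the labels and module path are hypothetical):

        .. code-block:: python

            document = "{description: Example, tasks: {taskA: mod.TaskA, taskB: mod.TaskB}}"
            full = PipelineIR.from_string(document)
            sub = full.subset_from_labels({"taskA"})
            list(sub.tasks)
            # ['taskA']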

835 """ 

836 pipeline = copy.deepcopy(self) 

837 

838 # update the label specifier to expand any named subsets 

839 toRemove = set() 

840 toAdd = set() 

841 for label in labelSpecifier: 

842 if label in pipeline.labeled_subsets: 

843 toRemove.add(label) 

844 toAdd.update(pipeline.labeled_subsets[label].subset) 

845 labelSpecifier.difference_update(toRemove) 

846 labelSpecifier.update(toAdd) 

847 # verify all the labels are in the pipeline 

848 if not labelSpecifier.issubset(pipeline.tasks.keys() | pipeline.labeled_subsets): 

849 difference = labelSpecifier.difference(pipeline.tasks.keys()) 

850 raise ValueError( 

851 "Not all supplied labels (specified or named subsets) are in the pipeline " 

852 f"definition, extra labels: {difference}" 

853 ) 

854 # copy needed so as to not modify while iterating 

855 pipeline_labels = set(pipeline.tasks.keys()) 

856 # Remove the labels from the pipelineIR, and any contracts that contain 

857 # those labels (see docstring on _remove_contracts for why this may 

858 # cause issues) 

859 for label in pipeline_labels: 

860 if label not in labelSpecifier: 

861 pipeline.tasks.pop(label) 

862 pipeline._remove_contracts(label) 

863 

864 # create a copy of the object to iterate over 

865 labeled_subsets = copy.copy(pipeline.labeled_subsets) 

866 # remove any labeled subsets that no longer have a complete set 

867 for label, labeled_subset in labeled_subsets.items(): 

868 if labeled_subset.subset - pipeline.tasks.keys(): 

869 pipeline.labeled_subsets.pop(label) 

870 

871 return pipeline 


    @classmethod
    def from_string(cls, pipeline_string: str) -> PipelineIR:
        """Create a `PipelineIR` object from a string formatted like a
        pipeline document.

        Parameters
        ----------
        pipeline_string : `str`
            A string that is formatted like a pipeline document.

        Returns
        -------
        pipelineIR : `PipelineIR`
            The loaded pipeline.
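
        Examples
        --------
        A minimal sketch (the task label and class are hypothetical):

        .. code-block:: python

            document = "{description: An example pipeline, tasks: {isr: lsst.ip.isr.IsrTask}}"
            pipeline = PipelineIR.from_string(document)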

882 """ 

883 loaded_yaml = yaml.load(pipeline_string, Loader=PipelineYamlLoader) 

884 return cls(loaded_yaml) 

885 

886 @classmethod 

887 def from_uri(cls, uri: ResourcePathExpression) -> PipelineIR: 

888 """Create a `PipelineIR` object from the document specified by the 

889 input uri. 

890 

891 Parameters 

892 ---------- 

893 uri: convertible to `~lsst.resources.ResourcePath` 

894 Location of document to use in creating a `PipelineIR` object. 

895 

896 Returns 

897 ------- 

898 pipelineIR : `PipelineIR` 

899 The loaded pipeline 

900 """ 

901 loaded_uri = ResourcePath(uri) 

902 with loaded_uri.open("r") as buffer: 

903 loaded_yaml = yaml.load(buffer, Loader=PipelineYamlLoader) 

904 return cls(loaded_yaml) 

905 

906 def write_to_uri(self, uri: ResourcePathExpression) -> None: 

907 """Serialize this `PipelineIR` object into a yaml formatted string and 

908 write the output to a file at the specified uri. 

909 

910 Parameters 

911 ---------- 

912 uri: convertible to `~lsst.resources.ResourcePath` 

913 Location of document to write a `PipelineIR` object. 

914 """ 

915 with ResourcePath(uri).open("w") as buffer: 

916 yaml.dump(self.to_primitives(), buffer, sort_keys=False, Dumper=MultilineStringDumper) 

917 

918 def to_primitives(self) -> dict[str, Any]: 

919 """Convert to a representation used in yaml serialization 

920 

921 Returns 

922 ------- 

923 primitives : `dict` 

924 dictionary that maps directly to the serialized YAML form. 

925 """ 

926 accumulate = {"description": self.description} 

927 if self.instrument is not None: 

928 accumulate["instrument"] = self.instrument 

929 if self.parameters: 

930 accumulate["parameters"] = self.parameters.to_primitives() 

931 accumulate["tasks"] = {m: t.to_primitives() for m, t in self.tasks.items()} 

932 if len(self.contracts) > 0: 

933 # sort contracts lexicographical order by the contract string in 

934 # absence of any other ordering principle 

935 contracts_list = [c.to_primitives() for c in self.contracts] 

936 contracts_list.sort(key=lambda x: x["contract"]) 

937 accumulate["contracts"] = contracts_list 

938 if self.labeled_subsets: 

939 accumulate["subsets"] = {k: v.to_primitives() for k, v in self.labeled_subsets.items()} 

940 return accumulate 

941 

942 def __str__(self) -> str: 

943 """Instance formatting as how it would look in yaml representation""" 

944 return yaml.dump(self.to_primitives(), sort_keys=False, Dumper=MultilineStringDumper) 

945 

946 def __repr__(self) -> str: 

947 """Instance formatting as how it would look in yaml representation""" 

948 return str(self) 

949 

950 def __eq__(self, other: object) -> bool: 

951 if not isinstance(other, PipelineIR): 

952 return False 

953 # special case contracts because it is a list, but order is not 

954 # important 

955 elif ( 

956 all( 

957 getattr(self, attr) == getattr(other, attr) 

958 for attr in ("tasks", "instrument", "labeled_subsets", "parameters") 

959 ) 

960 and len(self.contracts) == len(other.contracts) 

961 and all(c in self.contracts for c in other.contracts) 

962 ): 

963 return True 

964 else: 

965 return False