Coverage for python/lsst/pipe/base/pipelineIR.py: 19%


# This file is part of pipe_base.
#
# Developed for the LSST Data Management System.
# This product includes software developed by the LSST Project
# (http://www.lsst.org).
# See the COPYRIGHT file at the top-level directory of this distribution
# for details of code ownership.
#
# This program is free software: you can redistribute it and/or modify
# it under the terms of the GNU General Public License as published by
# the Free Software Foundation, either version 3 of the License, or
# (at your option) any later version.
#
# This program is distributed in the hope that it will be useful,
# but WITHOUT ANY WARRANTY; without even the implied warranty of
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
# GNU General Public License for more details.
#
# You should have received a copy of the GNU General Public License
# along with this program. If not, see <http://www.gnu.org/licenses/>.
from __future__ import annotations

__all__ = ("ConfigIR", "ContractError", "ContractIR", "ImportIR", "PipelineIR", "TaskIR", "LabeledSubset")

import copy
import enum
import os
import re
import warnings
from collections import Counter
from collections.abc import Generator, Hashable, Iterable, MutableMapping
from dataclasses import dataclass, field
from typing import Any, Literal

import yaml
from lsst.resources import ResourcePath, ResourcePathExpression


class _Tags(enum.Enum):
    KeepInstrument = enum.auto()


class PipelineYamlLoader(yaml.SafeLoader):
    """Specialized version of yaml's SafeLoader.

    It checks for, and raises an exception on, multiple instances of the
    same key at a given scope in a pipeline file.
    """

    def construct_mapping(self, node: yaml.MappingNode, deep: bool = False) -> dict[Hashable, Any]:
        # Do the call to super first so that it can do all the other forms of
        # checking on this node. Checking the uniqueness of keys first would
        # save the work that super does in the case of a failure, but the
        # node might be the incorrect node due to a parsing error, and the
        # resulting exception would be difficult to understand.
        mapping = super().construct_mapping(node, deep)
        # Check if there are any duplicate keys
        all_keys = Counter(key_node.value for key_node, _ in node.value)
        duplicates = {k for k, i in all_keys.items() if i != 1}
        if duplicates:
            raise KeyError(
                f"Pipeline files must not have duplicated keys, {duplicates} appeared multiple times"
            )
        return mapping

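# Illustrative sketch (not part of the module): loading a document with
# PipelineYamlLoader rejects duplicate keys at the same scope, where a plain
# yaml.safe_load would silently keep the last value.
#
#     import yaml
#     doc = """
#     tasks:
#       taskA: mod.TaskA
#       taskA: mod.TaskB
#     """
#     yaml.load(doc, Loader=PipelineYamlLoader)  # raises KeyError
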

class MultilineStringDumper(yaml.Dumper):
    """Custom YAML dumper that makes multi-line strings use the '|'
    continuation style instead of unreadable newlines and tons of quotes.

    Basic approach is taken from
    https://stackoverflow.com/questions/8640959/how-can-i-control-what-scalar-form-pyyaml-uses-for-my-data,
    but is written as a Dumper subclass to make its effects non-global (vs
    `yaml.add_representer`).
    """

    def represent_scalar(self, tag: str, value: Any, style: str | None = None) -> yaml.ScalarNode:
        if style is None and tag == "tag:yaml.org,2002:str" and len(value.splitlines()) > 1:
            style = "|"
        return super().represent_scalar(tag, value, style)

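# Illustrative sketch (not part of the module): with MultilineStringDumper,
# multi-line strings serialize in '|' block style instead of quoted strings
# full of "\n" escapes.
#
#     import yaml
#     print(yaml.dump({"python": "config.a = 1\nconfig.b = 2\n"},
#                     Dumper=MultilineStringDumper))
#     # python: |
#     #   config.a = 1
#     #   config.b = 2
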

class ContractError(Exception):
    """An exception that is raised when a pipeline contract is not satisfied"""

    pass


@dataclass
class ContractIR:
    """Intermediate representation of configuration contracts read from a
    pipeline yaml file.
    """

    contract: str
    """A string of python code representing one or more conditions on configs
    in a pipeline. This code-as-string should, once evaluated, be True if the
    configs are fine, and False otherwise.
    """
    msg: str | None = None
    """An optional message to be shown to the user if a contract fails
    """

    def to_primitives(self) -> dict[str, str]:
        """Convert to a representation used in yaml serialization"""
        accumulate = {"contract": self.contract}
        if self.msg is not None:
            accumulate["msg"] = self.msg
        return accumulate

    def __eq__(self, other: object) -> bool:
        if not isinstance(other, ContractIR):
            return False
        elif self.contract == other.contract and self.msg == other.msg:
            return True
        else:
            return False

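# Illustrative sketch (not part of the module): a contract as it might appear
# in a pipeline yaml document and its intermediate representation. The task
# labels and config fields here are hypothetical.
#
#     contracts:
#       - contract: "taskA.field1 == taskB.field2"
#         msg: "taskA and taskB must agree on this setting"
#
#     ContractIR(contract="taskA.field1 == taskB.field2",
#                msg="taskA and taskB must agree on this setting")
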

@dataclass
class LabeledSubset:
    """Intermediate representation of a named subset of task labels read from
    a pipeline yaml file.
    """

    label: str
    """The label used to identify the subset of task labels.
    """
    subset: set[str]
    """A set of task labels contained in this subset.
    """
    description: str | None
    """A description of what this subset of tasks is intended to do
    """

    @staticmethod
    def from_primitives(label: str, value: list[str] | dict) -> LabeledSubset:
        """Generate `LabeledSubset` objects given a properly formatted object
        that has been created by a yaml loader.

        Parameters
        ----------
        label : `str`
            The label that will be used to identify this labeled subset.
        value : `list` of `str` or `dict`
            Object returned from loading a labeled subset section from a yaml
            document.

        Returns
        -------
        labeledSubset : `LabeledSubset`
            A `LabeledSubset` object built from the inputs.

        Raises
        ------
        ValueError
            Raised if the value input is not properly formatted for parsing
        """
        if isinstance(value, MutableMapping):
            subset = value.pop("subset", None)
            if subset is None:
                raise ValueError(
                    "If a labeled subset is specified as a mapping, it must contain the key 'subset'"
                )
            description = value.pop("description", None)
        elif isinstance(value, Iterable):
            subset = value
            description = None
        else:
            raise ValueError(
                f"There was a problem parsing the labeled subset {label}; make sure the "
                "definition is either a valid yaml list, or a mapping with keys "
                "(subset, description) where subset points to a yaml list and description "
                "is associated with a string"
            )
        return LabeledSubset(label, set(subset), description)

    def to_primitives(self) -> dict[str, list[str] | str]:
        """Convert to a representation used in yaml serialization."""
        accumulate: dict[str, list[str] | str] = {"subset": list(self.subset)}
        if self.description is not None:
            accumulate["description"] = self.description
        return accumulate

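# Illustrative sketch (not part of the module): from_primitives accepts either
# a plain yaml list or a mapping with 'subset' (and optional 'description').
# The labels are hypothetical.
#
#     LabeledSubset.from_primitives("step1", ["taskA", "taskB"])
#     # LabeledSubset(label='step1', subset={'taskA', 'taskB'}, description=None)
#
#     LabeledSubset.from_primitives(
#         "step1", {"subset": ["taskA"], "description": "first processing step"}
#     )
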

@dataclass
class ParametersIR:
    """Intermediate representation of parameters that are global to a
    pipeline.

    These parameters are specified under a top level key named ``parameters``
    and are declared as a yaml mapping. These entries can then be used inside
    task configuration blocks to specify configuration values. They may not be
    used in the special ``file`` or ``python`` blocks.

    Examples
    --------
    .. code-block:: yaml

        parameters:
          shared_value: 14
        tasks:
          taskA:
            class: modA
            config:
              field1: parameters.shared_value
          taskB:
            class: modB
            config:
              field2: parameters.shared_value
    """

    mapping: MutableMapping[str, str]
    """A mutable mapping of identifiers as keys, and shared configuration
    as values.
    """

    def update(self, other: ParametersIR | None) -> None:
        if other is not None:
            self.mapping.update(other.mapping)

    def to_primitives(self) -> MutableMapping[str, str]:
        """Convert to a representation used in yaml serialization"""
        return self.mapping

    def __contains__(self, value: str) -> bool:
        return value in self.mapping

    def __getitem__(self, item: str) -> Any:
        return self.mapping[item]

    def __bool__(self) -> bool:
        return bool(self.mapping)


@dataclass
class ConfigIR:
    """Intermediate representation of configurations read from a pipeline yaml
    file.
    """

    python: str | None = None
    """A string of python code that is used to modify a configuration. This
    can also be None if there are no modifications to do.
    """
    dataId: dict | None = None
    """A dataId that is used to constrain these config overrides to only
    quanta with matching dataIds. This field can be None if there is no
    constraint. This is currently an unimplemented feature, and is placed here
    for future use.
    """
    file: list[str] = field(default_factory=list)
    """A list of paths which point to files containing config overrides to be
    applied. This value may be an empty list if there are no overrides to
    apply.
    """
    rest: dict = field(default_factory=dict)
    """This is a dictionary of key value pairs, where the keys are strings
    corresponding to qualified fields on a config to override, and the values
    are strings representing the values to apply.
    """

    def to_primitives(self) -> dict[str, str | dict | list[str]]:
        """Convert to a representation used in yaml serialization"""
        accumulate = {}
        for name in ("python", "dataId", "file"):
            # If this attribute is truthy add it to the accumulation
            # dictionary
            if getattr(self, name):
                accumulate[name] = getattr(self, name)
        # Add the dictionary containing the rest of the config keys to the
        # accumulated dictionary
        accumulate.update(self.rest)
        return accumulate

    def formatted(self, parameters: ParametersIR) -> ConfigIR:
        """Return a new ConfigIR object that is formatted according to the
        specified parameters.

        Parameters
        ----------
        parameters : `ParametersIR`
            Object that contains variable mappings used in substitution.

        Returns
        -------
        config : `ConfigIR`
            A new ConfigIR object formatted with the input parameters.
        """
        new_config = copy.deepcopy(self)
        for key, value in new_config.rest.items():
            if not isinstance(value, str):
                continue
            match = re.match("parameters[.](.*)", value)
            if match and match.group(1) in parameters:
                new_config.rest[key] = parameters[match.group(1)]
            if match and match.group(1) not in parameters:
                warnings.warn(
                    f"config {key} contains value {match.group(0)} which is formatted like a "
                    "Pipeline parameter but was not found within the Pipeline; if this was not "
                    "intentional, check for a typo"
                )
        return new_config

    def maybe_merge(self, other_config: "ConfigIR") -> Generator["ConfigIR", None, None]:
        """Merge another instance of a `ConfigIR` into this instance if
        possible. This function returns a generator that yields just self
        if the configs were merged, or self and other_config if they could
        not be merged.

        Parameters
        ----------
        other_config : `ConfigIR`
            An instance of `ConfigIR` to merge into this instance.

        Returns
        -------
        Generator : `ConfigIR`
            A generator yielding either self, or self and other_config,
            depending on whether or not the configs could be merged.
        """
        # Verify that the config blocks can be merged
        if (
            self.dataId != other_config.dataId
            or self.python
            or other_config.python
            or self.file
            or other_config.file
        ):
            yield from (self, other_config)
            return

        # Find the keys common to both configs, and verify that none of them
        # have conflicting values
        key_union = self.rest.keys() & other_config.rest.keys()
        for key in key_union:
            if self.rest[key] != other_config.rest[key]:
                yield from (self, other_config)
                return
        self.rest.update(other_config.rest)

        # Combine the lists of override files to load
        self_file_set = set(self.file)
        other_file_set = set(other_config.file)
        self.file = list(self_file_set.union(other_file_set))

        yield self

    def __eq__(self, other: object) -> bool:
        if not isinstance(other, ConfigIR):
            return False
        elif all(
            getattr(self, attr) == getattr(other, attr) for attr in ("python", "dataId", "file", "rest")
        ):
            return True
        else:
            return False

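# Illustrative sketch (not part of the module): parameter substitution with
# formatted(), and merging with maybe_merge(). Field names are hypothetical.
#
#     params = ParametersIR({"threshold": "5.0"})
#     config = ConfigIR(rest={"detection.threshold": "parameters.threshold"})
#     config.formatted(params).rest
#     # {'detection.threshold': '5.0'}
#
#     a = ConfigIR(rest={"fieldA": "1"})
#     b = ConfigIR(rest={"fieldB": "2"})
#     list(a.maybe_merge(b))  # one merged ConfigIR with both keys
#
#     c = ConfigIR(python="config.fieldA = 3")
#     list(a.maybe_merge(c))  # [a, c]: python blocks are never merged
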

@dataclass
class TaskIR:
    """Intermediate representation of tasks read from a pipeline yaml file."""

    label: str
    """An identifier used to refer to a task.
    """
    klass: str
    """A string containing a fully qualified python class to be run in a
    pipeline.
    """
    config: list[ConfigIR] | None = None
    """List of all config overrides associated with this task; may be
    `None` if there are no config overrides.
    """

    def to_primitives(self) -> dict[str, str | list[dict]]:
        """Convert to a representation used in yaml serialization"""
        accumulate: dict[str, str | list[dict]] = {"class": self.klass}
        if self.config:
            accumulate["config"] = [c.to_primitives() for c in self.config]
        return accumulate

    def add_or_update_config(self, other_config: ConfigIR) -> None:
        """Add a `ConfigIR` to this task if one is not present. Merges configs
        if there is a `ConfigIR` present and the dataId keys of both configs
        match, otherwise adds a new entry to the config list. The exception to
        the above is that if either the last config or other_config has a
        python block, then other_config is always added, as python blocks can
        modify configs in ways that cannot be predicted.

        Parameters
        ----------
        other_config : `ConfigIR`
            A `ConfigIR` instance to add or merge into the config attribute of
            this task.
        """
        if not self.config:
            self.config = [other_config]
            return
        self.config.extend(self.config.pop().maybe_merge(other_config))

    def __eq__(self, other: object) -> bool:
        if not isinstance(other, TaskIR):
            return False
        elif all(getattr(self, attr) == getattr(other, attr) for attr in ("label", "klass", "config")):
            return True
        else:
            return False

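# Illustrative sketch (not part of the module): add_or_update_config merges
# into the last config block when possible, otherwise appends. Names are
# hypothetical.
#
#     task = TaskIR("taskA", "mod.TaskA", [ConfigIR(rest={"fieldA": "1"})])
#     task.add_or_update_config(ConfigIR(rest={"fieldB": "2"}))
#     len(task.config)  # 1: merged into the existing block
#
#     task.add_or_update_config(ConfigIR(python="config.fieldA = 3"))
#     len(task.config)  # 2: python blocks always start a new entry
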

@dataclass
class ImportIR:
    """An intermediate representation of imported pipelines"""

    location: str
    """This is the location of the pipeline to inherit. The path should be
    specified as an absolute path. Environment variables may be used in the
    path and should be specified as a python string template, with the name of
    the environment variable inside braces.
    """
    include: list[str] | None = None
    """List of tasks that should be included when inheriting this pipeline.
    Either the include or exclude attributes may be specified, but not both.
    """
    exclude: list[str] | None = None
    """List of tasks that should be excluded when inheriting this pipeline.
    Either the include or exclude attributes may be specified, but not both.
    """
    importContracts: bool = True
    """Boolean attribute to dictate if contracts should be inherited with the
    pipeline or not.
    """
    instrument: Literal[_Tags.KeepInstrument] | str | None = _Tags.KeepInstrument
    """Instrument to assign to the Pipeline at import. The default value of
    `_Tags.KeepInstrument` indicates that whatever instrument the pipeline is
    declared with will not be modified. Setting this value to None will drop
    any declared instrument prior to import.
    """

    def toPipelineIR(self) -> "PipelineIR":
        """Load in the Pipeline specified by this object, and turn it into a
        PipelineIR instance.

        Returns
        -------
        pipeline : `PipelineIR`
            A pipeline generated from the imported pipeline file
        """
        if self.include and self.exclude:
            raise ValueError(
                "An include list and an exclude list cannot both be specified"
                " when declaring a pipeline import."
            )
        tmp_pipeline = PipelineIR.from_uri(os.path.expandvars(self.location))
        if self.instrument is not _Tags.KeepInstrument:
            tmp_pipeline.instrument = self.instrument

        included_labels = set()
        for label in tmp_pipeline.tasks:
            if (
                (self.include and label in self.include)
                or (self.exclude and label not in self.exclude)
                or (self.include is None and self.exclude is None)
            ):
                included_labels.add(label)

        # Handle labeled subsets being specified in the include or exclude
        # list, adding or removing labels.
        if self.include is not None:
            subsets_in_include = tmp_pipeline.labeled_subsets.keys() & self.include
            for label in subsets_in_include:
                included_labels.update(tmp_pipeline.labeled_subsets[label].subset)

        elif self.exclude is not None:
            subsets_in_exclude = tmp_pipeline.labeled_subsets.keys() & self.exclude
            for label in subsets_in_exclude:
                included_labels.difference_update(tmp_pipeline.labeled_subsets[label].subset)

        tmp_pipeline = tmp_pipeline.subset_from_labels(included_labels)

        if not self.importContracts:
            tmp_pipeline.contracts = []

        return tmp_pipeline

    def __eq__(self, other: object) -> bool:
        if not isinstance(other, ImportIR):
            return False
        elif all(
            getattr(self, attr) == getattr(other, attr)
            for attr in ("location", "include", "exclude", "importContracts")
        ):
            return True
        else:
            return False

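# Illustrative sketch (not part of the module): an imports section as it
# might appear in a pipeline yaml document. The location and labels are
# hypothetical; each entry maps onto an ImportIR instance.
#
#     imports:
#       - location: ${PIPE_TASKS_DIR}/pipelines/base.yaml
#         exclude:
#           - taskC
#         importContracts: false
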

class PipelineIR:
    """Intermediate representation of a pipeline definition

    Parameters
    ----------
    loaded_yaml : `dict`
        A dictionary which matches the structure that would be produced by a
        yaml reader which parses a pipeline definition document

    Raises
    ------
    ValueError
        Raised if:

        - a pipeline is declared without a description;
        - no tasks are declared in a pipeline, and no pipelines are to be
          inherited;
        - more than one instrument is specified;
        - more than one inherited pipeline shares a label.
    """

    def __init__(self, loaded_yaml: dict[str, Any]):
        # Check required fields are present
        if "description" not in loaded_yaml:
            raise ValueError("A pipeline must be declared with a description")
        if "tasks" not in loaded_yaml and len({"imports", "inherits"} - loaded_yaml.keys()) == 2:
            raise ValueError("A pipeline must be declared with one or more tasks")

        # These steps below must happen in this call order

        # Process pipeline description
        self.description = loaded_yaml.pop("description")

        # Process tasks
        self._read_tasks(loaded_yaml)

        # Process instrument keys
        inst = loaded_yaml.pop("instrument", None)
        if isinstance(inst, list):
            raise ValueError("Only one top level instrument can be defined in a pipeline")
        self.instrument: str | None = inst

        # Process any contracts
        self._read_contracts(loaded_yaml)

        # Process any defined parameters
        self._read_parameters(loaded_yaml)

        # Process any named label subsets
        self._read_labeled_subsets(loaded_yaml)

        # Process any inherited pipelines
        self._read_imports(loaded_yaml)

        # Verify named subsets; must be done after inheriting
        self._verify_labeled_subsets()


    def _read_contracts(self, loaded_yaml: dict[str, Any]) -> None:
        """Process the contracts portion of the loaded yaml document

        Parameters
        ----------
        loaded_yaml : `dict`
            A dictionary which matches the structure that would be produced
            by a yaml reader which parses a pipeline definition document
        """
        loaded_contracts = loaded_yaml.pop("contracts", [])
        if isinstance(loaded_contracts, str):
            loaded_contracts = [loaded_contracts]
        self.contracts: list[ContractIR] = []
        for contract in loaded_contracts:
            if isinstance(contract, dict):
                self.contracts.append(ContractIR(**contract))
            if isinstance(contract, str):
                self.contracts.append(ContractIR(contract=contract))

    def _read_parameters(self, loaded_yaml: dict[str, Any]) -> None:
        """Process the parameters portion of the loaded yaml document

        Parameters
        ----------
        loaded_yaml : `dict`
            A dictionary which matches the structure that would be produced
            by a yaml reader which parses a pipeline definition document
        """
        loaded_parameters = loaded_yaml.pop("parameters", {})
        if not isinstance(loaded_parameters, dict):
            raise ValueError("The parameters section must be a yaml mapping")
        self.parameters = ParametersIR(loaded_parameters)

    def _read_labeled_subsets(self, loaded_yaml: dict[str, Any]) -> None:
        """Process the subsets portion of the loaded yaml document

        Parameters
        ----------
        loaded_yaml : `dict`
            A dictionary which matches the structure that would be produced
            by a yaml reader which parses a pipeline definition document
        """
        loaded_subsets = loaded_yaml.pop("subsets", {})
        self.labeled_subsets: dict[str, LabeledSubset] = {}
        if not loaded_subsets and "subset" in loaded_yaml:
            raise ValueError("Top level key should be subsets and not subset, add an s")
        for key, value in loaded_subsets.items():
            self.labeled_subsets[key] = LabeledSubset.from_primitives(key, value)

    def _verify_labeled_subsets(self) -> None:
        """Verify that all the labels in each named subset exist within the
        pipeline.
        """
        # Verify that all labels defined in a labeled subset are in the
        # Pipeline
        for labeled_subset in self.labeled_subsets.values():
            if not labeled_subset.subset.issubset(self.tasks.keys()):
                raise ValueError(
                    f"Labels {labeled_subset.subset - self.tasks.keys()} were not found in the "
                    "declared pipeline"
                )
        # Verify subset labels are not already task labels
        label_intersection = self.labeled_subsets.keys() & self.tasks.keys()
        if label_intersection:
            raise ValueError(f"Labeled subsets can not use the same label as a task: {label_intersection}")


    def _read_imports(self, loaded_yaml: dict[str, Any]) -> None:
        """Process the imports portion of the loaded yaml document

        Parameters
        ----------
        loaded_yaml : `dict`
            A dictionary which matches the structure that would be produced by
            a yaml reader which parses a pipeline definition document
        """

        def process_args(argument: str | dict) -> dict:
            if isinstance(argument, str):
                return {"location": argument}
            elif isinstance(argument, dict):
                if "exclude" in argument and isinstance(argument["exclude"], str):
                    argument["exclude"] = [argument["exclude"]]
                if "include" in argument and isinstance(argument["include"], str):
                    argument["include"] = [argument["include"]]
                if "instrument" in argument and argument["instrument"] == "None":
                    argument["instrument"] = None
                return argument

        if not {"inherits", "imports"} - loaded_yaml.keys():
            raise ValueError("Cannot define both inherits and imports sections, use imports")
        tmp_import = loaded_yaml.pop("inherits", None)
        if tmp_import is None:
            tmp_import = loaded_yaml.pop("imports", None)
        else:
            raise ValueError("The 'inherits' key is not supported. Please use the key 'imports' instead")
        if tmp_import is None:
            self.imports: list[ImportIR] = []
        elif isinstance(tmp_import, list):
            self.imports = [ImportIR(**process_args(args)) for args in tmp_import]
        else:
            self.imports = [ImportIR(**process_args(tmp_import))]

        self.merge_pipelines([fragment.toPipelineIR() for fragment in self.imports])


    def merge_pipelines(self, pipelines: Iterable[PipelineIR]) -> None:
        """Merge one or more other `PipelineIR` objects into this object.

        Parameters
        ----------
        pipelines : `~collections.abc.Iterable` of `PipelineIR` objects
            An `~collections.abc.Iterable` that contains one or more
            `PipelineIR` objects to merge into this object.

        Raises
        ------
        ValueError
            Raised if there is a conflict in instrument specifications.
            Raised if a task label appears in more than one of the input
            `PipelineIR` objects which are to be merged.
            Raised if a labeled subset appears in more than one of the input
            `PipelineIR` objects which are to be merged, or clashes with any
            subset existing in this object.
        """
        # Integrate any imported pipelines
        accumulate_tasks: dict[str, TaskIR] = {}
        accumulate_labeled_subsets: dict[str, LabeledSubset] = {}
        accumulated_parameters = ParametersIR({})

        for tmp_IR in pipelines:
            if self.instrument is None:
                self.instrument = tmp_IR.instrument
            elif self.instrument != tmp_IR.instrument and tmp_IR.instrument is not None:
                msg = (
                    "Only one instrument can be declared in a pipeline or its imports. "
                    f"Top level pipeline defines {self.instrument} but pipeline to merge "
                    f"defines {tmp_IR.instrument}."
                )
                raise ValueError(msg)
            if duplicate_labels := accumulate_tasks.keys() & tmp_IR.tasks.keys():
                msg = (
                    "Task labels in the imported pipelines must be unique. "
                    f"These labels appear multiple times: {duplicate_labels}"
                )
                raise ValueError(msg)
            accumulate_tasks.update(tmp_IR.tasks)
            self.contracts.extend(tmp_IR.contracts)
            # Verify that tmp_IR's labeled subset names are unique among the
            # existing labeled subsets, and among the existing task labels.
            overlapping_subsets = accumulate_labeled_subsets.keys() & tmp_IR.labeled_subsets.keys()
            task_subset_overlap = (
                accumulate_labeled_subsets.keys() | tmp_IR.labeled_subsets.keys()
            ) & accumulate_tasks.keys()
            if overlapping_subsets or task_subset_overlap:
                raise ValueError(
                    "Labeled subset names must be unique amongst imports in both labels and "
                    f"named Subsets. Duplicate: {overlapping_subsets | task_subset_overlap}"
                )
            accumulate_labeled_subsets.update(tmp_IR.labeled_subsets)
            accumulated_parameters.update(tmp_IR.parameters)

        # Verify that any accumulated labeled subsets don't clash with a label
        # from this pipeline
        if accumulate_labeled_subsets.keys() & self.tasks.keys():
            raise ValueError(
                "Labeled subset names must be unique amongst imports in both labels and named Subsets"
            )
        # Merge in the named subsets for self so this document can override
        # any that have been declared
        accumulate_labeled_subsets.update(self.labeled_subsets)
        self.labeled_subsets = accumulate_labeled_subsets

        # Merge the dict of label:TaskIR objects, preserving any configs in
        # the imported pipeline if the labels point to the same class
        for label, task in self.tasks.items():
            if label not in accumulate_tasks:
                accumulate_tasks[label] = task
            elif accumulate_tasks[label].klass == task.klass:
                if task.config is not None:
                    for config in task.config:
                        accumulate_tasks[label].add_or_update_config(config)
            else:
                accumulate_tasks[label] = task
        self.tasks: dict[str, TaskIR] = accumulate_tasks
        accumulated_parameters.update(self.parameters)
        self.parameters = accumulated_parameters


    def _read_tasks(self, loaded_yaml: dict[str, Any]) -> None:
        """Process the tasks portion of the loaded yaml document

        Parameters
        ----------
        loaded_yaml : `dict`
            A dictionary which matches the structure that would be produced by
            a yaml reader which parses a pipeline definition document
        """
        self.tasks = {}
        tmp_tasks = loaded_yaml.pop("tasks", None)
        if tmp_tasks is None:
            tmp_tasks = {}

        if "parameters" in tmp_tasks:
            raise ValueError("parameters is a reserved word and cannot be used as a task label")

        for label, definition in tmp_tasks.items():
            if isinstance(definition, str):
                definition = {"class": definition}
            config = definition.get("config", None)
            if config is None:
                task_config_ir = None
            else:
                if isinstance(config, dict):
                    config = [config]
                task_config_ir = []
                for c in config:
                    file = c.pop("file", None)
                    if file is None:
                        file = []
                    elif not isinstance(file, list):
                        file = [file]
                    task_config_ir.append(
                        ConfigIR(
                            python=c.pop("python", None), dataId=c.pop("dataId", None), file=file, rest=c
                        )
                    )
            self.tasks[label] = TaskIR(label, definition["class"], task_config_ir)


    def _remove_contracts(self, label: str) -> None:
        """Remove any contracts that contain the given label

        String comparison used in this way is not the most elegant and may
        have issues, but it is the only feasible way when users can specify
        contracts with generic strings.
        """
        new_contracts = []
        for contract in self.contracts:
            # Match the label when it is not preceded by an ASCII identifier
            # character (or is at the start of the string) and is followed by
            # a dot
            if re.match(f".*([^A-Za-z0-9_]|^){label}[.]", contract.contract):
                continue
            new_contracts.append(contract)
        self.contracts = new_contracts


    def subset_from_labels(self, labelSpecifier: set[str]) -> PipelineIR:
        """Subset a pipelineIR to contain only labels specified in
        labelSpecifier.

        Parameters
        ----------
        labelSpecifier : `set` of `str`
            Set containing labels that describe how to subset a pipeline.

        Returns
        -------
        pipeline : `PipelineIR`
            A new pipelineIR object that is a subset of the old pipelineIR

        Raises
        ------
        ValueError
            Raised if there is an issue with specified labels

        Notes
        -----
        This method attempts to prune any contracts that contain labels which
        are not in the declared subset of labels. This pruning is done using a
        string based matching due to the nature of contracts and may prune
        more than it should. Any labeled subsets defined that no longer have
        all members of the subset present in the pipeline will be removed from
        the resulting pipeline.
        """
        pipeline = copy.deepcopy(self)

        # Update the label specifier to expand any named subsets
        toRemove = set()
        toAdd = set()
        for label in labelSpecifier:
            if label in pipeline.labeled_subsets:
                toRemove.add(label)
                toAdd.update(pipeline.labeled_subsets[label].subset)
        labelSpecifier.difference_update(toRemove)
        labelSpecifier.update(toAdd)
        # Verify all the labels are in the pipeline
        if not labelSpecifier.issubset(pipeline.tasks.keys() | pipeline.labeled_subsets):
            difference = labelSpecifier.difference(pipeline.tasks.keys())
            raise ValueError(
                "Not all supplied labels (specified or named subsets) are in the pipeline "
                f"definition, extra labels: {difference}"
            )
        # Copy needed so as to not modify while iterating
        pipeline_labels = set(pipeline.tasks.keys())
        # Remove the labels from the pipelineIR, and any contracts that
        # contain those labels (see docstring on _remove_contracts for why
        # this may cause issues)
        for label in pipeline_labels:
            if label not in labelSpecifier:
                pipeline.tasks.pop(label)
                pipeline._remove_contracts(label)

        # Create a copy of the object to iterate over
        labeled_subsets = copy.copy(pipeline.labeled_subsets)
        # Remove any labeled subsets that no longer have a complete set
        for label, labeled_subset in labeled_subsets.items():
            if labeled_subset.subset - pipeline.tasks.keys():
                pipeline.labeled_subsets.pop(label)

        return pipeline


    @classmethod
    def from_string(cls, pipeline_string: str) -> PipelineIR:
        """Create a `PipelineIR` object from a string formatted like a
        pipeline document

        Parameters
        ----------
        pipeline_string : `str`
            A string that is formatted like a pipeline document
        """
        loaded_yaml = yaml.load(pipeline_string, Loader=PipelineYamlLoader)
        return cls(loaded_yaml)

    @classmethod
    def from_uri(cls, uri: ResourcePathExpression) -> PipelineIR:
        """Create a `PipelineIR` object from the document specified by the
        input uri.

        Parameters
        ----------
        uri : convertible to `~lsst.resources.ResourcePath`
            Location of document to use in creating a `PipelineIR` object.

        Returns
        -------
        pipelineIR : `PipelineIR`
            The loaded pipeline
        """
        loaded_uri = ResourcePath(uri)
        with loaded_uri.open("r") as buffer:
            loaded_yaml = yaml.load(buffer, Loader=PipelineYamlLoader)
        return cls(loaded_yaml)

    def write_to_uri(self, uri: ResourcePathExpression) -> None:
        """Serialize this `PipelineIR` object into a yaml formatted string and
        write the output to a file at the specified uri.

        Parameters
        ----------
        uri : convertible to `~lsst.resources.ResourcePath`
            Location of document to write a `PipelineIR` object.
        """
        with ResourcePath(uri).open("w") as buffer:
            yaml.dump(self.to_primitives(), buffer, sort_keys=False, Dumper=MultilineStringDumper)


    def to_primitives(self) -> dict[str, Any]:
        """Convert to a representation used in yaml serialization

        Returns
        -------
        primitives : `dict`
            Dictionary that maps directly to the serialized YAML form.
        """
        accumulate = {"description": self.description}
        if self.instrument is not None:
            accumulate["instrument"] = self.instrument
        if self.parameters:
            accumulate["parameters"] = self.parameters.to_primitives()
        accumulate["tasks"] = {m: t.to_primitives() for m, t in self.tasks.items()}
        if len(self.contracts) > 0:
            # Sort contracts in lexicographical order by the contract string,
            # in the absence of any other ordering principle
            contracts_list = [c.to_primitives() for c in self.contracts]
            contracts_list.sort(key=lambda x: x["contract"])
            accumulate["contracts"] = contracts_list
        if self.labeled_subsets:
            accumulate["subsets"] = {k: v.to_primitives() for k, v in self.labeled_subsets.items()}
        return accumulate

    def __str__(self) -> str:
        """Instance formatting as how it would look in yaml representation"""
        return yaml.dump(self.to_primitives(), sort_keys=False, Dumper=MultilineStringDumper)

    def __repr__(self) -> str:
        """Instance formatting as how it would look in yaml representation"""
        return str(self)

    def __eq__(self, other: object) -> bool:
        if not isinstance(other, PipelineIR):
            return False
        # Special case contracts because it is a list, but order is not
        # important
        elif (
            all(
                getattr(self, attr) == getattr(other, attr)
                for attr in ("tasks", "instrument", "labeled_subsets", "parameters")
            )
            and len(self.contracts) == len(other.contracts)
            and all(c in self.contracts for c in other.contracts)
        ):
            return True
        else:
            return False
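

# Illustrative sketch (not part of the module): round-tripping a minimal
# pipeline document. The task class path is hypothetical.
#
#     doc = """
#     description: A trivial pipeline
#     tasks:
#       taskA: lsst.example.TaskA
#     """
#     pipeline = PipelineIR.from_string(doc)
#     print(pipeline)                       # yaml form via __str__
#     pipeline.write_to_uri("pipeline.yaml")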