Coverage for python/lsst/pipe/base/pipelineIR.py: 21%

448 statements

coverage.py v7.4.4, created at 2024-04-19 11:28 +0000

# This file is part of pipe_base.
#
# Developed for the LSST Data Management System.
# This product includes software developed by the LSST Project
# (http://www.lsst.org).
# See the COPYRIGHT file at the top-level directory of this distribution
# for details of code ownership.
#
# This software is dual licensed under the GNU General Public License and also
# under a 3-clause BSD license. Recipients may choose which of these licenses
# to use; please see the files gpl-3.0.txt and/or bsd_license.txt,
# respectively. If you choose the GPL option then the following text applies
# (but note that there is still no warranty even if you opt for BSD instead):
#
# This program is free software: you can redistribute it and/or modify
# it under the terms of the GNU General Public License as published by
# the Free Software Foundation, either version 3 of the License, or
# (at your option) any later version.
#
# This program is distributed in the hope that it will be useful,
# but WITHOUT ANY WARRANTY; without even the implied warranty of
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
# GNU General Public License for more details.
#
# You should have received a copy of the GNU General Public License
# along with this program. If not, see <http://www.gnu.org/licenses/>.
from __future__ import annotations

__all__ = (
    "ConfigIR",
    "ContractError",
    "ContractIR",
    "ImportIR",
    "LabeledSubset",
    "ParametersIR",
    "PipelineIR",
    "TaskIR",
)

import copy
import enum
import os
import re
import warnings
from collections import Counter
from collections.abc import Generator, Hashable, Iterable, MutableMapping
from dataclasses import dataclass, field
from typing import Any, Literal

import yaml
from lsst.resources import ResourcePath, ResourcePathExpression
from lsst.utils.introspection import find_outside_stacklevel


class PipelineSubsetCtrl(enum.Enum):
    """An enumeration of the various ways a pipeline subsetting operation
    will handle labeled subsets when the task labels they contain are
    missing.
    """

    DROP = enum.auto()
    """Drop any subsets that contain labels which are no longer in the set of
    task labels when subsetting an entire pipeline.
    """
    EDIT = enum.auto()
    """Edit any subsets that contain labels which are no longer in the set of
    task labels to remove the missing label, but leave the subset in place
    when subsetting a pipeline.
    """


class _Tags(enum.Enum):
    KeepInstrument = enum.auto()


class PipelineYamlLoader(yaml.SafeLoader):
    """Specialized version of yaml's SafeLoader.

    It checks and raises an exception if it finds that there are multiple
    instances of the same key found inside a pipeline file at a given scope.
    """

    def construct_mapping(self, node: yaml.MappingNode, deep: bool = False) -> dict[Hashable, Any]:
        # Do the call to super first so that it can do all the other forms of
        # checking on this node. Checking the uniqueness of keys first would
        # save the work that super does in the case of a failure, but it
        # might fail when the node is the wrong node due to a parsing error,
        # and the resulting exception would be difficult to understand.
        mapping = super().construct_mapping(node, deep)
        # Check if there are any duplicate keys
        all_keys = Counter(key_node.value for key_node, _ in node.value)
        duplicates = {k for k, i in all_keys.items() if i != 1}
        if duplicates:
            raise KeyError(
                f"Pipeline files must not have duplicated keys, {duplicates} appeared multiple times"
            )
        return mapping
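
    # Example (illustrative sketch; the task names are hypothetical): a
    # duplicated key at the same scope is rejected rather than silently
    # overwritten.
    #
    #     doc = "tasks:\n  taskA: mod.A\n  taskA: mod.B\n"
    #     yaml.load(doc, Loader=PipelineYamlLoader)  # raises KeyError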


class MultilineStringDumper(yaml.Dumper):
    """Custom YAML dumper that makes multi-line strings use the '|'
    continuation style instead of unreadable newlines and tons of quotes.

    Basic approach is taken from
    https://stackoverflow.com/questions/8640959/how-can-i-control-what-scalar-form-pyyaml-uses-for-my-data,
    but is written as a Dumper subclass to make its effects non-global (vs
    `yaml.add_representer`).
    """

    def represent_scalar(self, tag: str, value: Any, style: str | None = None) -> yaml.ScalarNode:
        if style is None and tag == "tag:yaml.org,2002:str" and len(value.splitlines()) > 1:
            style = "|"
        return super().represent_scalar(tag, value, style)
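
    # Example (illustrative sketch; the key name is hypothetical): multi-line
    # strings are emitted in block style rather than as quoted scalars.
    #
    #     yaml.dump({"python": "a = 1\nb = 2\n"}, Dumper=MultilineStringDumper)
    #     # -> "python: |\n  a = 1\n  b = 2\n"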


class ContractError(Exception):
    """An exception that is raised when a pipeline contract is not
    satisfied.
    """

    pass


@dataclass
class ContractIR:
    """Intermediate representation of configuration contracts read from a
    pipeline yaml file.
    """

    contract: str
    """A string of python code representing one or more conditions on configs
    in a pipeline. This code-as-string should, once evaluated, be `True` if
    the configs are fine, and `False` otherwise.
    """
    msg: str | None = None
    """An optional message to be shown to the user if a contract fails.
    """

    def to_primitives(self) -> dict[str, str]:
        """Convert to a representation used in yaml serialization."""
        accumulate = {"contract": self.contract}
        if self.msg is not None:
            accumulate["msg"] = self.msg
        return accumulate

    def __eq__(self, other: object) -> bool:
        if not isinstance(other, ContractIR):
            return False
        return self.contract == other.contract and self.msg == other.msg
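
# Example (illustrative sketch; the labels and config fields are
# hypothetical): the two forms a contract may take in a pipeline document,
# as parsed by PipelineIR._read_contracts below.
#
#     contracts:
#       - "taskA.field1 == taskB.field2"
#       - contract: "taskA.threshold < 1.0"
#         msg: "taskA.threshold must stay below 1.0"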


@dataclass
class LabeledSubset:
    """Intermediate representation of a named subset of task labels read
    from a pipeline yaml file.
    """

    label: str
    """The label used to identify the subset of task labels.
    """
    subset: set[str]
    """A set of task labels contained in this subset.
    """
    description: str | None
    """A description of what this subset of tasks is intended to do.
    """

    @staticmethod
    def from_primitives(label: str, value: list[str] | dict) -> LabeledSubset:
        """Generate `LabeledSubset` objects given a properly formatted object
        that has been created by a yaml loader.

        Parameters
        ----------
        label : `str`
            The label that will be used to identify this labeled subset.
        value : `list` of `str` or `dict`
            Object returned from loading a labeled subset section from a yaml
            document.

        Returns
        -------
        labeledSubset : `LabeledSubset`
            A `LabeledSubset` object built from the inputs.

        Raises
        ------
        ValueError
            Raised if the value input is not properly formatted for parsing.
        """
        if isinstance(value, MutableMapping):
            subset = value.pop("subset", None)
            if subset is None:
                raise ValueError(
                    "If a labeled subset is specified as a mapping, it must contain the key 'subset'"
                )
            description = value.pop("description", None)
        elif isinstance(value, Iterable):
            subset = value
            description = None
        else:
            raise ValueError(
                f"There was a problem parsing the labeled subset {label}, make sure the "
                "definition is either a valid yaml list, or a mapping with keys "
                "(subset, description) where subset points to a yaml list, and description is "
                "associated with a string"
            )
        return LabeledSubset(label, set(subset), description)

    def to_primitives(self) -> dict[str, list[str] | str]:
        """Convert to a representation used in yaml serialization."""
        accumulate: dict[str, list[str] | str] = {"subset": list(self.subset)}
        if self.description is not None:
            accumulate["description"] = self.description
        return accumulate
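
    # Example (illustrative sketch; the labels are hypothetical): both
    # accepted input forms produce the same kind of object.
    #
    #     LabeledSubset.from_primitives("step1", ["taskA", "taskB"])
    #     LabeledSubset.from_primitives(
    #         "step1", {"subset": ["taskA", "taskB"], "description": "First step"}
    #     )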


@dataclass
class ParametersIR:
    """Intermediate representation of parameters that are global to a pipeline.

    Attributes
    ----------
    mapping : `dict` [`str`, `str`]
        A mutable mapping of identifiers as keys, and shared configuration
        as values.

    Notes
    -----
    These parameters are specified under a top level key named ``parameters``
    and are declared as a yaml mapping. These entries can then be used inside
    task configuration blocks to specify configuration values. They may not be
    used in the special ``file`` or ``python`` blocks.

    Examples
    --------
    .. code-block:: yaml

        parameters:
          shared_value: 14
        tasks:
          taskA:
            class: modA
            config:
              field1: parameters.shared_value
          taskB:
            class: modB
            config:
              field2: parameters.shared_value
    """

    mapping: MutableMapping[str, Any]
    """A mutable mapping of identifiers as keys, and shared configuration
    as values.
    """

    def update(self, other: ParametersIR | None) -> None:
        if other is not None:
            self.mapping.update(other.mapping)

    def to_primitives(self) -> MutableMapping[str, str]:
        """Convert to a representation used in yaml serialization."""
        return self.mapping

    def __contains__(self, value: str) -> bool:
        return value in self.mapping

    def __getitem__(self, item: str) -> Any:
        return self.mapping[item]

    def __bool__(self) -> bool:
        return bool(self.mapping)
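
    # Example (illustrative sketch): ParametersIR behaves like a small
    # read-only mapping for lookups during parameter substitution.
    #
    #     params = ParametersIR({"shared_value": 14})
    #     "shared_value" in params  # True
    #     params["shared_value"]    # 14
    #     bool(ParametersIR({}))    # False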


@dataclass
class ConfigIR:
    """Intermediate representation of configurations read from a pipeline yaml
    file.
    """

    python: str | None = None
    """A string of python code that is used to modify a configuration. This
    can also be None if there are no modifications to do.
    """
    dataId: dict | None = None
    """A dataId that is used to constrain these config overrides to only
    quanta with matching dataIds. This field can be None if there is no
    constraint. This is currently an unimplemented feature, and is placed
    here for future use.
    """
    file: list[str] = field(default_factory=list)
    """A list of paths which point to files containing config overrides to be
    applied. This value may be an empty list if there are no overrides to
    apply.
    """
    rest: dict = field(default_factory=dict)
    """This is a dictionary of key value pairs, where the keys are strings
    corresponding to qualified fields on a config to override, and the values
    are strings representing the values to apply.
    """

    def to_primitives(self) -> dict[str, str | dict | list[str]]:
        """Convert to a representation used in yaml serialization."""
        accumulate = {}
        for name in ("python", "dataId", "file"):
            # if this attribute is truthy add it to the accumulation
            # dictionary
            if getattr(self, name):
                accumulate[name] = getattr(self, name)
        # Add the dictionary containing the rest of the config keys to the
        # accumulated dictionary
        accumulate.update(self.rest)
        return accumulate

    def formatted(self, parameters: ParametersIR) -> ConfigIR:
        """Return a new ConfigIR object that is formatted according to the
        specified parameters.

        Parameters
        ----------
        parameters : `ParametersIR`
            Object that contains variable mappings used in substitution.

        Returns
        -------
        config : `ConfigIR`
            A new ConfigIR object formatted with the input parameters.
        """
        new_config = copy.deepcopy(self)
        for key, value in new_config.rest.items():
            if not isinstance(value, str):
                continue
            match = re.match("parameters[.](.*)", value)
            if match and match.group(1) in parameters:
                new_config.rest[key] = parameters[match.group(1)]
            if match and match.group(1) not in parameters:
                warnings.warn(
                    f"config {key} contains value {match.group(0)} which is formatted like a "
                    "Pipeline parameter but was not found within the Pipeline, if this was not "
                    "intentional, check for a typo",
                    stacklevel=find_outside_stacklevel("lsst.pipe.base"),
                )
        return new_config
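
    # Example (illustrative sketch; the field names are hypothetical): values
    # of the form "parameters.<name>" are replaced from the parameter mapping.
    #
    #     params = ParametersIR({"shared_value": 14})
    #     config = ConfigIR(rest={"field1": "parameters.shared_value"})
    #     config.formatted(params).rest  # {"field1": 14}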

    def maybe_merge(self, other_config: "ConfigIR") -> Generator["ConfigIR", None, None]:
        """Merge another instance of a `ConfigIR` into this instance if
        possible. This method returns a generator that yields either just
        ``self`` if the configs were merged, or ``self`` and ``other_config``
        if they could not be merged.

        Parameters
        ----------
        other_config : `ConfigIR`
            An instance of `ConfigIR` to merge into this instance.

        Yields
        ------
        Generator : `ConfigIR`
            Either ``self`` if the configs were merged, or ``self`` and
            ``other_config`` if they could not be.
        """
        # Verify that the config blocks can be merged
        if (
            self.dataId != other_config.dataId
            or self.python
            or other_config.python
            or self.file
            or other_config.file
        ):
            yield from (self, other_config)
            return

        # Check the keys common to both configs, and verify that no shared
        # key has a different value in each config
        key_union = self.rest.keys() & other_config.rest.keys()
        for key in key_union:
            if self.rest[key] != other_config.rest[key]:
                yield from (self, other_config)
                return
        self.rest.update(other_config.rest)

        # Combine the lists of override files to load
        self_file_set = set(self.file)
        other_file_set = set(other_config.file)
        self.file = list(self_file_set.union(other_file_set))

        yield self
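
    # Example (illustrative sketch; the keys are hypothetical): disjoint
    # plain overrides merge in place; a python block always keeps the
    # configs separate.
    #
    #     a = ConfigIR(rest={"x": "1"})
    #     list(a.maybe_merge(ConfigIR(rest={"y": "2"})))        # [a], merged
    #     list(a.maybe_merge(ConfigIR(python="config.z = 3")))  # [a, other]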

    def __eq__(self, other: object) -> bool:
        if not isinstance(other, ConfigIR):
            return False
        return all(
            getattr(self, attr) == getattr(other, attr) for attr in ("python", "dataId", "file", "rest")
        )


@dataclass
class TaskIR:
    """Intermediate representation of tasks read from a pipeline yaml file."""

    label: str
    """An identifier used to refer to a task.
    """
    klass: str
    """A string containing a fully qualified python class to be run in a
    pipeline.
    """
    config: list[ConfigIR] | None = None
    """A list of all config overrides associated with this task; may be
    `None` if there are no config overrides.
    """

    def to_primitives(self) -> dict[str, str | list[dict]]:
        """Convert to a representation used in yaml serialization."""
        accumulate: dict[str, str | list[dict]] = {"class": self.klass}
        if self.config:
            accumulate["config"] = [c.to_primitives() for c in self.config]
        return accumulate

    def add_or_update_config(self, other_config: ConfigIR) -> None:
        """Add a `ConfigIR` to this task if one is not present. Merges configs
        if there is a `ConfigIR` present and the dataId keys of both configs
        match, otherwise adds a new entry to the config list. The exception to
        the above is that if either the last config or other_config has a
        python block, then other_config is always added, as python blocks can
        modify configs in ways that cannot be predicted.

        Parameters
        ----------
        other_config : `ConfigIR`
            A `ConfigIR` instance to add or merge into the config attribute of
            this task.
        """
        if not self.config:
            self.config = [other_config]
            return
        self.config.extend(self.config.pop().maybe_merge(other_config))
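
    # Example (illustrative sketch; the labels and keys are hypothetical): a
    # new override is folded into the last compatible config block.
    #
    #     task = TaskIR("taskA", "mod.A", [ConfigIR(rest={"x": "1"})])
    #     task.add_or_update_config(ConfigIR(rest={"y": "2"}))
    #     task.config[0].rest  # {"x": "1", "y": "2"}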

    def __eq__(self, other: object) -> bool:
        if not isinstance(other, TaskIR):
            return False
        return all(getattr(self, attr) == getattr(other, attr) for attr in ("label", "klass", "config"))


@dataclass
class ImportIR:
    """An intermediate representation of imported pipelines."""

    location: str
    """This is the location of the pipeline to inherit. The path should be
    specified as an absolute path. Environment variables may be used in the
    path and should be specified as a python string template, with the name of
    the environment variable inside braces.
    """
    include: list[str] | None = None
    """List of tasks that should be included when inheriting this pipeline.
    Either the include or exclude attributes may be specified, but not both.
    """
    exclude: list[str] | None = None
    """List of tasks that should be excluded when inheriting this pipeline.
    Either the include or exclude attributes may be specified, but not both.
    """
    importContracts: bool = True
    """Boolean attribute to dictate if contracts should be inherited with the
    pipeline or not.
    """
    importSteps: bool = True
    """Boolean attribute to dictate if steps should be inherited with the
    pipeline or not.
    """
    labeledSubsetModifyMode: PipelineSubsetCtrl = PipelineSubsetCtrl.DROP
    """Controls how labeled subsets are handled when an import ends up not
    including (either through an include or exclusion list) a task label that
    is defined in the `Pipeline` being imported. DROP will remove any
    subsets which contain a missing label. EDIT will change any subsets to not
    include the missing label.
    """
    instrument: Literal[_Tags.KeepInstrument] | str | None = _Tags.KeepInstrument
    """Instrument to assign to the Pipeline at import. The default value of
    ``_Tags.KeepInstrument`` indicates that whatever instrument the pipeline
    is declared with will not be modified. Setting this value to `None` will
    drop any declared instrument prior to import.
    """

    def toPipelineIR(self) -> "PipelineIR":
        """Load in the Pipeline specified by this object, and turn it into a
        PipelineIR instance.

        Returns
        -------
        pipeline : `PipelineIR`
            A pipeline generated from the imported pipeline file.
        """
        if self.include and self.exclude:
            raise ValueError(
                "An include list and an exclude list cannot both be specified"
                " when declaring a pipeline import."
            )
        tmp_pipeline = PipelineIR.from_uri(os.path.expandvars(self.location))
        if self.instrument is not _Tags.KeepInstrument:
            tmp_pipeline.instrument = self.instrument

        included_labels = set()
        for label in tmp_pipeline.tasks:
            if (
                (self.include and label in self.include)
                or (self.exclude and label not in self.exclude)
                or (self.include is None and self.exclude is None)
            ):
                included_labels.add(label)

        # Handle labeled subsets being specified in the include or exclude
        # list, adding or removing labels.
        if self.include is not None:
            subsets_in_include = tmp_pipeline.labeled_subsets.keys() & self.include
            for label in subsets_in_include:
                included_labels.update(tmp_pipeline.labeled_subsets[label].subset)

        elif self.exclude is not None:
            subsets_in_exclude = tmp_pipeline.labeled_subsets.keys() & self.exclude
            for label in subsets_in_exclude:
                included_labels.difference_update(tmp_pipeline.labeled_subsets[label].subset)

        if not self.importSteps:
            tmp_pipeline.steps = []

        tmp_pipeline = tmp_pipeline.subset_from_labels(included_labels, self.labeledSubsetModifyMode)

        if not self.importContracts:
            tmp_pipeline.contracts = []

        return tmp_pipeline
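
    # Example (illustrative sketch; the path, environment variable, and
    # labels are hypothetical): import two tasks from another pipeline file,
    # dropping its declared instrument.
    #
    #     fragment = ImportIR(
    #         location="${PIPELINES_DIR}/base_pipeline.yaml",
    #         include=["taskA", "taskB"],
    #         instrument=None,
    #     )
    #     imported = fragment.toPipelineIR()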

    def __eq__(self, other: object) -> bool:
        if not isinstance(other, ImportIR):
            return False
        return all(
            getattr(self, attr) == getattr(other, attr)
            for attr in ("location", "include", "exclude", "importContracts")
        )


@dataclass
class StepIR:
    """Intermediate representation of a step definition."""

    label: str
    """The label associated with this step."""
    sharding_dimensions: list[str]
    """The dimensions to use when sharding this step."""


class PipelineIR:
    """Intermediate representation of a pipeline definition.

    Parameters
    ----------
    loaded_yaml : `dict`
        A dictionary which matches the structure that would be produced by a
        yaml reader which parses a pipeline definition document.

    Raises
    ------
    ValueError
        Raised if:

        - a pipeline is declared without a description;
        - no tasks are declared in a pipeline, and no pipelines are to be
          inherited;
        - more than one instrument is specified;
        - more than one inherited pipeline share a label.
    """

    def __init__(self, loaded_yaml: dict[str, Any]):
        # Check required fields are present
        if "description" not in loaded_yaml:
            raise ValueError("A pipeline must be declared with a description")
        if "tasks" not in loaded_yaml and len({"imports", "inherits"} - loaded_yaml.keys()) == 2:
            raise ValueError("A pipeline must be declared with one or more tasks")

        # The steps below must happen in this call order

        # Process pipeline description
        self.description = loaded_yaml.pop("description")

        # Process tasks
        self._read_tasks(loaded_yaml)

        # Process instrument keys
        inst = loaded_yaml.pop("instrument", None)
        if isinstance(inst, list):
            raise ValueError("Only one top level instrument can be defined in a pipeline")
        self.instrument: str | None = inst

        # Process any contracts
        self._read_contracts(loaded_yaml)

        # Process any defined parameters
        self._read_parameters(loaded_yaml)

        # Process any named label subsets
        self._read_labeled_subsets(loaded_yaml)

        # Process any declared steps
        self._read_step_declaration(loaded_yaml)

        # Process any inherited pipelines
        self._read_imports(loaded_yaml)

        # Verify named subsets, must be done after inheriting
        self._verify_labeled_subsets()

        # Verify steps, must be done after inheriting
        self._verify_steps()

    def _read_contracts(self, loaded_yaml: dict[str, Any]) -> None:
        """Process the contracts portion of the loaded yaml document.

        Parameters
        ----------
        loaded_yaml : `dict`
            A dictionary which matches the structure that would be produced by
            a yaml reader which parses a pipeline definition document.
        """
        loaded_contracts = loaded_yaml.pop("contracts", [])
        if isinstance(loaded_contracts, str):
            loaded_contracts = [loaded_contracts]
        self.contracts: list[ContractIR] = []
        for contract in loaded_contracts:
            if isinstance(contract, dict):
                self.contracts.append(ContractIR(**contract))
            if isinstance(contract, str):
                self.contracts.append(ContractIR(contract=contract))

    def _read_parameters(self, loaded_yaml: dict[str, Any]) -> None:
        """Process the parameters portion of the loaded yaml document.

        Parameters
        ----------
        loaded_yaml : `dict`
            A dictionary which matches the structure that would be produced by
            a yaml reader which parses a pipeline definition document.
        """
        loaded_parameters = loaded_yaml.pop("parameters", {})
        if not isinstance(loaded_parameters, dict):
            raise ValueError("The parameters section must be a yaml mapping")
        self.parameters = ParametersIR(loaded_parameters)

    def _read_labeled_subsets(self, loaded_yaml: dict[str, Any]) -> None:
        """Process the subsets portion of the loaded yaml document.

        Parameters
        ----------
        loaded_yaml : `MutableMapping`
            A dictionary which matches the structure that would be produced
            by a yaml reader which parses a pipeline definition document.
        """
        loaded_subsets = loaded_yaml.pop("subsets", {})
        self.labeled_subsets: dict[str, LabeledSubset] = {}
        if not loaded_subsets and "subset" in loaded_yaml:
            raise ValueError("Top level key should be subsets and not subset, add an s")
        for key, value in loaded_subsets.items():
            self.labeled_subsets[key] = LabeledSubset.from_primitives(key, value)

    def _read_step_declaration(self, loaded_yaml: dict[str, Any]) -> None:
        """Process the steps portion of the loaded yaml document.

        Steps are subsets that are declared to be normal parts of the overall
        processing of the pipeline. Not all subsets need to be a step, as they
        can exist for certain targeted processing, such as debugging.

        Parameters
        ----------
        loaded_yaml : `dict`
            A dictionary which matches the structure that would be produced
            by a yaml reader which parses a pipeline definition document.
        """
        loaded_steps = loaded_yaml.pop("steps", [])
        temp_steps: dict[str, StepIR] = {}
        for declaration in loaded_steps:
            new_step = StepIR(**declaration)
            existing = temp_steps.setdefault(new_step.label, new_step)
            if existing is not new_step:
                raise ValueError(f"Step {existing.label} was declared twice.")
        self.steps = list(temp_steps.values())
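
    # Example (illustrative sketch; the names are hypothetical): how a steps
    # section might look. Each step label must also be declared as a labeled
    # subset (see _verify_steps below).
    #
    #     steps:
    #       - label: step1
    #         sharding_dimensions: ["visit", "detector"]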

    def _verify_labeled_subsets(self) -> None:
        """Verify that all the labels in each named subset exist within the
        pipeline.
        """
        # Verify that all labels defined in a labeled subset are in the
        # Pipeline
        for labeled_subset in self.labeled_subsets.values():
            if not labeled_subset.subset.issubset(self.tasks.keys()):
                raise ValueError(
                    f"Labels {labeled_subset.subset - self.tasks.keys()} were not found in the "
                    "declared pipeline"
                )
        # Verify subset labels are not already task labels
        label_intersection = self.labeled_subsets.keys() & self.tasks.keys()
        if label_intersection:
            raise ValueError(f"Labeled subsets can not use the same label as a task: {label_intersection}")

    def _verify_steps(self) -> None:
        """Verify that all step definitions have a corresponding labeled
        subset.
        """
        for step in self.steps:
            if step.label not in self.labeled_subsets:
                raise ValueError(
                    f"{step.label} was declared to be a step, but was not declared to be a labeled subset"
                )

    def _read_imports(self, loaded_yaml: dict[str, Any]) -> None:
        """Process the imports portion of the loaded yaml document.

        Parameters
        ----------
        loaded_yaml : `dict`
            A dictionary which matches the structure that would be produced by
            a yaml reader which parses a pipeline definition document.
        """

        def process_args(argument: str | dict) -> dict:
            if isinstance(argument, str):
                return {"location": argument}
            elif isinstance(argument, dict):
                if "exclude" in argument and isinstance(argument["exclude"], str):
                    argument["exclude"] = [argument["exclude"]]
                if "include" in argument and isinstance(argument["include"], str):
                    argument["include"] = [argument["include"]]
                if "instrument" in argument and argument["instrument"] == "None":
                    argument["instrument"] = None
                if "labeledSubsetModifyMode" in argument:
                    match argument["labeledSubsetModifyMode"]:
                        case "DROP":
                            argument["labeledSubsetModifyMode"] = PipelineSubsetCtrl.DROP
                        case "EDIT":
                            argument["labeledSubsetModifyMode"] = PipelineSubsetCtrl.EDIT
                        case unknown:
                            raise ValueError(f"{unknown} is not a valid mode for labeledSubsetModifyMode")
            return argument

        if not {"inherits", "imports"} - loaded_yaml.keys():
            raise ValueError("Cannot define both inherits and imports sections, use imports")
        tmp_import = loaded_yaml.pop("inherits", None)
        if tmp_import is None:
            tmp_import = loaded_yaml.pop("imports", None)
        else:
            raise ValueError("The 'inherits' key is not supported. Please use the key 'imports' instead")
        if tmp_import is None:
            self.imports: list[ImportIR] = []
        elif isinstance(tmp_import, list):
            self.imports = [ImportIR(**process_args(args)) for args in tmp_import]
        else:
            self.imports = [ImportIR(**process_args(tmp_import))]

        self.merge_pipelines([fragment.toPipelineIR() for fragment in self.imports])
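
    # Example (illustrative sketch; the locations and labels are
    # hypothetical): the imports section accepts a bare location string, a
    # mapping, or a list of either, and process_args above normalizes each
    # form.
    #
    #     imports:
    #       - ${PIPELINES_DIR}/base_pipeline.yaml
    #       - location: ${PIPELINES_DIR}/extra_pipeline.yaml
    #         exclude: taskC
    #         instrument: None
    #         labeledSubsetModifyMode: EDIT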

    def merge_pipelines(self, pipelines: Iterable[PipelineIR]) -> None:
        """Merge one or more other `PipelineIR` objects into this object.

        Parameters
        ----------
        pipelines : `~collections.abc.Iterable` of `PipelineIR` objects
            An `~collections.abc.Iterable` that contains one or more
            `PipelineIR` objects to merge into this object.

        Raises
        ------
        ValueError
            Raised if there is a conflict in instrument specifications.
            Raised if a task label appears in more than one of the input
            `PipelineIR` objects which are to be merged.
            Raised if a labeled subset appears in more than one of the input
            `PipelineIR` objects which are to be merged, or clashes with any
            subset existing in this object.
        """
        # integrate any imported pipelines
        accumulate_tasks: dict[str, TaskIR] = {}
        accumulate_labeled_subsets: dict[str, LabeledSubset] = {}
        accumulated_parameters = ParametersIR({})
        accumulated_steps: dict[str, StepIR] = {}

        for tmp_IR in pipelines:
            if self.instrument is None:
                self.instrument = tmp_IR.instrument
            elif self.instrument != tmp_IR.instrument and tmp_IR.instrument is not None:
                msg = (
                    "Only one instrument can be declared in a pipeline or its imports. "
                    f"Top level pipeline defines {self.instrument} but pipeline to merge "
                    f"defines {tmp_IR.instrument}."
                )
                raise ValueError(msg)
            if duplicate_labels := accumulate_tasks.keys() & tmp_IR.tasks.keys():
                msg = (
                    "Task labels in the imported pipelines must be unique. "
                    f"These labels appear multiple times: {duplicate_labels}"
                )
                raise ValueError(msg)
            accumulate_tasks.update(tmp_IR.tasks)
            self.contracts.extend(tmp_IR.contracts)
            # Verify that tmp_IR's labels for named subsets are unique among
            # the existing labeled subsets and the existing task labels.
            overlapping_subsets = accumulate_labeled_subsets.keys() & tmp_IR.labeled_subsets.keys()
            task_subset_overlap = (
                accumulate_labeled_subsets.keys() | tmp_IR.labeled_subsets.keys()
            ) & accumulate_tasks.keys()
            if overlapping_subsets or task_subset_overlap:
                raise ValueError(
                    "Labeled subset names must be unique amongst imports in both labels and "
                    f"named subsets. Duplicate: {overlapping_subsets | task_subset_overlap}"
                )
            accumulate_labeled_subsets.update(tmp_IR.labeled_subsets)
            accumulated_parameters.update(tmp_IR.parameters)
            for tmp_step in tmp_IR.steps:
                existing = accumulated_steps.setdefault(tmp_step.label, tmp_step)
                if existing != tmp_step:
                    raise ValueError(
                        f"There were conflicting step definitions in import {tmp_step}, {existing}"
                    )

        for tmp_step in self.steps:
            existing = accumulated_steps.setdefault(tmp_step.label, tmp_step)
            if existing != tmp_step:
                raise ValueError(f"There were conflicting step definitions in import {tmp_step}, {existing}")

        # Verify that any accumulated labeled subsets don't clash with a
        # label from this pipeline
        if accumulate_labeled_subsets.keys() & self.tasks.keys():
            raise ValueError(
                "Labeled subset names must be unique amongst imports in both labels and named subsets"
            )
        # merge in the named subsets for self so this document can override
        # any that have been declared
        accumulate_labeled_subsets.update(self.labeled_subsets)
        self.labeled_subsets = accumulate_labeled_subsets

        # merge the dict of label:TaskIR objects, preserving any configs in
        # the imported pipeline if the labels point to the same class
        for label, task in self.tasks.items():
            if label not in accumulate_tasks:
                accumulate_tasks[label] = task
            elif accumulate_tasks[label].klass == task.klass:
                if task.config is not None:
                    for config in task.config:
                        accumulate_tasks[label].add_or_update_config(config)
            else:
                accumulate_tasks[label] = task
        self.tasks: dict[str, TaskIR] = accumulate_tasks
        accumulated_parameters.update(self.parameters)
        self.parameters = accumulated_parameters
        self.steps = list(accumulated_steps.values())
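
    # Example (illustrative sketch; the documents are hypothetical): merging
    # a second pipeline folds its tasks, contracts, subsets, parameters, and
    # steps into this one, subject to the uniqueness checks above.
    #
    #     base = PipelineIR.from_string("description: base\ntasks:\n  taskA: mod.A\n")
    #     extra = PipelineIR.from_string("description: extra\ntasks:\n  taskB: mod.B\n")
    #     base.merge_pipelines([extra])
    #     sorted(base.tasks)  # ["taskA", "taskB"]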

    def _read_tasks(self, loaded_yaml: dict[str, Any]) -> None:
        """Process the tasks portion of the loaded yaml document.

        Parameters
        ----------
        loaded_yaml : `dict`
            A dictionary which matches the structure that would be produced by
            a yaml reader which parses a pipeline definition document.
        """
        self.tasks = {}
        tmp_tasks = loaded_yaml.pop("tasks", None)
        if tmp_tasks is None:
            tmp_tasks = {}

        if "parameters" in tmp_tasks:
            raise ValueError("parameters is a reserved word and cannot be used as a task label")

        for label, definition in tmp_tasks.items():
            if isinstance(definition, str):
                definition = {"class": definition}
            config = definition.get("config", None)
            if config is None:
                task_config_ir = None
            else:
                if isinstance(config, dict):
                    config = [config]
                task_config_ir = []
                for c in config:
                    file = c.pop("file", None)
                    if file is None:
                        file = []
                    elif not isinstance(file, list):
                        file = [file]
                    task_config_ir.append(
                        ConfigIR(
                            python=c.pop("python", None), dataId=c.pop("dataId", None), file=file, rest=c
                        )
                    )
            self.tasks[label] = TaskIR(label, definition["class"], task_config_ir)

    def _remove_contracts(self, label: str) -> None:
        """Remove any contracts that contain the given label.

        String comparison used in this way is not the most elegant and may
        have issues, but it is the only feasible way when users can specify
        contracts with generic strings.
        """
        new_contracts = []
        for contract in self.contracts:
            # match the label when it is followed by a dot and is either at
            # the start of the contract or not preceded by an ASCII
            # identifier character
            if re.match(f".*([^A-Za-z0-9_]|^){label}[.]", contract.contract):
                continue
            new_contracts.append(contract)
        self.contracts = new_contracts
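
    # Example (illustrative sketch; the contract strings are hypothetical):
    # pruning contracts that mention the label "taskA".
    #
    #     "taskA.threshold < 1.0"   # removed: "taskA." at the start
    #     "other.x == taskA.y"      # removed: "taskA." after a non-word char
    #     "mytaskA.x == taskB.y"    # kept: "taskA" preceded by an identifier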

    def subset_from_labels(
        self, labelSpecifier: set[str], subsetCtrl: PipelineSubsetCtrl = PipelineSubsetCtrl.DROP
    ) -> PipelineIR:
        """Subset a pipelineIR to contain only labels specified in
        labelSpecifier.

        Parameters
        ----------
        labelSpecifier : `set` of `str`
            Set containing labels that describes how to subset a pipeline.
        subsetCtrl : `PipelineSubsetCtrl`
            Control object which decides how subsets with missing labels are
            handled. Setting to `PipelineSubsetCtrl.DROP` (the default) will
            cause any subsets that have labels which are not in the set of all
            task labels to be dropped. Setting to `PipelineSubsetCtrl.EDIT`
            will cause the subset to instead be edited to remove the
            nonexistent label.

        Returns
        -------
        pipeline : `PipelineIR`
            A new pipelineIR object that is a subset of the old pipelineIR.

        Raises
        ------
        ValueError
            Raised if there is an issue with specified labels.

        Notes
        -----
        This method attempts to prune any contracts that contain labels which
        are not in the declared subset of labels. This pruning is done using
        string-based matching due to the nature of contracts and may prune
        more than it should.
        """
        pipeline = copy.deepcopy(self)

        # update the label specifier to expand any named subsets
        toRemove = set()
        toAdd = set()
        for label in labelSpecifier:
            if label in pipeline.labeled_subsets:
                toRemove.add(label)
                toAdd.update(pipeline.labeled_subsets[label].subset)
        labelSpecifier.difference_update(toRemove)
        labelSpecifier.update(toAdd)
        # verify all the labels are in the pipeline
        if not labelSpecifier.issubset(pipeline.tasks.keys() | pipeline.labeled_subsets):
            difference = labelSpecifier.difference(pipeline.tasks.keys())
            raise ValueError(
                "Not all supplied labels (specified or named subsets) are in the pipeline "
                f"definition, extra labels: {difference}"
            )
        # copy needed so as to not modify while iterating
        pipeline_labels = set(pipeline.tasks.keys())
        # Remove the labels from the pipelineIR, and any contracts that
        # contain those labels (see docstring on _remove_contracts for why
        # this may cause issues)
        for label in pipeline_labels:
            if label not in labelSpecifier:
                pipeline.tasks.pop(label)
                pipeline._remove_contracts(label)

        # create a copy of the object to iterate over
        labeled_subsets = copy.copy(pipeline.labeled_subsets)
        # remove or edit any labeled subsets that no longer have a complete
        # set
        for label, labeled_subset in labeled_subsets.items():
            if extraTaskLabels := (labeled_subset.subset - pipeline.tasks.keys()):
                match subsetCtrl:
                    case PipelineSubsetCtrl.DROP:
                        pipeline.labeled_subsets.pop(label)
                    case PipelineSubsetCtrl.EDIT:
                        for extra in extraTaskLabels:
                            labeled_subset.subset.discard(extra)

        # remove any steps that correspond to removed subsets
        new_steps = []
        for step in pipeline.steps:
            if step.label not in pipeline.labeled_subsets:
                continue
            new_steps.append(step)
        pipeline.steps = new_steps

        return pipeline
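
    # Example (illustrative sketch; "pipeline_ir" stands for any loaded
    # PipelineIR and the label is hypothetical): keep only taskA, editing
    # rather than dropping subsets that also named other labels.
    #
    #     smaller = pipeline_ir.subset_from_labels({"taskA"}, PipelineSubsetCtrl.EDIT)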

    @classmethod
    def from_string(cls, pipeline_string: str) -> PipelineIR:
        """Create a `PipelineIR` object from a string formatted like a
        pipeline document.

        Parameters
        ----------
        pipeline_string : `str`
            A string that is formatted like a pipeline document.
        """
        loaded_yaml = yaml.load(pipeline_string, Loader=PipelineYamlLoader)
        return cls(loaded_yaml)
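
    # Example (illustrative sketch; the document is hypothetical): a minimal
    # pipeline document needs a description and at least one task.
    #
    #     pipeline_ir = PipelineIR.from_string("description: demo\ntasks:\n  taskA: mod.A\n")
    #     str(pipeline_ir)  # renders back to yaml via to_primitives()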

    @classmethod
    def from_uri(cls, uri: ResourcePathExpression) -> PipelineIR:
        """Create a `PipelineIR` object from the document specified by the
        input uri.

        Parameters
        ----------
        uri : convertible to `~lsst.resources.ResourcePath`
            Location of document to use in creating a `PipelineIR` object.

        Returns
        -------
        pipelineIR : `PipelineIR`
            The loaded pipeline.
        """
        loaded_uri = ResourcePath(uri)
        with loaded_uri.open("r") as buffer:
            loaded_yaml = yaml.load(buffer, Loader=PipelineYamlLoader)
        return cls(loaded_yaml)

    def write_to_uri(self, uri: ResourcePathExpression) -> None:
        """Serialize this `PipelineIR` object into a yaml formatted string and
        write the output to a file at the specified uri.

        Parameters
        ----------
        uri : convertible to `~lsst.resources.ResourcePath`
            Location of document to write a `PipelineIR` object.
        """
        with ResourcePath(uri).open("w") as buffer:
            yaml.dump(self.to_primitives(), buffer, sort_keys=False, Dumper=MultilineStringDumper)

    def to_primitives(self) -> dict[str, Any]:
        """Convert to a representation used in yaml serialization.

        Returns
        -------
        primitives : `dict`
            Dictionary that maps directly to the serialized YAML form.
        """
        accumulate = {"description": self.description}
        if self.instrument is not None:
            accumulate["instrument"] = self.instrument
        if self.parameters:
            accumulate["parameters"] = self.parameters.to_primitives()
        accumulate["tasks"] = {m: t.to_primitives() for m, t in self.tasks.items()}
        if len(self.contracts) > 0:
            # sort contracts in lexicographical order by the contract string,
            # in the absence of any other ordering principle
            contracts_list = [c.to_primitives() for c in self.contracts]
            contracts_list.sort(key=lambda x: x["contract"])
            accumulate["contracts"] = contracts_list
        if self.labeled_subsets:
            accumulate["subsets"] = {k: v.to_primitives() for k, v in self.labeled_subsets.items()}
        return accumulate

    def __str__(self) -> str:
        """Format the instance as it would appear in yaml representation."""
        return yaml.dump(self.to_primitives(), sort_keys=False, Dumper=MultilineStringDumper)

    def __repr__(self) -> str:
        """Format the instance as it would appear in yaml representation."""
        return str(self)

    def __eq__(self, other: object) -> bool:
        if not isinstance(other, PipelineIR):
            return False
        # special case contracts because it is a list, but order is not
        # important
        return (
            all(
                getattr(self, attr) == getattr(other, attr)
                for attr in ("tasks", "instrument", "labeled_subsets", "parameters")
            )
            and len(self.contracts) == len(other.contracts)
            and all(c in self.contracts for c in other.contracts)
        )