# This file is part of pipe_base.
#
# Developed for the LSST Data Management System.
# This product includes software developed by the LSST Project
# (http://www.lsst.org).
# See the COPYRIGHT file at the top-level directory of this distribution
# for details of code ownership.
#
# This program is free software: you can redistribute it and/or modify
# it under the terms of the GNU General Public License as published by
# the Free Software Foundation, either version 3 of the License, or
# (at your option) any later version.
#
# This program is distributed in the hope that it will be useful,
# but WITHOUT ANY WARRANTY; without even the implied warranty of
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
# GNU General Public License for more details.
#
# You should have received a copy of the GNU General Public License
# along with this program. If not, see <http://www.gnu.org/licenses/>.

__all__ = ("ConfigIR", "ContractError", "ContractIR", "InheritIR", "PipelineIR", "TaskIR")

import os
import warnings
from collections import Counter
from dataclasses import dataclass, field
from typing import Generator, List, Union

import yaml


class PipelineYamlLoader(yaml.SafeLoader):
    """A specialized version of yaml's SafeLoader that raises an exception
    if it finds multiple instances of the same key at a given scope inside a
    pipeline file.
    """
    def construct_mapping(self, node, deep=False):
        # Call super first so that it performs all of its other checks on
        # this node. Checking the uniqueness of keys first would save the
        # work super does in the case of a failure, but if the node were the
        # wrong node due to a parsing error, the resulting exception would be
        # difficult to understand.
        mapping = super().construct_mapping(node, deep)
        # Check if there are any duplicate keys
        all_keys = Counter(key_node.value for key_node, _ in node.value)
        duplicates = {k for k, i in all_keys.items() if i != 1}
        if duplicates:
            raise KeyError("Pipeline files must not have duplicated keys, "
                           f"{duplicates} appeared multiple times")
        return mapping

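# A minimal sketch of the duplicate-key check in action; the document below
# is hypothetical and not part of this module:
#
#     >>> bad_document = "tasks:\n  demo: pkg.TaskA\n  demo: pkg.TaskB\n"
#     >>> yaml.load(bad_document, Loader=PipelineYamlLoader)
#     Traceback (most recent call last):
#     ...
#     KeyError: ...
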

class ContractError(Exception):
    """An exception that is raised when a pipeline contract is not satisfied.
    """
    pass


@dataclass
class ContractIR:
    """Intermediate representation of contracts read from a pipeline yaml
    file.
    """
    contract: str
    """A string of python code representing one or more conditions on configs
    in a pipeline. Once evaluated, this code-as-string should be True if the
    configs are fine, and False otherwise.
    """
    msg: Union[str, None] = None
    """An optional message to be shown to the user if a contract fails.
    """

    def to_primitives(self) -> dict:
        """Convert to a representation used in yaml serialization
        """
        accumulate = {"contract": self.contract}
        if self.msg is not None:
            accumulate['msg'] = self.msg
        return accumulate

    def __eq__(self, other: "ContractIR"):
        if not isinstance(other, ContractIR):
            return False
        return self.contract == other.contract and self.msg == other.msg

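# For illustration, a contract as it might appear in a pipeline file; the
# task labels and config fields used here are hypothetical:
#
#     contracts:
#         - contract: "taskA.doWrite == taskB.doRead"
#           msg: "taskB can only read the catalog if taskA writes it"
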

@dataclass
class ConfigIR:
    """Intermediate representation of configurations read from a pipeline
    yaml file.
    """
    python: Union[str, None] = None
    """A string of python code that is used to modify a configuration. This
    can also be None if there are no modifications to do.
    """
    dataId: Union[dict, None] = None
    """A dataId that is used to constrain these config overrides to only
    quanta with matching dataIds. This field can be None if there is no
    constraint. This is currently an unimplemented feature, and is placed
    here for future use.
    """
    file: List[str] = field(default_factory=list)
    """A list of paths to files containing config overrides to be applied.
    This value may be an empty list if there are no overrides to apply.
    """
    rest: dict = field(default_factory=dict)
    """A dictionary of key value pairs, where the keys are strings
    corresponding to qualified fields on a config to override, and the values
    are strings representing the values to apply.
    """

    def to_primitives(self) -> dict:
        """Convert to a representation used in yaml serialization
        """
        accumulate = {}
        for name in ("python", "dataId", "file"):
            # if this attribute is truthy add it to the accumulation
            # dictionary
            if getattr(self, name):
                accumulate[name] = getattr(self, name)
        # Add the dictionary containing the rest of the config keys to the
        # accumulated dictionary
        accumulate.update(self.rest)
        return accumulate
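
    # Sketch of the primitive form (the field name below is hypothetical):
    #
    #     >>> ConfigIR(file=["overrides.py"], rest={"threshold": "5"}).to_primitives()
    #     {'file': ['overrides.py'], 'threshold': '5'}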

    def maybe_merge(self, other_config: "ConfigIR") -> Generator["ConfigIR", None, None]:
        """Merge another instance of a `ConfigIR` into this instance if
        possible. This function returns a generator that yields only self if
        the configs were merged, or self and other_config if they could not
        be merged.

        Parameters
        ----------
        other_config : `ConfigIR`
            An instance of `ConfigIR` to merge into this instance.

        Returns
        -------
        Generator : `ConfigIR`
            A generator containing either self, or self and other_config,
            depending on whether or not the configs could be merged.
        """
        # Verify that the config blocks can be merged
        if self.dataId != other_config.dataId or self.python or other_config.python or\
                self.file or other_config.file:
            yield from (self, other_config)
            return

        # Find the keys common to both blocks, and verify that none of them
        # have conflicting values
        key_intersection = self.rest.keys() & other_config.rest.keys()
        for key in key_intersection:
            if self.rest[key] != other_config.rest[key]:
                yield from (self, other_config)
                return
        self.rest.update(other_config.rest)

        # Combine the lists of override files to load
        self_file_set = set(self.file)
        other_file_set = set(other_config.file)
        self.file = list(self_file_set.union(other_file_set))

        yield self
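
    # A minimal sketch of the merge behavior (hypothetical field names):
    #
    #     >>> a = ConfigIR(rest={"threshold": "5"})
    #     >>> b = ConfigIR(rest={"doWrite": "False"})
    #     >>> merged = list(a.maybe_merge(b))    # compatible, so a absorbs b
    #     >>> len(merged), sorted(merged[0].rest)
    #     (1, ['doWrite', 'threshold'])
    #     >>> c = ConfigIR(python="config.x = 1")
    #     >>> len(list(a.maybe_merge(c)))        # python blocks never merge
    #     2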

    def __eq__(self, other: "ConfigIR"):
        if not isinstance(other, ConfigIR):
            return False
        return all(getattr(self, attr) == getattr(other, attr)
                   for attr in ("python", "dataId", "file", "rest"))


@dataclass
class TaskIR:
    """Intermediate representation of tasks read from a pipeline yaml file.
    """
    label: str
    """An identifier used to refer to a task.
    """
    klass: str
    """A string containing the fully qualified python class of the task to
    be run in a pipeline.
    """
    config: Union[List[ConfigIR], None] = None
    """A list of all the config overrides associated with this task, or
    `None` if there are no config overrides.
    """

    def to_primitives(self) -> dict:
        """Convert to a representation used in yaml serialization
        """
        accumulate = {'class': self.klass}
        if self.config:
            accumulate['config'] = [c.to_primitives() for c in self.config]
        return accumulate

    def add_or_update_config(self, other_config: ConfigIR):
        """Add a `ConfigIR` to this task if one is not present. If a
        `ConfigIR` is present and the dataId keys of both configs match,
        merge the configs; otherwise add a new entry to the config list. The
        exception to the above is that if either the last config or
        other_config has a python block, then other_config is always added,
        as python blocks can modify configs in ways that cannot be predicted.

        Parameters
        ----------
        other_config : `ConfigIR`
            A `ConfigIR` instance to add or merge into the config attribute
            of this task.
        """
        if not self.config:
            self.config = [other_config]
            return
        self.config.extend(self.config.pop().maybe_merge(other_config))
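
    # Sketch of the accumulation behavior (hypothetical label and class):
    #
    #     >>> t = TaskIR("demo", "some.package.DemoTask")
    #     >>> t.add_or_update_config(ConfigIR(rest={"a": "1"}))
    #     >>> t.add_or_update_config(ConfigIR(rest={"b": "2"}))
    #     >>> len(t.config)    # mergeable overrides collapse into one entry
    #     1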

    def __eq__(self, other: "TaskIR"):
        if not isinstance(other, TaskIR):
            return False
        return all(getattr(self, attr) == getattr(other, attr)
                   for attr in ("label", "klass", "config"))


@dataclass
class InheritIR:
    """An intermediate representation of inherited pipelines.
    """
    location: str
    """The location of the pipeline to inherit, specified as an absolute
    path. Environment variables may be used in the path, written with the
    name of the variable inside braces, e.g. ``${VAR}``; they are expanded
    with `os.path.expandvars`.
    """
    include: Union[List[str], None] = None
    """List of tasks that should be included when inheriting this pipeline.
    Either the include or exclude attributes may be specified, but not both.
    """
    exclude: Union[List[str], None] = None
    """List of tasks that should be excluded when inheriting this pipeline.
    Either the include or exclude attributes may be specified, but not both.
    """
    importContracts: bool = True
    """Boolean attribute that dictates whether or not contracts should be
    inherited with the pipeline.
    """

    def toPipelineIR(self) -> "PipelineIR":
        """Load the pipeline specified by this object's location and convert
        it into a `PipelineIR` object, applying any include/exclude filters.
        """
        if self.include and self.exclude:
            raise ValueError("Both an include and an exclude list can't be specified"
                             " when declaring a pipeline import")
        tmp_pipeline = PipelineIR.from_file(os.path.expandvars(self.location))
        if tmp_pipeline.instrument is not None:
            warnings.warn("Any instrument definitions in imported pipelines are ignored. "
                          "If an instrument is desired please define it in the top-most pipeline")

        new_tasks = {}
        for label, task in tmp_pipeline.tasks.items():
            if (self.include and label in self.include) or (self.exclude and label not in self.exclude)\
                    or (self.include is None and self.exclude is None):
                new_tasks[label] = task
        tmp_pipeline.tasks = new_tasks

        if not self.importContracts:
            tmp_pipeline.contracts = []

        return tmp_pipeline
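
    # For illustration, an inherits entry as it might appear in a pipeline
    # file; the path and task label below are hypothetical:
    #
    #     inherits:
    #         - location: ${SOME_PKG_DIR}/pipelines/base.yaml
    #           exclude: unwantedTask
    #           importContracts: false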

    def __eq__(self, other: "InheritIR"):
        if not isinstance(other, InheritIR):
            return False
        return all(getattr(self, attr) == getattr(other, attr)
                   for attr in ("location", "include", "exclude", "importContracts"))


class PipelineIR:
    """Intermediate representation of a pipeline definition

    Parameters
    ----------
    loaded_yaml : `dict`
        A dictionary which matches the structure that would be produced by a
        yaml reader which parses a pipeline definition document

    Raises
    ------
    ValueError
        - If a pipeline is declared without a description
        - If no tasks are declared in a pipeline, and no pipelines are to be
          inherited
        - If more than one instrument is specified
        - If more than one inherited pipeline shares a label
    """

    def __init__(self, loaded_yaml):
        # Check required fields are present
        if "description" not in loaded_yaml:
            raise ValueError("A pipeline must be declared with a description")
        if "tasks" not in loaded_yaml and "inherits" not in loaded_yaml:
            raise ValueError("A pipeline must be declared with one or more tasks")

        # Process pipeline description
        self.description = loaded_yaml.pop("description")

        # Process tasks
        self._read_tasks(loaded_yaml)

        # Process instrument keys
        inst = loaded_yaml.pop("instrument", None)
        if isinstance(inst, list):
            raise ValueError("Only one top level instrument can be defined in a pipeline")
        self.instrument = inst

        # Process any contracts
        self._read_contracts(loaded_yaml)

        # Process any inherited pipelines
        self._read_inherits(loaded_yaml)

    def _read_contracts(self, loaded_yaml):
        """Process the contracts portion of the loaded yaml document

        Parameters
        ----------
        loaded_yaml : `dict`
            A dictionary which matches the structure that would be produced
            by a yaml reader which parses a pipeline definition document
        """
        loaded_contracts = loaded_yaml.pop("contracts", [])
        if isinstance(loaded_contracts, str):
            loaded_contracts = [loaded_contracts]
        self.contracts = []
        for contract in loaded_contracts:
            if isinstance(contract, dict):
                self.contracts.append(ContractIR(**contract))
            elif isinstance(contract, str):
                self.contracts.append(ContractIR(contract=contract))

    def _read_inherits(self, loaded_yaml):
        """Process the inherits portion of the loaded yaml document

        Parameters
        ----------
        loaded_yaml : `dict`
            A dictionary which matches the structure that would be produced
            by a yaml reader which parses a pipeline definition document
        """
        def process_args(argument: Union[str, dict]) -> dict:
            if isinstance(argument, str):
                return {"location": argument}
            elif isinstance(argument, dict):
                if "exclude" in argument and isinstance(argument["exclude"], str):
                    argument["exclude"] = [argument["exclude"]]
                if "include" in argument and isinstance(argument["include"], str):
                    argument["include"] = [argument["include"]]
                return argument
        tmp_inherit = loaded_yaml.pop("inherits", None)
        if tmp_inherit is None:
            self.inherits = []
        elif isinstance(tmp_inherit, list):
            self.inherits = [InheritIR(**process_args(args)) for args in tmp_inherit]
        else:
            self.inherits = [InheritIR(**process_args(tmp_inherit))]

        # Integrate any imported pipelines
        accumulate_tasks = {}
        for other_pipeline in self.inherits:
            tmp_IR = other_pipeline.toPipelineIR()
            if accumulate_tasks.keys() & tmp_IR.tasks.keys():
                raise ValueError("Task labels in the imported pipelines must "
                                 "be unique")
            accumulate_tasks.update(tmp_IR.tasks)
            self.contracts.extend(tmp_IR.contracts)

        # Merge the dict of label:TaskIR objects, preserving any configs in
        # the imported pipeline if the labels point to the same class
        for label, task in self.tasks.items():
            if label not in accumulate_tasks:
                accumulate_tasks[label] = task
            elif accumulate_tasks[label].klass == task.klass:
                if task.config is not None:
                    for config in task.config:
                        accumulate_tasks[label].add_or_update_config(config)
            else:
                accumulate_tasks[label] = task
        self.tasks = accumulate_tasks

    def _read_tasks(self, loaded_yaml):
        """Process the tasks portion of the loaded yaml document

        Parameters
        ----------
        loaded_yaml : `dict`
            A dictionary which matches the structure that would be produced
            by a yaml reader which parses a pipeline definition document
        """
        self.tasks = {}
        tmp_tasks = loaded_yaml.pop("tasks", None)
        if tmp_tasks is None:
            tmp_tasks = {}

        for label, definition in tmp_tasks.items():
            if isinstance(definition, str):
                definition = {"class": definition}
            config = definition.get('config', None)
            if config is None:
                task_config_ir = None
            else:
                if isinstance(config, dict):
                    config = [config]
                task_config_ir = []
                for c in config:
                    file = c.pop("file", None)
                    if file is None:
                        file = []
                    elif not isinstance(file, list):
                        file = [file]
                    task_config_ir.append(ConfigIR(python=c.pop("python", None),
                                                   dataId=c.pop("dataId", None),
                                                   file=file,
                                                   rest=c))
            self.tasks[label] = TaskIR(label, definition["class"], task_config_ir)

    @classmethod
    def from_string(cls, pipeline_string: str):
        """Create a `PipelineIR` object from a string formatted like a
        pipeline document

        Parameters
        ----------
        pipeline_string : `str`
            A string that is formatted like a pipeline document
        """
        loaded_yaml = yaml.load(pipeline_string, Loader=PipelineYamlLoader)
        return cls(loaded_yaml)
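
    # A minimal sketch (the task class below is hypothetical):
    #
    #     >>> p = PipelineIR.from_string(
    #     ...     "description: demo\n"
    #     ...     "tasks:\n"
    #     ...     "    demoTask: some.package.DemoTask\n")
    #     >>> list(p.tasks)
    #     ['demoTask']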

    @classmethod
    def from_file(cls, filename: str):
        """Create a `PipelineIR` object from the document specified by the
        input path.

        Parameters
        ----------
        filename : `str`
            Location of document to use in creating a `PipelineIR` object.
        """
        with open(filename, 'r') as f:
            loaded_yaml = yaml.load(f, Loader=PipelineYamlLoader)
        return cls(loaded_yaml)

    def to_file(self, filename: str):
        """Serialize this `PipelineIR` object into a yaml formatted string
        and write the output to a file at the specified path.

        Parameters
        ----------
        filename : `str`
            Path of the file in which to write this `PipelineIR` object.
        """
        with open(filename, 'w') as f:
            yaml.dump(self.to_primitives(), f, sort_keys=False)

    def to_primitives(self):
        """Convert to a representation used in yaml serialization
        """
        accumulate = {"description": self.description}
        if self.instrument is not None:
            accumulate['instrument'] = self.instrument
        accumulate['tasks'] = {m: t.to_primitives() for m, t in self.tasks.items()}
        if len(self.contracts) > 0:
            accumulate['contracts'] = [c.to_primitives() for c in self.contracts]
        return accumulate

    def __str__(self) -> str:
        """Format the instance as it would appear in yaml representation
        """
        return yaml.dump(self.to_primitives(), sort_keys=False)

    def __repr__(self) -> str:
        """Format the instance as it would appear in yaml representation
        """
        return str(self)

    def __eq__(self, other: "PipelineIR"):
        if not isinstance(other, PipelineIR):
            return False
        return all(getattr(self, attr) == getattr(other, attr)
                   for attr in ("contracts", "tasks", "instrument"))