Coverage for python/lsst/pipe/base/pipelineIR.py : 17%

__all__ = ("ConfigIR", "ContractError", "ContractIR", "InheritIR", "PipelineIR", "TaskIR")

# This file is part of pipe_base.
#
# Developed for the LSST Data Management System.
# This product includes software developed by the LSST Project
# (http://www.lsst.org).
# See the COPYRIGHT file at the top-level directory of this distribution
# for details of code ownership.
#
# This program is free software: you can redistribute it and/or modify
# it under the terms of the GNU General Public License as published by
# the Free Software Foundation, either version 3 of the License, or
# (at your option) any later version.
#
# This program is distributed in the hope that it will be useful,
# but WITHOUT ANY WARRANTY; without even the implied warranty of
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
# GNU General Public License for more details.
#
# You should have received a copy of the GNU General Public License
# along with this program. If not, see <http://www.gnu.org/licenses/>.

from collections import Counter
from dataclasses import dataclass, field
from typing import List, Union, Generator

import os
import yaml
import warnings


class PipelineYamlLoader(yaml.SafeLoader):
    """This is a specialized version of yaml's SafeLoader. It checks and raises
    an exception if it finds that there are multiple instances of the same key
    found inside a pipeline file at a given scope.
    """
    def construct_mapping(self, node, deep=False):
        # Do the call to super first so that it can do all the other forms of
        # checking on this node. Checking the uniqueness of keys first would
        # save the work that super does in the case of a failure, but if the
        # node were the wrong kind of node due to a parsing error, the
        # resulting exception would be difficult to understand.
        mapping = super().construct_mapping(node, deep)
        # Check if there are any duplicate keys
        all_keys = Counter(key_node.value for key_node, _ in node.value)
        duplicates = {k for k, i in all_keys.items() if i != 1}
        if duplicates:
            raise KeyError("Pipeline files must not have duplicated keys, "
                           f"{duplicates} appeared multiple times")
        return mapping
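
# Example (illustrative sketch, not part of the original module): feeding
# PipelineYamlLoader a document with a duplicated key at one scope raises
# KeyError, where a plain yaml.SafeLoader would silently keep the last value.
#
#     >>> import yaml
#     >>> yaml.load("a: 1\nb: 2\n", Loader=PipelineYamlLoader)
#     {'a': 1, 'b': 2}
#     >>> yaml.load("a: 1\na: 2\n", Loader=PipelineYamlLoader)
#     Traceback (most recent call last):
#         ...
#     KeyError: "Pipeline files must not have duplicated keys, {'a'} appeared multiple times"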


class ContractError(Exception):
    """An exception that is raised when a pipeline contract is not satisfied
    """
    pass


@dataclass
class ContractIR:
    """Intermediate representation of contracts read from a pipeline yaml file.
    """
    contract: str
    """A string of python code representing one or more conditions on configs
    in a pipeline. Once evaluated, this code-as-string should be `True` if the
    configs are fine, and `False` otherwise.
    """
    msg: Union[str, None] = None
    """An optional message to be shown to the user if a contract fails
    """

    def to_primitives(self) -> dict:
        """Convert to a representation used in yaml serialization
        """
        accumulate = {"contract": self.contract}
        if self.msg is not None:
            accumulate['msg'] = self.msg
        return accumulate

    def __eq__(self, other: "ContractIR"):
        if not isinstance(other, ContractIR):
            return False
        elif self.contract == other.contract and self.msg == other.msg:
            return True
        else:
            return False
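
# Example (illustrative sketch; the contract expression and message are made
# up): a contract serializes to a plain dict, with the optional message only
# present when it was set.
#
#     >>> c = ContractIR(contract="taskA.doFoo == taskB.doFoo",
#     ...                msg="doFoo settings must agree")
#     >>> c.to_primitives()
#     {'contract': 'taskA.doFoo == taskB.doFoo', 'msg': 'doFoo settings must agree'}
#     >>> ContractIR(contract="taskA.doFoo").to_primitives()
#     {'contract': 'taskA.doFoo'}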


@dataclass
class ConfigIR:
    """Intermediate representation of configurations read from a pipeline yaml
    file.
    """
    python: Union[str, None] = None
    """A string of python code that is used to modify a configuration. This can
    also be None if there are no modifications to do.
    """
    dataId: Union[dict, None] = None
    """A dataId that is used to constrain these config overrides to only quanta
    with matching dataIds. This field can be None if there is no constraint.
    This is currently an unimplemented feature, and is placed here for future
    use.
    """
    file: List[str] = field(default_factory=list)
    """A list of paths which point to files containing config overrides to be
    applied. This value may be an empty list if there are no overrides to
    apply.
    """
    rest: dict = field(default_factory=dict)
    """This is a dictionary of key value pairs, where the keys are strings
    corresponding to qualified fields on a config to override, and the values
    are strings representing the values to apply.
    """

    def to_primitives(self) -> dict:
        """Convert to a representation used in yaml serialization
        """
        accumulate = {}
        for name in ("python", "dataId", "file"):
            # if this attribute is truthy add it to the accumulation
            # dictionary
            if getattr(self, name):
                accumulate[name] = getattr(self, name)
        # Add the dictionary containing the rest of the config keys to the
        # accumulated dictionary
        accumulate.update(self.rest)
        return accumulate

    def maybe_merge(self, other_config: "ConfigIR") -> Generator["ConfigIR", None, None]:
        """Merge another instance of a `ConfigIR` into this instance if
        possible. This function returns a generator that yields either self,
        if the configs were merged, or self and other_config, if they could
        not be merged.

        Parameters
        ----------
        other_config : `ConfigIR`
            An instance of `ConfigIR` to merge into this instance.

        Returns
        -------
        Generator : `ConfigIR`
            A generator yielding either self, or self and other_config,
            depending on whether the configs could be merged.
        """
        # Verify that the config blocks can be merged
        if self.dataId != other_config.dataId or self.python or other_config.python or\
                self.file or other_config.file:
            yield from (self, other_config)
            return

        # Collect the keys shared by both configs, and verify no shared key
        # has different values
        key_union = self.rest.keys() & other_config.rest.keys()
        for key in key_union:
            if self.rest[key] != other_config.rest[key]:
                yield from (self, other_config)
                return
        self.rest.update(other_config.rest)

        # Combine the lists of override files to load
        self_file_set = set(self.file)
        other_file_set = set(other_config.file)
        self.file = list(self_file_set.union(other_file_set))

        yield self
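
    # Example (illustrative sketch with made-up field names): two config
    # blocks with no python block, no files, the same (absent) dataId, and no
    # conflicting values collapse into one; a conflicting value keeps both.
    #
    #     >>> a = ConfigIR(rest={"doFoo": "True"})
    #     >>> merged = list(a.maybe_merge(ConfigIR(rest={"nBins": "5"})))
    #     >>> len(merged), merged[0].rest
    #     (1, {'doFoo': 'True', 'nBins': '5'})
    #     >>> conflicted = list(a.maybe_merge(ConfigIR(rest={"doFoo": "False"})))
    #     >>> len(conflicted)
    #     2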

    def __eq__(self, other: "ConfigIR"):
        if not isinstance(other, ConfigIR):
            return False
        elif all(getattr(self, attr) == getattr(other, attr) for attr in
                 ("python", "dataId", "file", "rest")):
            return True
        else:
            return False


@dataclass
class TaskIR:
    """Intermediate representation of tasks read from a pipeline yaml file.
    """
    label: str
    """An identifier used to refer to a task.
    """
    klass: str
    """A string containing a fully qualified python class to be run in a
    pipeline.
    """
    config: Union[List[ConfigIR], None] = None
    """List of all config overrides associated with this task; may be
    `None` if there are no config overrides.
    """

    def to_primitives(self) -> dict:
        """Convert to a representation used in yaml serialization
        """
        accumulate = {'class': self.klass}
        if self.config:
            accumulate['config'] = [c.to_primitives() for c in self.config]
        return accumulate

    def add_or_update_config(self, other_config: ConfigIR):
        """Add a `ConfigIR` to this task if one is not present. If a
        `ConfigIR` is present and the dataId keys of both configs match,
        the configs are merged; otherwise a new entry is added to the config
        list. The exception to the above is that if either the last config or
        other_config has a python block, then other_config is always added, as
        python blocks can modify configs in ways that cannot be predicted.

        Parameters
        ----------
        other_config : `ConfigIR`
            A `ConfigIR` instance to add or merge into the config attribute of
            this task.
        """
        if not self.config:
            self.config = [other_config]
            return
        self.config.extend(self.config.pop().maybe_merge(other_config))
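
    # Example (illustrative sketch; the class path is hypothetical):
    # compatible overrides fold into the existing config entry, so repeated
    # calls do not grow the config list unnecessarily.
    #
    #     >>> t = TaskIR("demo", "some.package.DemoTask")
    #     >>> t.add_or_update_config(ConfigIR(rest={"doFoo": "True"}))
    #     >>> t.add_or_update_config(ConfigIR(rest={"nBins": "5"}))
    #     >>> len(t.config), t.config[0].rest
    #     (1, {'doFoo': 'True', 'nBins': '5'})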

    def __eq__(self, other: "TaskIR"):
        if not isinstance(other, TaskIR):
            return False
        elif all(getattr(self, attr) == getattr(other, attr) for attr in
                 ("label", "klass", "config")):
            return True
        else:
            return False


@dataclass
class InheritIR:
    """An intermediate representation of inherited pipelines
    """
    location: str
    """This is the location of the pipeline to inherit. The path should be
    specified as an absolute path. Environment variables may be used in the
    path and should be specified as a python string template, with the name of
    the environment variable inside braces.
    """
    include: Union[List[str], None] = None
    """List of tasks that should be included when inheriting this pipeline.
    Either the include or exclude attributes may be specified, but not both.
    """
    exclude: Union[List[str], None] = None
    """List of tasks that should be excluded when inheriting this pipeline.
    Either the include or exclude attributes may be specified, but not both.
    """
    importContracts: bool = True
    """Boolean attribute to dictate if contracts should be inherited with the
    pipeline or not.
    """

    def toPipelineIR(self) -> "PipelineIR":
        """Load the inherited pipeline and convert it to a `PipelineIR`
        object, applying any include/exclude filtering.
        """
        if self.include and self.exclude:
            raise ValueError("Both an include and an exclude list can't be specified"
                             " when declaring a pipeline import")
        tmp_pipeline = PipelineIR.from_file(os.path.expandvars(self.location))
        if tmp_pipeline.instrument is not None:
            warnings.warn("Any instrument definitions in imported pipelines are ignored. "
                          "If an instrument is desired please define it in the top most pipeline")

        new_tasks = {}
        for label, task in tmp_pipeline.tasks.items():
            if (self.include and label in self.include) or (self.exclude and label not in self.exclude)\
                    or (self.include is None and self.exclude is None):
                new_tasks[label] = task
        tmp_pipeline.tasks = new_tasks

        if not self.importContracts:
            tmp_pipeline.contracts = []

        return tmp_pipeline
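
    # Example (illustrative sketch; the location is hypothetical): the
    # include/exclude consistency check fires before the file is ever read,
    # and environment variables in the location are expanded with
    # os.path.expandvars.
    #
    #     >>> bad = InheritIR(location="${SOME_DIR}/pipeline.yaml",
    #     ...                 include=["taskA"], exclude=["taskB"])
    #     >>> bad.toPipelineIR()
    #     Traceback (most recent call last):
    #         ...
    #     ValueError: Both an include and an exclude list can't be specified when declaring a pipeline import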

    def __eq__(self, other: "InheritIR"):
        if not isinstance(other, InheritIR):
            return False
        elif all(getattr(self, attr) == getattr(other, attr) for attr in
                 ("location", "include", "exclude", "importContracts")):
            return True
        else:
            return False


class PipelineIR:
    """Intermediate representation of a pipeline definition

    Parameters
    ----------
    loaded_yaml : `dict`
        A dictionary which matches the structure that would be produced by a
        yaml reader which parses a pipeline definition document

    Raises
    ------
    ValueError
        - If a pipeline is declared without a description
        - If no tasks are declared in a pipeline, and no pipelines are to be
          inherited
        - If more than one instrument is specified
        - If more than one inherited pipeline share a label
    """
    def __init__(self, loaded_yaml):
        # Check required fields are present
        if "description" not in loaded_yaml:
            raise ValueError("A pipeline must be declared with a description")
        if "tasks" not in loaded_yaml and "inherits" not in loaded_yaml:
            raise ValueError("A pipeline must be declared with one or more tasks")

        # Process pipeline description
        self.description = loaded_yaml.pop("description")

        # Process tasks
        self._read_tasks(loaded_yaml)

        # Process instrument keys
        inst = loaded_yaml.pop("instrument", None)
        if isinstance(inst, list):
            raise ValueError("Only one top level instrument can be defined in a pipeline")
        self.instrument = inst

        # Process any contracts
        self._read_contracts(loaded_yaml)

        # Process any inherited pipelines
        self._read_inherits(loaded_yaml)

    def _read_contracts(self, loaded_yaml):
        """Process the contracts portion of the loaded yaml document

        Parameters
        ----------
        loaded_yaml : `dict`
            A dictionary which matches the structure that would be produced by
            a yaml reader which parses a pipeline definition document
        """
        loaded_contracts = loaded_yaml.pop("contracts", [])
        if isinstance(loaded_contracts, str):
            loaded_contracts = [loaded_contracts]
        self.contracts = []
        for contract in loaded_contracts:
            if isinstance(contract, dict):
                self.contracts.append(ContractIR(**contract))
            if isinstance(contract, str):
                self.contracts.append(ContractIR(contract=contract))
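
    # Example (illustrative sketch with made-up expressions): the contracts
    # section accepts a single string, a list of strings, or dicts with an
    # optional message, e.g.
    #
    #     contracts:
    #         - "taskA.doFoo == taskB.doFoo"
    #         - contract: "taskA.nBins == taskB.nBins"
    #           msg: "binning must agree between taskA and taskB"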

    def _read_inherits(self, loaded_yaml):
        """Process the inherits portion of the loaded yaml document

        Parameters
        ----------
        loaded_yaml : `dict`
            A dictionary which matches the structure that would be produced by
            a yaml reader which parses a pipeline definition document
        """
        def process_args(argument: Union[str, dict]) -> dict:
            if isinstance(argument, str):
                return {"location": argument}
            elif isinstance(argument, dict):
                if "exclude" in argument and isinstance(argument["exclude"], str):
                    argument["exclude"] = [argument["exclude"]]
                if "include" in argument and isinstance(argument["include"], str):
                    argument["include"] = [argument["include"]]
                return argument
        tmp_inherit = loaded_yaml.pop("inherits", None)
        if tmp_inherit is None:
            self.inherits = []
        elif isinstance(tmp_inherit, list):
            self.inherits = [InheritIR(**process_args(args)) for args in tmp_inherit]
        else:
            self.inherits = [InheritIR(**process_args(tmp_inherit))]

        # integrate any imported pipelines
        accumulate_tasks = {}
        for other_pipeline in self.inherits:
            tmp_IR = other_pipeline.toPipelineIR()
            if accumulate_tasks.keys() & tmp_IR.tasks.keys():
                raise ValueError("Task labels in the imported pipelines must "
                                 "be unique")
            accumulate_tasks.update(tmp_IR.tasks)
            self.contracts.extend(tmp_IR.contracts)

        # merge the dict of label:TaskIR objects, preserving any configs in
        # the imported pipeline if the labels point to the same class
        for label, task in self.tasks.items():
            if label not in accumulate_tasks:
                accumulate_tasks[label] = task
            elif accumulate_tasks[label].klass == task.klass:
                if task.config is not None:
                    for config in task.config:
                        accumulate_tasks[label].add_or_update_config(config)
            else:
                accumulate_tasks[label] = task
        self.tasks = accumulate_tasks
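
    # Example (illustrative sketch; locations are hypothetical): the inherits
    # section accepts a bare location string, or a mapping where a scalar
    # include/exclude entry is promoted to a one-element list, e.g.
    #
    #     inherits: ${SOME_DIR}/base_pipeline.yaml
    #
    # or
    #
    #     inherits:
    #         - location: ${SOME_DIR}/base_pipeline.yaml
    #           exclude: taskB
    #           importContracts: false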

    def _read_tasks(self, loaded_yaml):
        """Process the tasks portion of the loaded yaml document

        Parameters
        ----------
        loaded_yaml : `dict`
            A dictionary which matches the structure that would be produced by
            a yaml reader which parses a pipeline definition document
        """
        self.tasks = {}
        tmp_tasks = loaded_yaml.pop("tasks", None)
        if tmp_tasks is None:
            tmp_tasks = {}

        for label, definition in tmp_tasks.items():
            if isinstance(definition, str):
                definition = {"class": definition}
            config = definition.get('config', None)
            if config is None:
                task_config_ir = None
            else:
                if isinstance(config, dict):
                    config = [config]
                task_config_ir = []
                for c in config:
                    file = c.pop("file", None)
                    if file is None:
                        file = []
                    elif not isinstance(file, list):
                        file = [file]
                    task_config_ir.append(ConfigIR(python=c.pop("python", None),
                                                   dataId=c.pop("dataId", None),
                                                   file=file,
                                                   rest=c))
            self.tasks[label] = TaskIR(label, definition["class"], task_config_ir)
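
    # Example (illustrative sketch; labels, class paths, and override names
    # are hypothetical): a task definition may be a bare class name or a
    # mapping with a config block; a scalar file entry is promoted to a
    # one-element list, e.g.
    #
    #     tasks:
    #         taskA: some.package.TaskA
    #         taskB:
    #             class: some.package.TaskB
    #             config:
    #                 file: ${SOME_DIR}/overrides.py
    #                 doFoo: True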

    @classmethod
    def from_string(cls, pipeline_string: str):
        """Create a `PipelineIR` object from a string formatted like a pipeline
        document

        Parameters
        ----------
        pipeline_string : `str`
            A string formatted like a pipeline document
        """
        loaded_yaml = yaml.load(pipeline_string, Loader=PipelineYamlLoader)
        return cls(loaded_yaml)
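
    # Example (illustrative sketch; the task class is hypothetical): building
    # a PipelineIR directly from a document string.
    #
    #     >>> pipeline = PipelineIR.from_string(
    #     ...     "description: A demo pipeline\n"
    #     ...     "tasks:\n"
    #     ...     "  demo: some.package.DemoTask\n")
    #     >>> pipeline.tasks["demo"].klass
    #     'some.package.DemoTask'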

    @classmethod
    def from_file(cls, filename: str):
        """Create a `PipelineIR` object from the document specified by the
        input path.

        Parameters
        ----------
        filename : `str`
            Location of document to use in creating a `PipelineIR` object.
        """
        with open(filename, 'r') as f:
            loaded_yaml = yaml.load(f, Loader=PipelineYamlLoader)
        return cls(loaded_yaml)

    def to_file(self, filename: str):
        """Serialize this `PipelineIR` object into a yaml formatted string and
        write the output to a file at the specified path.

        Parameters
        ----------
        filename : `str`
            Location of document to write a `PipelineIR` object.
        """
        with open(filename, 'w') as f:
            yaml.dump(self.to_primitives(), f, sort_keys=False)

    def to_primitives(self):
        """Convert to a representation used in yaml serialization
        """
        accumulate = {"description": self.description}
        if self.instrument is not None:
            accumulate['instrument'] = self.instrument
        accumulate['tasks'] = {m: t.to_primitives() for m, t in self.tasks.items()}
        if len(self.contracts) > 0:
            accumulate['contracts'] = [c.to_primitives() for c in self.contracts]
        return accumulate

    def __str__(self) -> str:
        """Instance formatting as how it would look in yaml representation
        """
        return yaml.dump(self.to_primitives(), sort_keys=False)

    def __repr__(self) -> str:
        """Instance formatting as how it would look in yaml representation
        """
        return str(self)

    def __eq__(self, other: "PipelineIR"):
        if not isinstance(other, PipelineIR):
            return False
        elif all(getattr(self, attr) == getattr(other, attr) for attr in
                 ("contracts", "tasks", "instrument")):
            return True
        else:
            return False