lsst.pipe.base  20.0.0-21-gb65c2a3+a8acb598c9
pipelineIR.py
__all__ = ("ConfigIR", "ContractError", "ContractIR", "InheritIR", "PipelineIR", "TaskIR")
# This file is part of pipe_base.
#
# Developed for the LSST Data Management System.
# This product includes software developed by the LSST Project
# (http://www.lsst.org).
# See the COPYRIGHT file at the top-level directory of this distribution
# for details of code ownership.
#
# This program is free software: you can redistribute it and/or modify
# it under the terms of the GNU General Public License as published by
# the Free Software Foundation, either version 3 of the License, or
# (at your option) any later version.
#
# This program is distributed in the hope that it will be useful,
# but WITHOUT ANY WARRANTY; without even the implied warranty of
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
# GNU General Public License for more details.
#
# You should have received a copy of the GNU General Public License
# along with this program. If not, see <http://www.gnu.org/licenses/>.

from collections import Counter
from dataclasses import dataclass, field
from typing import List, Union, Generator

import os
import yaml
import warnings


class PipelineYamlLoader(yaml.SafeLoader):
    """This is a specialized version of yaml's SafeLoader. It checks and raises
    an exception if it finds that there are multiple instances of the same key
    found inside a pipeline file at a given scope.
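
    Examples
    --------
    A minimal sketch of the duplicate-key check; the document shown is
    hypothetical::

        import yaml

        document = '''
        tasks:
          taskA: lsst.example.TaskA
          taskA: lsst.example.TaskB
        '''
        # Raises KeyError reporting that 'taskA' appeared multiple times
        yaml.load(document, Loader=PipelineYamlLoader)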
36  """
37  def construct_mapping(self, node, deep=False):
38  # do the call to super first so that it can do all the other forms of
39  # checking on this node. If you check the uniqueness of keys first
40  # it would save the work that super does in the case of a failure, but
41  # it might fail in the case that the node was the incorrect node due
42  # to a parsing error, and the resulting exception would be difficult to
43  # understand.
44  mapping = super().construct_mapping(node, deep)
45  # Check if there are any duplicate keys
46  all_keys = Counter(key_node.value for key_node, _ in node.value)
47  duplicates = {k for k, i in all_keys.items() if i != 1}
48  if duplicates:
49  raise KeyError("Pipeline files must not have duplicated keys, "
50  f"{duplicates} appeared multiple times")
51  return mapping


class ContractError(Exception):
    """An exception that is raised when a pipeline contract is not satisfied.
    """
    pass


@dataclass
class ContractIR:
    """Intermediate representation of contracts read from a pipeline yaml file.
63  """
64  contract: str
65  """A string of python code representing one or more conditions on configs
66  in a pipeline. This code-as-string should, once evaluated, should be True
67  if the configs are fine, and False otherwise.
68  """
69  msg: Union[str, None] = None
70  """An optional message to be shown to the user if a contract fails
71  """
72 
    def to_primitives(self) -> dict:
        """Convert to a representation used in yaml serialization
        """
        accumulate = {"contract": self.contract}
        if self.msg is not None:
            accumulate['msg'] = self.msg
        return accumulate

    def __eq__(self, other: "ContractIR"):
        if not isinstance(other, ContractIR):
            return False
        elif self.contract == other.contract and self.msg == other.msg:
            return True
        else:
            return False


@dataclass
class ConfigIR:
    """Intermediate representation of configurations read from a pipeline yaml
    file.
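
    Examples
    --------
    A hypothetical config block from a pipeline document and the fields it
    would populate; the file path and field names are assumptions::

        # config:
        #   python: config.doWrite = False
        #   file: $EXAMPLE_DIR/configs/override.py
        #   threshold: 5.0
        ir = ConfigIR(python="config.doWrite = False",
                      file=["$EXAMPLE_DIR/configs/override.py"],
                      rest={"threshold": "5.0"})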
94  """
95  python: Union[str, None] = None
96  """A string of python code that is used to modify a configuration. This can
97  also be None if there are no modifications to do.
98  """
99  dataId: Union[dict, None] = None
100  """A dataId that is used to constrain these config overrides to only quanta
101  with matching dataIds. This field can be None if there is no constraint.
102  This is currently an unimplemented feature, and is placed here for future
103  use.
104  """
105  file: List[str] = field(default_factory=list)
106  """A list of paths which points to a file containing config overrides to be
107  applied. This value may be an empty list if there are no overrides to
108  apply.
109  """
110  rest: dict = field(default_factory=dict)
111  """This is a dictionary of key value pairs, where the keys are strings
112  corresponding to qualified fields on a config to override, and the values
113  are strings representing the values to apply.
114  """
115 
    def to_primitives(self) -> dict:
        """Convert to a representation used in yaml serialization
        """
        accumulate = {}
        for name in ("python", "dataId", "file"):
            # if this attribute is truthy add it to the accumulation
            # dictionary
            if getattr(self, name):
                accumulate[name] = getattr(self, name)
        # Add the dictionary containing the rest of the config keys to the
        # accumulated dictionary
        accumulate.update(self.rest)
        return accumulate

    def maybe_merge(self, other_config: "ConfigIR") -> Generator["ConfigIR", None, None]:
        """Merges another instance of a `ConfigIR` into this instance if
        possible. This function returns a generator that yields self alone
        if the configs were merged, or self and other_config if they could
        not be merged.

        Parameters
        ----------
        other_config : `ConfigIR`
            An instance of `ConfigIR` to merge into this instance.

        Returns
        -------
        Generator : `ConfigIR`
            A generator yielding either self alone, if the configs could be
            merged, or self and other_config if they could not.
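
        Examples
        --------
        A minimal sketch; the field names are assumptions::

            a = ConfigIR(rest={"threshold": "5.0"})
            b = ConfigIR(rest={"doWrite": "False"})
            merged = list(a.maybe_merge(b))
            # merged == [a]; a.rest now holds both keys

            c = ConfigIR(python="config.doWrite = False")
            unmerged = list(a.maybe_merge(c))
            # unmerged == [a, c]; a python block prevents merging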
146  """
147  # Verify that the config blocks can be merged
148  if self.dataId != other_config.dataId or self.python or other_config.python or\
149  self.file or other_config.file:
150  yield from (self, other_config)
151  return
152 
153  # create a set of all keys, and verify two keys do not have different
154  # values
155  key_union = self.rest.keys() & other_config.rest.keys()
156  for key in key_union:
157  if self.rest[key] != other_config.rest[key]:
158  yield from (self, other_config)
159  return
160  self.rest.update(other_config.rest)
161 
162  # Combine the lists of override files to load
163  self_file_set = set(self.file)
164  other_file_set = set(other_config.file)
165  self.file = list(self_file_set.union(other_file_set))
166 
167  yield self
168 
    def __eq__(self, other: "ConfigIR"):
        if not isinstance(other, ConfigIR):
            return False
        elif all(getattr(self, attr) == getattr(other, attr) for attr in
                 ("python", "dataId", "file", "rest")):
            return True
        else:
            return False


@dataclass
class TaskIR:
    """Intermediate representation of tasks read from a pipeline yaml file.
    """
    label: str
    """An identifier used to refer to a task.
    """
    klass: str
    """A string containing a fully qualified python class to be run in a
    pipeline.
    """
    config: Union[List[ConfigIR], None] = None
    """List of all config overrides associated with this task; may be `None`
    if there are no config overrides.
    """

    def to_primitives(self) -> dict:
        """Convert to a representation used in yaml serialization
        """
        accumulate = {'class': self.klass}
        if self.config:
            accumulate['config'] = [c.to_primitives() for c in self.config]
        return accumulate

    def add_or_update_config(self, other_config: ConfigIR):
        """Adds a `ConfigIR` to this task if one is not present. Merges configs
        if there is a `ConfigIR` present and the dataId keys of both configs
        match, otherwise adds a new entry to the config list. The exception to
        the above is that if either the last config or other_config has a
        python block, then other_config is always added, as python blocks can
        modify configs in ways that cannot be predicted.

        Parameters
        ----------
        other_config : `ConfigIR`
            A `ConfigIR` instance to add or merge into the config attribute of
            this task.
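
        Examples
        --------
        A minimal sketch; the label, class name, and field names are
        assumptions::

            task = TaskIR(label="isr", klass="lsst.example.IsrTask")
            task.add_or_update_config(ConfigIR(rest={"doWrite": "False"}))
            task.add_or_update_config(ConfigIR(rest={"threshold": "5.0"}))
            # Both overrides merge into a single ConfigIR:
            # task.config == [ConfigIR(rest={"doWrite": "False",
            #                                "threshold": "5.0"})]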
216  """
217  if not self.config:
218  self.config = [other_config]
219  return
220  self.config.extend(self.config.pop().maybe_merge(other_config))
221 
222  def __eq__(self, other: "TaskIR"):
223  if not isinstance(other, TaskIR):
224  return False
225  elif all(getattr(self, attr) == getattr(other, attr) for attr in
226  ("label", "klass", "config")):
227  return True
228  else:
229  return False
230 
231 
@dataclass
class InheritIR:
    """An intermediate representation of inherited pipelines
235  """
236  location: str
237  """This is the location of the pipeline to inherit. The path should be
238  specified as an absolute path. Environment variables may be used in the
239  path and should be specified as a python string template, with the name of
240  the environment variable inside braces.
241  """
242  include: Union[List[str], None] = None
243  """List of tasks that should be included when inheriting this pipeline.
244  Either the include or exclude attributes may be specified, but not both.
245  """
246  exclude: Union[List[str], None] = None
247  """List of tasks that should be excluded when inheriting this pipeline.
248  Either the include or exclude attributes may be specified, but not both.
249  """
250  importContracts: bool = True
251  """Boolean attribute to dictate if contracts should be inherited with the
252  pipeline or not.
253  """
254 
    def toPipelineIR(self) -> "PipelineIR":
        """Load the pipeline at `location`, apply any include or exclude
        filtering, and return the resulting `PipelineIR` object.
        """
        if self.include and self.exclude:
            raise ValueError("Both an include and an exclude list cannot be specified"
                             " when declaring a pipeline import")
        tmp_pipeline = PipelineIR.from_file(os.path.expandvars(self.location))
        if tmp_pipeline.instrument is not None:
            warnings.warn("Any instrument definitions in imported pipelines are ignored. "
                          "If an instrument is desired please define it in the topmost pipeline")

        new_tasks = {}
        for label, task in tmp_pipeline.tasks.items():
            if (self.include and label in self.include) or (self.exclude and label not in self.exclude)\
                    or (self.include is None and self.exclude is None):
                new_tasks[label] = task
        tmp_pipeline.tasks = new_tasks

        if not self.importContracts:
            tmp_pipeline.contracts = []

        return tmp_pipeline

    def __eq__(self, other: "InheritIR"):
        if not isinstance(other, InheritIR):
            return False
        elif all(getattr(self, attr) == getattr(other, attr) for attr in
                 ("location", "include", "exclude", "importContracts")):
            return True
        else:
            return False


289  """Intermediate representation of a pipeline definition
290 
291  Parameters
292  ----------
293  loaded_yaml : `dict`
294  A dictionary which matches the structure that would be produced by a
295  yaml reader which parses a pipeline definition document
296 
297  Raises
298  ------
299  ValueError :
300  - If a pipeline is declared without a description
301  - If no tasks are declared in a pipeline, and no pipelines are to be
302  inherited
303  - If more than one instrument is specified
304  - If more than one inherited pipeline share a label
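
    Examples
    --------
    A minimal sketch of the document structure this class expects; all labels,
    classes, and values are hypothetical::

        description: A hypothetical pipeline
        tasks:
          isr:
            class: lsst.example.IsrTask
            config:
              doWrite: false
        contracts:
          - isr.doWrite == False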
305  """
306  def __init__(self, loaded_yaml):
307  # Check required fields are present
308  if "description" not in loaded_yaml:
309  raise ValueError("A pipeline must be declared with a description")
310  if "tasks" not in loaded_yaml and "inherits" not in loaded_yaml:
311  raise ValueError("A pipeline must be declared with one or more tasks")
312 
313  # Process pipeline description
314  self.description = loaded_yaml.pop("description")
315 
316  # Process tasks
317  self._read_tasks(loaded_yaml)
318 
319  # Process instrument keys
320  inst = loaded_yaml.pop("instrument", None)
321  if isinstance(inst, list):
322  raise ValueError("Only one top level instrument can be defined in a pipeline")
323  self.instrument = inst
324 
325  # Process any contracts
326  self._read_contracts(loaded_yaml)
327 
328  # Process any inherited pipelines
329  self._read_inherits(loaded_yaml)
330 
    def _read_contracts(self, loaded_yaml):
        """Process the contracts portion of the loaded yaml document

        Parameters
        ----------
        loaded_yaml : `dict`
            A dictionary which matches the structure that would be produced by
            a yaml reader which parses a pipeline definition document
        """
        loaded_contracts = loaded_yaml.pop("contracts", [])
        if isinstance(loaded_contracts, str):
            loaded_contracts = [loaded_contracts]
        self.contracts = []
        for contract in loaded_contracts:
            if isinstance(contract, dict):
                self.contracts.append(ContractIR(**contract))
            if isinstance(contract, str):
                self.contracts.append(ContractIR(contract=contract))

    def _read_inherits(self, loaded_yaml):
        """Process the inherits portion of the loaded yaml document

        Parameters
        ----------
        loaded_yaml : `dict`
            A dictionary which matches the structure that would be produced by
            a yaml reader which parses a pipeline definition document
        """
        def process_args(argument: Union[str, dict]) -> dict:
            if isinstance(argument, str):
                return {"location": argument}
            elif isinstance(argument, dict):
                if "exclude" in argument and isinstance(argument["exclude"], str):
                    argument["exclude"] = [argument["exclude"]]
                if "include" in argument and isinstance(argument["include"], str):
                    argument["include"] = [argument["include"]]
                return argument
        tmp_inherit = loaded_yaml.pop("inherits", None)
        if tmp_inherit is None:
            self.inherits = []
        elif isinstance(tmp_inherit, list):
            self.inherits = [InheritIR(**process_args(args)) for args in tmp_inherit]
        else:
            self.inherits = [InheritIR(**process_args(tmp_inherit))]

        # integrate any imported pipelines
        accumulate_tasks = {}
        for other_pipeline in self.inherits:
            tmp_IR = other_pipeline.toPipelineIR()
            if accumulate_tasks.keys() & tmp_IR.tasks.keys():
                raise ValueError("Task labels in the imported pipelines must "
                                 "be unique")
            accumulate_tasks.update(tmp_IR.tasks)
            self.contracts.extend(tmp_IR.contracts)

        # merge the dict of label:TaskIR objects, preserving any configs in the
        # imported pipeline if the labels point to the same class
        for label, task in self.tasks.items():
            if label not in accumulate_tasks:
                accumulate_tasks[label] = task
            elif accumulate_tasks[label].klass == task.klass:
                if task.config is not None:
                    for config in task.config:
                        accumulate_tasks[label].add_or_update_config(config)
            else:
                accumulate_tasks[label] = task
        self.tasks = accumulate_tasks

    def _read_tasks(self, loaded_yaml):
        """Process the tasks portion of the loaded yaml document

        Parameters
        ----------
        loaded_yaml : `dict`
            A dictionary which matches the structure that would be produced by
            a yaml reader which parses a pipeline definition document
        """
        self.tasks = {}
        tmp_tasks = loaded_yaml.pop("tasks", None)
        if tmp_tasks is None:
            tmp_tasks = {}

        for label, definition in tmp_tasks.items():
            if isinstance(definition, str):
                definition = {"class": definition}
            config = definition.get('config', None)
            if config is None:
                task_config_ir = None
            else:
                if isinstance(config, dict):
                    config = [config]
                task_config_ir = []
                for c in config:
                    file = c.pop("file", None)
                    if file is None:
                        file = []
                    elif not isinstance(file, list):
                        file = [file]
                    task_config_ir.append(ConfigIR(python=c.pop("python", None),
                                                   dataId=c.pop("dataId", None),
                                                   file=file,
                                                   rest=c))
            self.tasks[label] = TaskIR(label, definition["class"], task_config_ir)

    @classmethod
    def from_string(cls, pipeline_string: str):
        """Create a `PipelineIR` object from a string formatted like a pipeline
        document

        Parameters
        ----------
        pipeline_string : `str`
            A string that is formatted like a pipeline document
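
        Examples
        --------
        A minimal, hypothetical document::

            pipeline_ir = PipelineIR.from_string('''
            description: A hypothetical pipeline
            tasks:
              isr: lsst.example.IsrTask
            ''')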
444  """
445  loaded_yaml = yaml.load(pipeline_string, Loader=PipelineYamlLoader)
446  return cls(loaded_yaml)
447 
448  @classmethod
449  def from_file(cls, filename: str):
450  """Create a `PipelineIR` object from the document specified by the
451  input path.
452 
453  Parameters
454  ----------
455  filename : `str`
456  Location of document to use in creating a `PipelineIR` object.
457  """
458  with open(filename, 'r') as f:
459  loaded_yaml = yaml.load(f, Loader=PipelineYamlLoader)
460  return cls(loaded_yaml)
461 
    def to_file(self, filename: str):
        """Serialize this `PipelineIR` object into a yaml formatted string and
        write the output to a file at the specified path.

        Parameters
        ----------
        filename : `str`
            Location of the file to write this `PipelineIR` object to.
        """
        with open(filename, 'w') as f:
            yaml.dump(self.to_primitives(), f, sort_keys=False)

    def to_primitives(self):
        """Convert to a representation used in yaml serialization
        """
        accumulate = {"description": self.description}
        if self.instrument is not None:
            accumulate['instrument'] = self.instrument
        accumulate['tasks'] = {m: t.to_primitives() for m, t in self.tasks.items()}
        if len(self.contracts) > 0:
            accumulate['contracts'] = [c.to_primitives() for c in self.contracts]
        return accumulate

    def __str__(self) -> str:
        """Instance formatting as how it would look in yaml representation
        """
        return yaml.dump(self.to_primitives(), sort_keys=False)

    def __repr__(self) -> str:
        """Instance formatting as how it would look in yaml representation
        """
        return str(self)

    def __eq__(self, other: "PipelineIR"):
        if not isinstance(other, PipelineIR):
            return False
        elif all(getattr(self, attr) == getattr(other, attr) for attr in
                 ("contracts", "tasks", "instrument")):
            return True
        else:
            return False