Coverage for python/lsst/pipe/base/pipelineIR.py: 19%
407 statements
# This file is part of pipe_base.
#
# Developed for the LSST Data Management System.
# This product includes software developed by the LSST Project
# (http://www.lsst.org).
# See the COPYRIGHT file at the top-level directory of this distribution
# for details of code ownership.
#
# This program is free software: you can redistribute it and/or modify
# it under the terms of the GNU General Public License as published by
# the Free Software Foundation, either version 3 of the License, or
# (at your option) any later version.
#
# This program is distributed in the hope that it will be useful,
# but WITHOUT ANY WARRANTY; without even the implied warranty of
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
# GNU General Public License for more details.
#
# You should have received a copy of the GNU General Public License
# along with this program. If not, see <http://www.gnu.org/licenses/>.
from __future__ import annotations

__all__ = ("ConfigIR", "ContractError", "ContractIR", "ImportIR", "PipelineIR", "TaskIR", "LabeledSubset")

import copy
import enum
import os
import re
import warnings
from collections import Counter
from collections.abc import Iterable as abcIterable
from dataclasses import dataclass, field
from typing import Any, Dict, Generator, List, Literal, Mapping, MutableMapping, Optional, Set, Union

import yaml
from lsst.resources import ResourcePath, ResourcePathExpression


class _Tags(enum.Enum):
    KeepInstrument = enum.auto()


class PipelineYamlLoader(yaml.SafeLoader):
    """A specialized version of yaml's `SafeLoader` that raises an exception
    if it finds multiple instances of the same key at a given scope in a
    pipeline file.
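
    Examples
    --------
    A minimal sketch of the duplicate-key check (hypothetical document)::

        import yaml

        yaml.load("{a: 1, b: 2}", Loader=PipelineYamlLoader)  # -> {'a': 1, 'b': 2}
        yaml.load("{a: 1, a: 2}", Loader=PipelineYamlLoader)  # raises KeyError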
    """

    def construct_mapping(self, node: yaml.Node, deep: bool = False) -> Mapping[str, Any]:
        # Call super first so that it can do all the other forms of checking
        # on this node. Checking the uniqueness of keys first would save
        # super's work in the case of a failure, but if the node were the
        # wrong kind due to a parsing error, the resulting exception would be
        # difficult to understand.
        mapping = super().construct_mapping(node, deep)
        # Check if there are any duplicate keys.
        all_keys = Counter(key_node.value for key_node, _ in node.value)
        duplicates = {k for k, i in all_keys.items() if i != 1}
        if duplicates:
            raise KeyError(
                f"Pipeline files must not have duplicated keys, {duplicates} appeared multiple times"
            )
        return mapping


class MultilineStringDumper(yaml.Dumper):
    """Custom YAML dumper that makes multi-line strings use the '|'
    continuation style instead of unreadable newlines and tons of quotes.

    The basic approach is taken from
    https://stackoverflow.com/questions/8640959/how-can-i-control-what-scalar-form-pyyaml-uses-for-my-data,
    but is written as a `yaml.Dumper` subclass to make its effects non-global
    (vs `yaml.add_representer`).
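
    Examples
    --------
    A minimal sketch of the effect on multi-line values (hypothetical data)::

        import yaml

        snippet = "config.field1 = 1\\nconfig.field2 = 2"
        yaml.dump({"python": snippet}, Dumper=MultilineStringDumper)
        # emits the value in '|' block style instead of a quoted string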
    """

    def represent_scalar(self, tag: str, value: Any, style: Optional[str] = None) -> yaml.ScalarNode:
        if style is None and tag == "tag:yaml.org,2002:str" and len(value.splitlines()) > 1:
            style = "|"
        return super().represent_scalar(tag, value, style)


class ContractError(Exception):
    """An exception that is raised when a pipeline contract is not satisfied."""

    pass


@dataclass
class ContractIR:
    """Intermediate representation of configuration contracts read from a
    pipeline yaml file."""

    contract: str
    """A string of python code representing one or more conditions on configs
    in a pipeline. This code-as-string should, once evaluated, be `True` if
    the configs are fine, and `False` otherwise.
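
    Example (hypothetical task labels and config fields)::

        taskA.doApplyCorrection == taskB.doApplyCorrection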
    """
    msg: Union[str, None] = None
    """An optional message to be shown to the user if a contract fails.
    """

    def to_primitives(self) -> Dict[str, str]:
        """Convert to a representation used in yaml serialization."""
        accumulate = {"contract": self.contract}
        if self.msg is not None:
            accumulate["msg"] = self.msg
        return accumulate

    def __eq__(self, other: object) -> bool:
        if not isinstance(other, ContractIR):
            return False
        elif self.contract == other.contract and self.msg == other.msg:
            return True
        else:
            return False


@dataclass
class LabeledSubset:
    """Intermediate representation of a named subset of task labels read from
    a pipeline yaml file.
    """

    label: str
    """The label used to identify the subset of task labels.
    """
    subset: Set[str]
    """A set of task labels contained in this subset.
    """
    description: Optional[str]
    """A description of what this subset of tasks is intended to do.
    """

    @staticmethod
    def from_primitives(label: str, value: Union[List[str], dict]) -> LabeledSubset:
        """Generate `LabeledSubset` objects given a properly formatted object
        that has been created by a yaml loader.

        Parameters
        ----------
        label : `str`
            The label that will be used to identify this labeled subset.
        value : `list` of `str` or `dict`
            Object returned from loading a labeled subset section from a yaml
            document.

        Returns
        -------
        labeledSubset : `LabeledSubset`
            A `LabeledSubset` object built from the inputs.

        Raises
        ------
        ValueError
            Raised if the value input is not properly formatted for parsing.
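
        Examples
        --------
        Both accepted forms (hypothetical labels)::

            >>> LabeledSubset.from_primitives("demo", ["taskA", "taskB"]).subset == {"taskA", "taskB"}
            True
            >>> LabeledSubset.from_primitives("demo", {"subset": ["taskA"]}).description is None
            True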
        """
        if isinstance(value, MutableMapping):
            subset = value.pop("subset", None)
            if subset is None:
                raise ValueError(
                    "If a labeled subset is specified as a mapping, it must contain the key 'subset'"
                )
            description = value.pop("description", None)
        elif isinstance(value, abcIterable):
            subset = value
            description = None
        else:
            raise ValueError(
                f"There was a problem parsing the labeled subset {label}, make sure the "
                "definition is either a valid yaml list, or a mapping with keys "
                "(subset, description) where subset points to a yaml list, and description is "
                "associated with a string"
            )
        return LabeledSubset(label, set(subset), description)

    def to_primitives(self) -> Dict[str, Union[List[str], str]]:
        """Convert to a representation used in yaml serialization."""
        accumulate: Dict[str, Union[List[str], str]] = {"subset": list(self.subset)}
        if self.description is not None:
            accumulate["description"] = self.description
        return accumulate


@dataclass
class ParametersIR:
    """Intermediate representation of parameters that are global to a
    pipeline.

    These parameters are specified under a top level key named `parameters`
    and are declared as a yaml mapping. These entries can then be used inside
    task configuration blocks to specify configuration values. They may not be
    used in the special ``file`` or ``python`` blocks.

    Example::

        parameters:
          shared_value: 14
        tasks:
          taskA:
            class: modA
            config:
              field1: parameters.shared_value
          taskB:
            class: modB
            config:
              field2: parameters.shared_value
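
    Lookup is by mapping key (a minimal usage sketch)::

        >>> parameters = ParametersIR({"shared_value": 14})
        >>> "shared_value" in parameters
        True
        >>> parameters["shared_value"]
        14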
    """

    mapping: MutableMapping[str, str]
    """A mutable mapping of identifiers as keys, and shared configuration
    as values.
    """

    def update(self, other: Optional[ParametersIR]) -> None:
        if other is not None:
            self.mapping.update(other.mapping)

    def to_primitives(self) -> MutableMapping[str, str]:
        """Convert to a representation used in yaml serialization."""
        return self.mapping

    def __contains__(self, value: str) -> bool:
        return value in self.mapping

    def __getitem__(self, item: str) -> Any:
        return self.mapping[item]

    def __bool__(self) -> bool:
        return bool(self.mapping)


@dataclass
class ConfigIR:
    """Intermediate representation of configurations read from a pipeline yaml
    file.
    """

    python: Union[str, None] = None
    """A string of python code that is used to modify a configuration. This
    can also be None if there are no modifications to do.
    """
    dataId: Union[dict, None] = None
    """A dataId that is used to constrain these config overrides to only
    quanta with matching dataIds. This field can be None if there is no
    constraint. This is currently an unimplemented feature, and is placed here
    for future use.
    """
    file: List[str] = field(default_factory=list)
    """A list of paths to files containing config overrides to be applied.
    This value may be an empty list if there are no overrides to apply.
    """
    rest: dict = field(default_factory=dict)
    """This is a dictionary of key value pairs, where the keys are strings
    corresponding to qualified fields on a config to override, and the values
    are strings representing the values to apply.
    """

    def to_primitives(self) -> Dict[str, Union[str, dict, List[str]]]:
        """Convert to a representation used in yaml serialization."""
        accumulate = {}
        for name in ("python", "dataId", "file"):
            # If this attribute is truthy, add it to the accumulation
            # dictionary.
            if getattr(self, name):
                accumulate[name] = getattr(self, name)
        # Add the dictionary containing the rest of the config keys to the
        # accumulated dictionary.
        accumulate.update(self.rest)
        return accumulate

    def formatted(self, parameters: ParametersIR) -> ConfigIR:
        """Return a new ConfigIR object that is formatted according to the
        specified parameters.

        Parameters
        ----------
        parameters : ParametersIR
            Object that contains variable mappings used in substitution.

        Returns
        -------
        config : ConfigIR
            A new ConfigIR object formatted with the input parameters.
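
        Examples
        --------
        A minimal sketch of parameter substitution (hypothetical field)::

            >>> config = ConfigIR(rest={"field1": "parameters.shared_value"})
            >>> config.formatted(ParametersIR({"shared_value": 14})).rest["field1"]
            14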
        """
        new_config = copy.deepcopy(self)
        for key, value in new_config.rest.items():
            if not isinstance(value, str):
                continue
            match = re.match("parameters[.](.*)", value)
            if match and match.group(1) in parameters:
                new_config.rest[key] = parameters[match.group(1)]
            if match and match.group(1) not in parameters:
                warnings.warn(
                    f"config {key} contains value {match.group(0)} which is formatted like a "
                    "Pipeline parameter but was not found within the Pipeline, if this was not "
                    "intentional, check for a typo"
                )
        return new_config

    def maybe_merge(self, other_config: "ConfigIR") -> Generator["ConfigIR", None, None]:
        """Merge another instance of a `ConfigIR` into this instance if
        possible. This function returns a generator that yields either just
        ``self``, if the configs were merged, or ``self`` and ``other_config``,
        if they could not be merged.

        Parameters
        ----------
        other_config : `ConfigIR`
            An instance of `ConfigIR` to merge into this instance.

        Returns
        -------
        Generator : `ConfigIR`
            A generator yielding either ``self``, or ``self`` and
            ``other_config``, depending on whether the configs could be
            merged.
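
        Examples
        --------
        Overrides with matching dataIds and non-conflicting keys merge into
        one (a minimal sketch)::

            >>> c1 = ConfigIR(rest={"field1": 1})
            >>> c2 = ConfigIR(rest={"field2": 2})
            >>> merged = list(c1.maybe_merge(c2))
            >>> len(merged)
            1
            >>> sorted(merged[0].rest)
            ['field1', 'field2']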
        """
        # Verify that the config blocks can be merged.
        if (
            self.dataId != other_config.dataId
            or self.python
            or other_config.python
            or self.file
            or other_config.file
        ):
            yield from (self, other_config)
            return

        # Verify that no key shared by the two configs has conflicting values.
        key_union = self.rest.keys() & other_config.rest.keys()
        for key in key_union:
            if self.rest[key] != other_config.rest[key]:
                yield from (self, other_config)
                return
        self.rest.update(other_config.rest)

        # Combine the lists of override files to load.
        self_file_set = set(self.file)
        other_file_set = set(other_config.file)
        self.file = list(self_file_set.union(other_file_set))

        yield self

    def __eq__(self, other: object) -> bool:
        if not isinstance(other, ConfigIR):
            return False
        elif all(
            getattr(self, attr) == getattr(other, attr) for attr in ("python", "dataId", "file", "rest")
        ):
            return True
        else:
            return False


@dataclass
class TaskIR:
    """Intermediate representation of tasks read from a pipeline yaml file."""

    label: str
    """An identifier used to refer to a task.
    """
    klass: str
    """A string containing a fully qualified python class to be run in a
    pipeline.
    """
    config: Union[List[ConfigIR], None] = None
    """List of all config overrides associated with this task; may be
    `None` if there are no config overrides.
    """

    def to_primitives(self) -> Dict[str, Union[str, List[dict]]]:
        """Convert to a representation used in yaml serialization."""
        accumulate: Dict[str, Union[str, List[dict]]] = {"class": self.klass}
        if self.config:
            accumulate["config"] = [c.to_primitives() for c in self.config]
        return accumulate

    def add_or_update_config(self, other_config: ConfigIR) -> None:
        """Adds a `ConfigIR` to this task if one is not present. Merges configs
        if there is a `ConfigIR` present and the dataId keys of both configs
        match, otherwise adds a new entry to the config list. The exception to
        the above is that if either the last config or other_config has a
        python block, then other_config is always added, as python blocks can
        modify configs in ways that cannot be predicted.

        Parameters
        ----------
        other_config : `ConfigIR`
            A `ConfigIR` instance to add or merge into the config attribute of
            this task.
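
        Examples
        --------
        A minimal sketch (hypothetical task class and fields)::

            >>> task = TaskIR("taskA", "example.module.ExampleTask")
            >>> task.add_or_update_config(ConfigIR(rest={"field1": 1}))
            >>> task.add_or_update_config(ConfigIR(rest={"field2": 2}))
            >>> len(task.config)
            1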
        """
        if not self.config:
            self.config = [other_config]
            return
        self.config.extend(self.config.pop().maybe_merge(other_config))

    def __eq__(self, other: object) -> bool:
        if not isinstance(other, TaskIR):
            return False
        elif all(getattr(self, attr) == getattr(other, attr) for attr in ("label", "klass", "config")):
            return True
        else:
            return False


@dataclass
class ImportIR:
    """An intermediate representation of imported pipelines."""

    location: str
    """This is the location of the pipeline to inherit. The path should be
    specified as an absolute path. Environment variables may be used in the
    path and should be specified as a python string template, with the name of
    the environment variable inside braces.
    """
    include: Union[List[str], None] = None
    """List of tasks that should be included when inheriting this pipeline.
    Either the include or exclude attributes may be specified, but not both.
    """
    exclude: Union[List[str], None] = None
    """List of tasks that should be excluded when inheriting this pipeline.
    Either the include or exclude attributes may be specified, but not both.
    """
    importContracts: bool = True
    """Boolean attribute to dictate if contracts should be inherited with the
    pipeline or not.
    """
    instrument: Union[Literal[_Tags.KeepInstrument], str, None] = _Tags.KeepInstrument
    """Instrument to assign to the Pipeline at import. The default value of
    ``_Tags.KeepInstrument`` indicates that whatever instrument the pipeline
    is declared with will not be modified. Setting this value to None will
    drop any declared instrument prior to import.
    """

    def toPipelineIR(self) -> "PipelineIR":
        """Load in the Pipeline specified by this object, and turn it into a
        PipelineIR instance.

        Returns
        -------
        pipeline : `PipelineIR`
            A pipeline generated from the imported pipeline file.
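
        Examples
        --------
        A hedged sketch (hypothetical path and label; the file must exist for
        the import to succeed)::

            ir = ImportIR(location="${PIPELINES_DIR}/example.yaml", exclude=["taskB"])
            imported = ir.toPipelineIR()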
        """
        if self.include and self.exclude:
            raise ValueError(
                "Both an include and an exclude list cannot be specified when declaring a pipeline import"
            )
        tmp_pipeline = PipelineIR.from_uri(os.path.expandvars(self.location))
        if self.instrument is not _Tags.KeepInstrument:
            tmp_pipeline.instrument = self.instrument

        included_labels = set()
        for label in tmp_pipeline.tasks:
            if (
                (self.include and label in self.include)
                or (self.exclude and label not in self.exclude)
                or (self.include is None and self.exclude is None)
            ):
                included_labels.add(label)

        # Handle labeled subsets being specified in the include or exclude
        # list, adding or removing labels.
        if self.include is not None:
            subsets_in_include = tmp_pipeline.labeled_subsets.keys() & self.include
            for label in subsets_in_include:
                included_labels.update(tmp_pipeline.labeled_subsets[label].subset)

        elif self.exclude is not None:
            subsets_in_exclude = tmp_pipeline.labeled_subsets.keys() & self.exclude
            for label in subsets_in_exclude:
                included_labels.difference_update(tmp_pipeline.labeled_subsets[label].subset)

        tmp_pipeline = tmp_pipeline.subset_from_labels(included_labels)

        if not self.importContracts:
            tmp_pipeline.contracts = []

        return tmp_pipeline

    def __eq__(self, other: object) -> bool:
        if not isinstance(other, ImportIR):
            return False
        elif all(
            getattr(self, attr) == getattr(other, attr)
            for attr in ("location", "include", "exclude", "importContracts")
        ):
            return True
        else:
            return False


class PipelineIR:
    """Intermediate representation of a pipeline definition.

    Parameters
    ----------
    loaded_yaml : `dict`
        A dictionary which matches the structure that would be produced by a
        yaml reader which parses a pipeline definition document.

    Raises
    ------
    ValueError
        Raised if:

        - a pipeline is declared without a description;
        - no tasks are declared in a pipeline, and no pipelines are to be
          inherited;
        - more than one instrument is specified;
        - more than one inherited pipeline shares a label.
    """

    def __init__(self, loaded_yaml: Dict[str, Any]):
        # Check required fields are present.
        if "description" not in loaded_yaml:
            raise ValueError("A pipeline must be declared with a description")
        if "tasks" not in loaded_yaml and len({"imports", "inherits"} - loaded_yaml.keys()) == 2:
            raise ValueError("A pipeline must be declared with one or more tasks")

        # These steps below must happen in this call order.

        # Process pipeline description.
        self.description = loaded_yaml.pop("description")

        # Process tasks.
        self._read_tasks(loaded_yaml)

        # Process instrument keys.
        inst = loaded_yaml.pop("instrument", None)
        if isinstance(inst, list):
            raise ValueError("Only one top level instrument can be defined in a pipeline")
        self.instrument: Optional[str] = inst

        # Process any contracts.
        self._read_contracts(loaded_yaml)

        # Process any defined parameters.
        self._read_parameters(loaded_yaml)

        # Process any named label subsets.
        self._read_labeled_subsets(loaded_yaml)

        # Process any inherited pipelines.
        self._read_imports(loaded_yaml)

        # Verify named subsets; must be done after inheriting.
        self._verify_labeled_subsets()

    def _read_contracts(self, loaded_yaml: Dict[str, Any]) -> None:
        """Process the contracts portion of the loaded yaml document.

        Parameters
        ----------
        loaded_yaml : `dict`
            A dictionary which matches the structure that would be produced by
            a yaml reader which parses a pipeline definition document.
        """
        loaded_contracts = loaded_yaml.pop("contracts", [])
        if isinstance(loaded_contracts, str):
            loaded_contracts = [loaded_contracts]
        self.contracts: List[ContractIR] = []
        for contract in loaded_contracts:
            if isinstance(contract, dict):
                self.contracts.append(ContractIR(**contract))
            if isinstance(contract, str):
                self.contracts.append(ContractIR(contract=contract))

    def _read_parameters(self, loaded_yaml: Dict[str, Any]) -> None:
        """Process the parameters portion of the loaded yaml document.

        Parameters
        ----------
        loaded_yaml : `dict`
            A dictionary which matches the structure that would be produced by
            a yaml reader which parses a pipeline definition document.
        """
        loaded_parameters = loaded_yaml.pop("parameters", {})
        if not isinstance(loaded_parameters, dict):
            raise ValueError("The parameters section must be a yaml mapping")
        self.parameters = ParametersIR(loaded_parameters)

    def _read_labeled_subsets(self, loaded_yaml: Dict[str, Any]) -> None:
        """Process the subsets portion of the loaded yaml document.

        Parameters
        ----------
        loaded_yaml : `MutableMapping`
            A dictionary which matches the structure that would be produced
            by a yaml reader which parses a pipeline definition document.
        """
        loaded_subsets = loaded_yaml.pop("subsets", {})
        self.labeled_subsets: Dict[str, LabeledSubset] = {}
        if not loaded_subsets and "subset" in loaded_yaml:
            raise ValueError("Top level key should be subsets and not subset, add an s")
        for key, value in loaded_subsets.items():
            self.labeled_subsets[key] = LabeledSubset.from_primitives(key, value)

    def _verify_labeled_subsets(self) -> None:
        """Verifies that all the labels in each named subset exist within the
        pipeline.
        """
        # Verify that all labels defined in a labeled subset are in the
        # Pipeline.
        for labeled_subset in self.labeled_subsets.values():
            if not labeled_subset.subset.issubset(self.tasks.keys()):
                raise ValueError(
                    f"Labels {labeled_subset.subset - self.tasks.keys()} were not found in the "
                    "declared pipeline"
                )
        # Verify subset labels are not already task labels.
        label_intersection = self.labeled_subsets.keys() & self.tasks.keys()
        if label_intersection:
            raise ValueError(f"Labeled subsets can not use the same label as a task: {label_intersection}")

    def _read_imports(self, loaded_yaml: Dict[str, Any]) -> None:
        """Process the imports (formerly inherits) portion of the loaded yaml
        document.

        Parameters
        ----------
        loaded_yaml : `dict`
            A dictionary which matches the structure that would be produced by
            a yaml reader which parses a pipeline definition document.
        """

        def process_args(argument: Union[str, dict]) -> dict:
            if isinstance(argument, str):
                return {"location": argument}
            elif isinstance(argument, dict):
                if "exclude" in argument and isinstance(argument["exclude"], str):
                    argument["exclude"] = [argument["exclude"]]
                if "include" in argument and isinstance(argument["include"], str):
                    argument["include"] = [argument["include"]]
                if "instrument" in argument and argument["instrument"] == "None":
                    argument["instrument"] = None
                return argument

        if not {"inherits", "imports"} - loaded_yaml.keys():
            raise ValueError("Cannot define both inherits and imports sections, use imports")
        tmp_import = loaded_yaml.pop("inherits", None)
        if tmp_import is None:
            tmp_import = loaded_yaml.pop("imports", None)
        else:
            warnings.warn(
                "The 'inherits' key is deprecated, and will be "
                "removed around June 2021. Please use the key "
                "'imports' instead"
            )
        if tmp_import is None:
            self.imports: List[ImportIR] = []
        elif isinstance(tmp_import, list):
            self.imports = [ImportIR(**process_args(args)) for args in tmp_import]
        else:
            self.imports = [ImportIR(**process_args(tmp_import))]

        # Integrate any imported pipelines.
        accumulate_tasks: Dict[str, TaskIR] = {}
        accumulate_labeled_subsets: Dict[str, LabeledSubset] = {}
        accumulated_parameters = ParametersIR({})
        for other_pipeline in self.imports:
            tmp_IR = other_pipeline.toPipelineIR()
            if self.instrument is None:
                self.instrument = tmp_IR.instrument
            elif self.instrument != tmp_IR.instrument and tmp_IR.instrument is not None:
                msg = (
                    "Only one instrument can be declared in a pipeline or its imports. "
                    f"Top level pipeline defines {self.instrument} but {other_pipeline.location} "
                    f"defines {tmp_IR.instrument}."
                )
                raise ValueError(msg)
            if duplicate_labels := accumulate_tasks.keys() & tmp_IR.tasks.keys():
                msg = (
                    "Task labels in the imported pipelines must be unique. "
                    f"These labels appear multiple times: {duplicate_labels}"
                )
                raise ValueError(msg)
            accumulate_tasks.update(tmp_IR.tasks)
            self.contracts.extend(tmp_IR.contracts)
            # Verify that tmp_IR has unique labels for named subsets among
            # existing labeled subsets, and with existing task labels.
            overlapping_subsets = accumulate_labeled_subsets.keys() & tmp_IR.labeled_subsets.keys()
            task_subset_overlap = (
                accumulate_labeled_subsets.keys() | tmp_IR.labeled_subsets.keys()
            ) & accumulate_tasks.keys()
            if overlapping_subsets or task_subset_overlap:
                raise ValueError(
                    "Labeled subset names must be unique amongst imports in both labels and "
                    f"named Subsets. Duplicate: {overlapping_subsets | task_subset_overlap}"
                )
            accumulate_labeled_subsets.update(tmp_IR.labeled_subsets)
            accumulated_parameters.update(tmp_IR.parameters)

        # Verify that any accumulated labeled subsets don't clash with a label
        # from this pipeline.
        if accumulate_labeled_subsets.keys() & self.tasks.keys():
            raise ValueError(
                "Labeled subset names must be unique amongst imports in both labels and named Subsets"
            )
        # Merge in the named subsets for self so this document can override any
        # that have been declared.
        accumulate_labeled_subsets.update(self.labeled_subsets)
        self.labeled_subsets = accumulate_labeled_subsets

        # Merge the dict of label:TaskIR objects, preserving any configs in the
        # imported pipeline if the labels point to the same class.
        for label, task in self.tasks.items():
            if label not in accumulate_tasks:
                accumulate_tasks[label] = task
            elif accumulate_tasks[label].klass == task.klass:
                if task.config is not None:
                    for config in task.config:
                        accumulate_tasks[label].add_or_update_config(config)
            else:
                accumulate_tasks[label] = task
        self.tasks: Dict[str, TaskIR] = accumulate_tasks
        accumulated_parameters.update(self.parameters)
        self.parameters = accumulated_parameters

    def _read_tasks(self, loaded_yaml: Dict[str, Any]) -> None:
        """Process the tasks portion of the loaded yaml document.

        Parameters
        ----------
        loaded_yaml : `dict`
            A dictionary which matches the structure that would be produced by
            a yaml reader which parses a pipeline definition document.
        """
        self.tasks = {}
        tmp_tasks = loaded_yaml.pop("tasks", None)
        if tmp_tasks is None:
            tmp_tasks = {}

        if "parameters" in tmp_tasks:
            raise ValueError("parameters is a reserved word and cannot be used as a task label")

        for label, definition in tmp_tasks.items():
            if isinstance(definition, str):
                definition = {"class": definition}
            config = definition.get("config", None)
            if config is None:
                task_config_ir = None
            else:
                if isinstance(config, dict):
                    config = [config]
                task_config_ir = []
                for c in config:
                    file = c.pop("file", None)
                    if file is None:
                        file = []
                    elif not isinstance(file, list):
                        file = [file]
                    task_config_ir.append(
                        ConfigIR(
                            python=c.pop("python", None), dataId=c.pop("dataId", None), file=file, rest=c
                        )
                    )
            self.tasks[label] = TaskIR(label, definition["class"], task_config_ir)

    def _remove_contracts(self, label: str) -> None:
        """Remove any contracts that contain the given label.

        String comparison used in this way is not the most elegant and may
        have issues, but it is the only feasible way when users can specify
        contracts with generic strings.
        """
        new_contracts = []
        for contract in self.contracts:
            # Match a label that is not preceded by an ASCII identifier, or
            # is the start of a line, and is followed by a dot.
            if re.match(f".*([^A-Za-z0-9_]|^){label}[.]", contract.contract):
                continue
            new_contracts.append(contract)
        self.contracts = new_contracts

    def subset_from_labels(self, labelSpecifier: Set[str]) -> PipelineIR:
        """Subset a pipelineIR to contain only labels specified in
        labelSpecifier.

        Parameters
        ----------
        labelSpecifier : `set` of `str`
            Set containing labels that describes how to subset a pipeline.

        Returns
        -------
        pipeline : `PipelineIR`
            A new pipelineIR object that is a subset of the old pipelineIR.

        Raises
        ------
        ValueError
            Raised if there is an issue with specified labels.

        Notes
        -----
        This method attempts to prune any contracts that contain labels which
        are not in the declared subset of labels. This pruning is done using a
        string based matching due to the nature of contracts and may prune more
        than it should. Any labeled subsets defined that no longer have all
        members of the subset present in the pipeline will be removed from the
        resulting pipeline.
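
        Examples
        --------
        A minimal sketch (hypothetical labels on an existing pipeline)::

            subset = pipeline.subset_from_labels({"taskA", "taskB"})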
        """

        pipeline = copy.deepcopy(self)

        # Update the label specifier to expand any named subsets.
        toRemove = set()
        toAdd = set()
        for label in labelSpecifier:
            if label in pipeline.labeled_subsets:
                toRemove.add(label)
                toAdd.update(pipeline.labeled_subsets[label].subset)
        labelSpecifier.difference_update(toRemove)
        labelSpecifier.update(toAdd)
        # Verify all the labels are in the pipeline.
        if not labelSpecifier.issubset(pipeline.tasks.keys() | pipeline.labeled_subsets):
            difference = labelSpecifier.difference(pipeline.tasks.keys())
            raise ValueError(
                "Not all supplied labels (specified or named subsets) are in the pipeline "
                f"definition, extra labels: {difference}"
            )
        # Copy needed so as to not modify while iterating.
        pipeline_labels = set(pipeline.tasks.keys())
        # Remove the labels from the pipelineIR, and any contracts that contain
        # those labels (see docstring on _remove_contracts for why this may
        # cause issues).
        for label in pipeline_labels:
            if label not in labelSpecifier:
                pipeline.tasks.pop(label)
                pipeline._remove_contracts(label)

        # Create a copy of the object to iterate over.
        labeled_subsets = copy.copy(pipeline.labeled_subsets)
        # Remove any labeled subsets that no longer have a complete set.
        for label, labeled_subset in labeled_subsets.items():
            if labeled_subset.subset - pipeline.tasks.keys():
                pipeline.labeled_subsets.pop(label)

        return pipeline

    @classmethod
    def from_string(cls, pipeline_string: str) -> PipelineIR:
        """Create a `PipelineIR` object from a string formatted like a
        pipeline document.

        Parameters
        ----------
        pipeline_string : `str`
            A string that is formatted like a pipeline document.
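
        Examples
        --------
        A minimal pipeline document (hypothetical task class)::

            document = "description: demo\\ntasks:\\n  taskA: example.module.ExampleTask"
            pipeline = PipelineIR.from_string(document)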
        """
        loaded_yaml = yaml.load(pipeline_string, Loader=PipelineYamlLoader)
        return cls(loaded_yaml)

    @classmethod
    def from_uri(cls, uri: ResourcePathExpression) -> PipelineIR:
        """Create a `PipelineIR` object from the document specified by the
        input uri.

        Parameters
        ----------
        uri : convertible to `ResourcePath`
            Location of document to use in creating a `PipelineIR` object.

        Returns
        -------
        pipelineIR : `PipelineIR`
            The loaded pipeline.
        """
        loaded_uri = ResourcePath(uri)
        with loaded_uri.open("r") as buffer:
            loaded_yaml = yaml.load(buffer, Loader=PipelineYamlLoader)
            return cls(loaded_yaml)

    def write_to_uri(
        self,
        uri: ResourcePathExpression,
    ) -> None:
        """Serialize this `PipelineIR` object into a yaml formatted string and
        write the output to a file at the specified uri.

        Parameters
        ----------
        uri : convertible to `ResourcePath`
            Location of document to write a `PipelineIR` object.
        """
        with ResourcePath(uri).open("w") as buffer:
            yaml.dump(self.to_primitives(), buffer, sort_keys=False, Dumper=MultilineStringDumper)

    def to_primitives(self) -> Dict[str, Any]:
        """Convert to a representation used in yaml serialization."""
        accumulate = {"description": self.description}
        if self.instrument is not None:
            accumulate["instrument"] = self.instrument
        if self.parameters:
            accumulate["parameters"] = self._sort_by_str(self.parameters.to_primitives())
        accumulate["tasks"] = {m: t.to_primitives() for m, t in self.tasks.items()}
        if len(self.contracts) > 0:
            # Sort contracts into lexicographical order by the contract string
            # in the absence of any other ordering principle.
            contracts_list = [c.to_primitives() for c in self.contracts]
            contracts_list.sort(key=lambda x: x["contract"])
            accumulate["contracts"] = contracts_list
        if self.labeled_subsets:
            accumulate["subsets"] = self._sort_by_str(
                {k: v.to_primitives() for k, v in self.labeled_subsets.items()}
            )
        return accumulate

    def reorder_tasks(self, task_labels: List[str]) -> None:
        """Change the order in which tasks are stored internally. Useful for
        determining the order things will appear in the serialized (or
        printed) form.

        Parameters
        ----------
        task_labels : `list` of `str`
            A list containing all the labels in the pipeline, in the order the
            tasks are to be stored.

        Raises
        ------
        KeyError
            Raised if labels are supplied that are not in the pipeline, or if
            not all labels in the pipeline were supplied in the task_labels
            input.
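
        Examples
        --------
        A minimal sketch (hypothetical labels; every label in the pipeline
        must appear exactly once)::

            pipeline.reorder_tasks(["taskB", "taskA"])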
        """
        # Verify that all labels are in the input.
        _tmp_set = set(task_labels)
        if remainder := (self.tasks.keys() - _tmp_set):
            raise KeyError(f"Label(s) {remainder} are missing from the task label list")
        if extra := (_tmp_set - self.tasks.keys()):
            raise KeyError(f"Extra label(s) {extra} were in the input and are not in the pipeline")

        newTasks = {key: self.tasks[key] for key in task_labels}
        self.tasks = newTasks

    @staticmethod
    def _sort_by_str(arg: Mapping[str, Any]) -> Mapping[str, Any]:
        keys = sorted(arg.keys())
        return {key: arg[key] for key in keys}

    def __str__(self) -> str:
        """Instance formatting as how it would look in yaml representation."""
        return yaml.dump(self.to_primitives(), sort_keys=False, Dumper=MultilineStringDumper)

    def __repr__(self) -> str:
        """Instance formatting as how it would look in yaml representation."""
        return str(self)

    def __eq__(self, other: object) -> bool:
        if not isinstance(other, PipelineIR):
            return False
        # Special case contracts because it is a list, but order is not
        # important.
        elif (
            all(
                getattr(self, attr) == getattr(other, attr)
                for attr in ("tasks", "instrument", "labeled_subsets", "parameters")
            )
            and len(self.contracts) == len(other.contracts)
            and all(c in self.contracts for c in other.contracts)
        ):
            return True
        else:
            return False