# This file is part of pipe_base.
#
# Developed for the LSST Data Management System.
# This product includes software developed by the LSST Project
# (http://www.lsst.org).
# See the COPYRIGHT file at the top-level directory of this distribution
# for details of code ownership.
#
# This program is free software: you can redistribute it and/or modify
# it under the terms of the GNU General Public License as published by
# the Free Software Foundation, either version 3 of the License, or
# (at your option) any later version.
#
# This program is distributed in the hope that it will be useful,
# but WITHOUT ANY WARRANTY; without even the implied warranty of
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
# GNU General Public License for more details.
#
# You should have received a copy of the GNU General Public License
# along with this program. If not, see <http://www.gnu.org/licenses/>.
from __future__ import annotations

__all__ = ("ConfigIR", "ContractError", "ContractIR", "ImportIR", "PipelineIR", "TaskIR", "LabeledSubset")

import copy
import enum
import os
import re
import warnings
from collections import Counter
from collections.abc import Iterable as abcIterable
from dataclasses import dataclass, field
from typing import Any, Dict, Generator, List, Literal, Mapping, MutableMapping, Optional, Set, Union

import yaml
from deprecated.sphinx import deprecated
from lsst.resources import ResourcePath, ResourcePathExpression


class _Tags(enum.Enum):
    KeepInstrument = enum.auto()


class PipelineYamlLoader(yaml.SafeLoader):
    """A specialized version of yaml's SafeLoader that raises an exception
    if it finds multiple instances of the same key at a given scope of a
    pipeline file.
    """

    def construct_mapping(self, node: yaml.Node, deep: bool = False) -> Mapping[str, Any]:
        # Call super first so that it can do all the other forms of checking
        # on this node. Checking the uniqueness of keys first would save the
        # work super does in the case of a failure, but super might fail
        # because the node is the wrong node due to a parsing error, and the
        # resulting exception would be difficult to understand.
        mapping = super().construct_mapping(node, deep)
        # Check if there are any duplicate keys
        all_keys = Counter(key_node.value for key_node, _ in node.value)
        duplicates = {k for k, i in all_keys.items() if i != 1}
        if duplicates:
            raise KeyError(
                f"Pipeline files must not have duplicated keys, {duplicates} appeared multiple times"
            )
        return mapping
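
# A minimal sketch of the duplicate-key check in action (hypothetical input,
# not part of this module):
#
#     yaml.load("tasks: 1\ntasks: 2", Loader=PipelineYamlLoader)
#
# raises a KeyError because the top-level mapping declares ``tasks`` twice,
# whereas the stock SafeLoader would silently keep the last value.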


class MultilineStringDumper(yaml.Dumper):
    """Custom YAML dumper that makes multi-line strings use the '|'
    continuation style instead of unreadable newlines and tons of quotes.

    Basic approach is taken from
    https://stackoverflow.com/questions/8640959/how-can-i-control-what-scalar-form-pyyaml-uses-for-my-data,
    but is written as a Dumper subclass to make its effects non-global (vs
    `yaml.add_representer`).
    """

    def represent_scalar(self, tag: str, value: Any, style: Optional[str] = None) -> yaml.ScalarNode:
        if style is None and tag == "tag:yaml.org,2002:str" and len(value.splitlines()) > 1:
            style = "|"
        return super().represent_scalar(tag, value, style)
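
# A small illustration (invented values, not from this module): dumping a
# mapping that carries an embedded multi-line python block
#
#     yaml.dump({"python": "a = 1\nb = 2"}, Dumper=MultilineStringDumper)
#
# renders the value in block style:
#
#     python: |-
#       a = 1
#       b = 2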


class ContractError(Exception):
    """An exception that is raised when a pipeline contract is not satisfied"""

    pass


@dataclass
class ContractIR:
    """Intermediate representation of configuration contracts read from a
    pipeline yaml file."""

    contract: str
    """A string of python code representing one or more conditions on configs
    in a pipeline. This code-as-string should, once evaluated, be True if the
    configs are fine, and False otherwise.
    """
    msg: Union[str, None] = None
    """An optional message to be shown to the user if a contract fails
    """

    def to_primitives(self) -> Dict[str, str]:
        """Convert to a representation used in yaml serialization"""
        accumulate = {"contract": self.contract}
        if self.msg is not None:
            accumulate["msg"] = self.msg
        return accumulate

    def __eq__(self, other: object) -> bool:
        if not isinstance(other, ContractIR):
            return False
        elif self.contract == other.contract and self.msg == other.msg:
            return True
        else:
            return False


@dataclass
class LabeledSubset:
    """Intermediate representation of a named subset of task labels read from
    a pipeline yaml file.
    """

    label: str
    """The label used to identify the subset of task labels.
    """
    subset: Set[str]
    """A set of task labels contained in this subset.
    """
    description: Optional[str]
    """A description of what this subset of tasks is intended to do
    """

    @staticmethod
    def from_primitives(label: str, value: Union[List[str], dict]) -> LabeledSubset:
        """Generate `LabeledSubset` objects given a properly formatted object
        that has been created by a yaml loader.

        Parameters
        ----------
        label : `str`
            The label that will be used to identify this labeled subset.
        value : `list` of `str` or `dict`
            Object returned from loading a labeled subset section from a yaml
            document.

        Returns
        -------
        labeledSubset : `LabeledSubset`
            A `LabeledSubset` object built from the inputs.

        Raises
        ------
        ValueError
            Raised if the value input is not properly formatted for parsing
        """
        if isinstance(value, MutableMapping):
            subset = value.pop("subset", None)
            if subset is None:
                raise ValueError(
                    "If a labeled subset is specified as a mapping, it must contain the key 'subset'"
                )
            description = value.pop("description", None)
        elif isinstance(value, abcIterable):
            subset = value
            description = None
        else:
            raise ValueError(
                f"There was a problem parsing the labeled subset {label}; make sure the "
                "definition is either a valid yaml list, or a mapping with keys "
                "(subset, description), where subset points to a yaml list and "
                "description is associated with a string"
            )
        return LabeledSubset(label, set(subset), description)

    def to_primitives(self) -> Dict[str, Union[List[str], str]]:
        """Convert to a representation used in yaml serialization"""
        accumulate: Dict[str, Union[List[str], str]] = {"subset": list(self.subset)}
        if self.description is not None:
            accumulate["description"] = self.description
        return accumulate
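
# Both accepted yaml forms, shown here with made-up labels, produce the same
# result:
#
#     LabeledSubset.from_primitives("step1", ["isr", "calibrate"])
#     LabeledSubset.from_primitives(
#         "step1", {"subset": ["isr", "calibrate"], "description": "early steps"}
#     )
#
# Each returns a LabeledSubset with label "step1" and subset
# {"isr", "calibrate"}; only the mapping form can carry a description.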


@dataclass
class ParametersIR:
    """Intermediate representation of parameters that are global to a pipeline

    These parameters are specified under a top level key named `parameters`
    and are declared as a yaml mapping. These entries can then be used inside
    task configuration blocks to specify configuration values. They may not be
    used in the special ``file`` or ``python`` blocks.

    Example:
    parameters:
      shared_value: 14
    tasks:
      taskA:
        class: modA
        config:
          field1: parameters.shared_value
      taskB:
        class: modB
        config:
          field2: parameters.shared_value
    """

    mapping: MutableMapping[str, str]
    """A mutable mapping of identifiers as keys, and shared configuration
    as values.
    """

    def update(self, other: Optional[ParametersIR]) -> None:
        if other is not None:
            self.mapping.update(other.mapping)

    def to_primitives(self) -> MutableMapping[str, str]:
        """Convert to a representation used in yaml serialization"""
        return self.mapping

    def __contains__(self, value: str) -> bool:
        return value in self.mapping

    def __getitem__(self, item: str) -> Any:
        return self.mapping[item]

    def __bool__(self) -> bool:
        return bool(self.mapping)


@dataclass
class ConfigIR:
    """Intermediate representation of configurations read from a pipeline yaml
    file.
    """

    python: Union[str, None] = None
    """A string of python code that is used to modify a configuration. This
    can also be None if there are no modifications to do.
    """
    dataId: Union[dict, None] = None
    """A dataId that is used to constrain these config overrides to only
    quanta with matching dataIds. This field can be None if there is no
    constraint. This is currently an unimplemented feature, and is placed here
    for future use.
    """
    file: List[str] = field(default_factory=list)
    """A list of paths, each pointing to a file containing config overrides to
    be applied. This value may be an empty list if there are no overrides to
    apply.
    """
    rest: dict = field(default_factory=dict)
    """This is a dictionary of key value pairs, where the keys are strings
    corresponding to qualified fields on a config to override, and the values
    are strings representing the values to apply.
    """

    def to_primitives(self) -> Dict[str, Union[str, dict, List[str]]]:
        """Convert to a representation used in yaml serialization"""
        accumulate = {}
        for name in ("python", "dataId", "file"):
            # if this attribute is truthy add it to the accumulation
            # dictionary
            if getattr(self, name):
                accumulate[name] = getattr(self, name)
        # Add the dictionary containing the rest of the config keys to the
        # accumulated dictionary
        accumulate.update(self.rest)
        return accumulate

    def formatted(self, parameters: ParametersIR) -> ConfigIR:
        """Return a new ConfigIR object that is formatted according to the
        specified parameters

        Parameters
        ----------
        parameters : ParametersIR
            Object that contains variable mappings used in substitution.

        Returns
        -------
        config : ConfigIR
            A new ConfigIR object formatted with the input parameters
        """
        new_config = copy.deepcopy(self)
        for key, value in new_config.rest.items():
            if not isinstance(value, str):
                continue
            match = re.match("parameters[.](.*)", value)
            if match and match.group(1) in parameters:
                new_config.rest[key] = parameters[match.group(1)]
            if match and match.group(1) not in parameters:
                warnings.warn(
                    f"config {key} contains value {match.group(0)} which is formatted like a "
                    "Pipeline parameter but was not found within the Pipeline; if this was not "
                    "intentional, check for a typo"
                )
        return new_config
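
    # A short sketch of the substitution (all names here are made up):
    #
    #     params = ParametersIR({"shared_value": 14})
    #     config = ConfigIR(rest={"field1": "parameters.shared_value"})
    #     config.formatted(params).rest  # -> {"field1": 14}
    #
    # Values that do not match the ``parameters.<name>`` pattern pass through
    # unchanged.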

    def maybe_merge(self, other_config: "ConfigIR") -> Generator["ConfigIR", None, None]:
        """Merge another instance of a `ConfigIR` into this instance if
        possible. This function returns a generator that yields only self
        if the configs were merged, or self and other_config if they could
        not be merged.

        Parameters
        ----------
        other_config : `ConfigIR`
            An instance of `ConfigIR` to merge into this instance.

        Returns
        -------
        Generator : `ConfigIR`
            A generator yielding just self if the configs were merged, or
            self and other_config if they were not.
        """
        # Verify that the config blocks can be merged
        if (
            self.dataId != other_config.dataId
            or self.python
            or other_config.python
            or self.file
            or other_config.file
        ):
            yield from (self, other_config)
            return

        # Find the keys shared by both configs, and verify that no shared key
        # has different values
        key_union = self.rest.keys() & other_config.rest.keys()
        for key in key_union:
            if self.rest[key] != other_config.rest[key]:
                yield from (self, other_config)
                return
        self.rest.update(other_config.rest)

        # Combine the lists of override files to load
        self_file_set = set(self.file)
        other_file_set = set(other_config.file)
        self.file = list(self_file_set.union(other_file_set))

        yield self
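
    # For instance (hypothetical values), two plain value-override blocks
    # with disjoint keys collapse into one:
    #
    #     a = ConfigIR(rest={"field1": 1})
    #     b = ConfigIR(rest={"field2": 2})
    #     list(a.maybe_merge(b))  # -> [a], a.rest == {"field1": 1, "field2": 2}
    #
    # whereas any python block, file list, or differing dataId yields both
    # blocks unmerged.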

    def __eq__(self, other: object) -> bool:
        if not isinstance(other, ConfigIR):
            return False
        elif all(
            getattr(self, attr) == getattr(other, attr) for attr in ("python", "dataId", "file", "rest")
        ):
            return True
        else:
            return False


@dataclass
class TaskIR:
    """Intermediate representation of tasks read from a pipeline yaml file."""

    label: str
    """An identifier used to refer to a task.
    """
    klass: str
    """A string containing a fully qualified python class to be run in a
    pipeline.
    """
    config: Union[List[ConfigIR], None] = None
    """List of all config overrides associated with this task, and may be
    `None` if there are no config overrides.
    """

    def to_primitives(self) -> Dict[str, Union[str, List[dict]]]:
        """Convert to a representation used in yaml serialization"""
        accumulate: Dict[str, Union[str, List[dict]]] = {"class": self.klass}
        if self.config:
            accumulate["config"] = [c.to_primitives() for c in self.config]
        return accumulate

    def add_or_update_config(self, other_config: ConfigIR) -> None:
        """Add a `ConfigIR` to this task if one is not present. Merges configs
        if there is a `ConfigIR` present and the dataId keys of both configs
        match, otherwise adds a new entry to the config list. The exception to
        the above is that if either the last config or other_config has a
        python block, then other_config is always added, as python blocks can
        modify configs in ways that cannot be predicted.

        Parameters
        ----------
        other_config : `ConfigIR`
            A `ConfigIR` instance to add or merge into the config attribute of
            this task.
        """
        if not self.config:
            self.config = [other_config]
            return
        self.config.extend(self.config.pop().maybe_merge(other_config))

    def __eq__(self, other: object) -> bool:
        if not isinstance(other, TaskIR):
            return False
        elif all(getattr(self, attr) == getattr(other, attr) for attr in ("label", "klass", "config")):
            return True
        else:
            return False


@dataclass
class ImportIR:
    """An intermediate representation of imported pipelines"""

    location: str
    """This is the location of the pipeline to inherit. The path should be
    specified as an absolute path. Environment variables may be used in the
    path and should be specified as a python string template, with the name of
    the environment variable inside braces.
    """
    include: Union[List[str], None] = None
    """List of tasks that should be included when inheriting this pipeline.
    Either the include or exclude attributes may be specified, but not both.
    """
    exclude: Union[List[str], None] = None
    """List of tasks that should be excluded when inheriting this pipeline.
    Either the include or exclude attributes may be specified, but not both.
    """
    importContracts: bool = True
    """Boolean attribute to dictate if contracts should be inherited with the
    pipeline or not.
    """
    instrument: Union[Literal[_Tags.KeepInstrument], str, None] = _Tags.KeepInstrument
    """Instrument to assign to the Pipeline at import. The default value of
    ``_Tags.KeepInstrument`` indicates that whatever instrument the pipeline
    is declared with will not be modified. Setting this value to None will
    drop any declared instrument prior to import.
    """

    def toPipelineIR(self) -> "PipelineIR":
        """Load in the Pipeline specified by this object, and turn it into a
        PipelineIR instance.

        Returns
        -------
        pipeline : `PipelineIR`
            A pipeline generated from the imported pipeline file
        """
        if self.include and self.exclude:
            raise ValueError(
                "Both an include and an exclude list cannot be specified when declaring a pipeline import"
            )
        tmp_pipeline = PipelineIR.from_uri(os.path.expandvars(self.location))
        if self.instrument is not _Tags.KeepInstrument:
            tmp_pipeline.instrument = self.instrument

        included_labels = set()
        for label in tmp_pipeline.tasks:
            if (
                (self.include and label in self.include)
                or (self.exclude and label not in self.exclude)
                or (self.include is None and self.exclude is None)
            ):
                included_labels.add(label)

        # Handle labeled subsets being specified in the include or exclude
        # list, adding or removing labels.
        if self.include is not None:
            subsets_in_include = tmp_pipeline.labeled_subsets.keys() & self.include
            for label in subsets_in_include:
                included_labels.update(tmp_pipeline.labeled_subsets[label].subset)

        elif self.exclude is not None:
            subsets_in_exclude = tmp_pipeline.labeled_subsets.keys() & self.exclude
            for label in subsets_in_exclude:
                included_labels.difference_update(tmp_pipeline.labeled_subsets[label].subset)

        tmp_pipeline = tmp_pipeline.subset_from_labels(included_labels)

        if not self.importContracts:
            tmp_pipeline.contracts = []

        return tmp_pipeline

    def __eq__(self, other: object) -> bool:
        if not isinstance(other, ImportIR):
            return False
        elif all(
            getattr(self, attr) == getattr(other, attr)
            for attr in ("location", "include", "exclude", "importContracts")
        ):
            return True
        else:
            return False
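
# An imports section like the following (paths and labels invented for
# illustration) maps onto one ImportIR per entry:
#
#     imports:
#       - location: ${PIPE_DIR}/pipelines/base.yaml
#         exclude: [taskA]
#         importContracts: false
#
# Here os.path.expandvars resolves ${PIPE_DIR} before the file is loaded.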


class PipelineIR:
    """Intermediate representation of a pipeline definition

    Parameters
    ----------
    loaded_yaml : `dict`
        A dictionary which matches the structure that would be produced by a
        yaml reader which parses a pipeline definition document

    Raises
    ------
    ValueError
        Raised if:
        - a pipeline is declared without a description;
        - no tasks are declared in a pipeline, and no pipelines are to be
          inherited;
        - more than one instrument is specified;
        - more than one inherited pipeline share a label.
    """

    def __init__(self, loaded_yaml: Dict[str, Any]):
        # Check required fields are present
        if "description" not in loaded_yaml:
            raise ValueError("A pipeline must be declared with a description")
        if "tasks" not in loaded_yaml and len({"imports", "inherits"} - loaded_yaml.keys()) == 2:
            raise ValueError("A pipeline must be declared with one or more tasks")

        # These steps below must happen in this call order

        # Process pipeline description
        self.description = loaded_yaml.pop("description")

        # Process tasks
        self._read_tasks(loaded_yaml)

        # Process instrument keys
        inst = loaded_yaml.pop("instrument", None)
        if isinstance(inst, list):
            raise ValueError("Only one top level instrument can be defined in a pipeline")
        self.instrument: Optional[str] = inst

        # Process any contracts
        self._read_contracts(loaded_yaml)

        # Process any defined parameters
        self._read_parameters(loaded_yaml)

        # Process any named label subsets
        self._read_labeled_subsets(loaded_yaml)

        # Process any inherited pipelines
        self._read_imports(loaded_yaml)

        # verify named subsets, must be done after inheriting
        self._verify_labeled_subsets()

    def _read_contracts(self, loaded_yaml: Dict[str, Any]) -> None:
        """Process the contracts portion of the loaded yaml document

        Parameters
        ----------
        loaded_yaml : `dict`
            A dictionary which matches the structure that would be produced by
            a yaml reader which parses a pipeline definition document
        """
        loaded_contracts = loaded_yaml.pop("contracts", [])
        if isinstance(loaded_contracts, str):
            loaded_contracts = [loaded_contracts]
        self.contracts: List[ContractIR] = []
        for contract in loaded_contracts:
            if isinstance(contract, dict):
                self.contracts.append(ContractIR(**contract))
            if isinstance(contract, str):
                self.contracts.append(ContractIR(contract=contract))

    def _read_parameters(self, loaded_yaml: Dict[str, Any]) -> None:
        """Process the parameters portion of the loaded yaml document

        Parameters
        ----------
        loaded_yaml : `dict`
            A dictionary which matches the structure that would be produced by
            a yaml reader which parses a pipeline definition document
        """
        loaded_parameters = loaded_yaml.pop("parameters", {})
        if not isinstance(loaded_parameters, dict):
            raise ValueError("The parameters section must be a yaml mapping")
        self.parameters = ParametersIR(loaded_parameters)

    def _read_labeled_subsets(self, loaded_yaml: Dict[str, Any]) -> None:
        """Process the subsets portion of the loaded yaml document

        Parameters
        ----------
        loaded_yaml : `dict`
            A dictionary which matches the structure that would be produced
            by a yaml reader which parses a pipeline definition document
        """
        loaded_subsets = loaded_yaml.pop("subsets", {})
        self.labeled_subsets: Dict[str, LabeledSubset] = {}
        if not loaded_subsets and "subset" in loaded_yaml:
            raise ValueError("Top level key should be subsets and not subset, add an s")
        for key, value in loaded_subsets.items():
            self.labeled_subsets[key] = LabeledSubset.from_primitives(key, value)

    def _verify_labeled_subsets(self) -> None:
        """Verifies that all the labels in each named subset exist within the
        pipeline.
        """
        # Verify that all labels defined in a labeled subset are in the
        # Pipeline
        for labeled_subset in self.labeled_subsets.values():
            if not labeled_subset.subset.issubset(self.tasks.keys()):
                raise ValueError(
                    f"Labels {labeled_subset.subset - self.tasks.keys()} were not found in the "
                    "declared pipeline"
                )
        # Verify subset labels are not already task labels
        label_intersection = self.labeled_subsets.keys() & self.tasks.keys()
        if label_intersection:
            raise ValueError(f"Labeled subsets can not use the same label as a task: {label_intersection}")

    def _read_imports(self, loaded_yaml: Dict[str, Any]) -> None:
        """Process the imports (or deprecated inherits) portion of the loaded
        yaml document

        Parameters
        ----------
        loaded_yaml : `dict`
            A dictionary which matches the structure that would be produced by
            a yaml reader which parses a pipeline definition document
        """

        def process_args(argument: Union[str, dict]) -> dict:
            if isinstance(argument, str):
                return {"location": argument}
            elif isinstance(argument, dict):
                if "exclude" in argument and isinstance(argument["exclude"], str):
                    argument["exclude"] = [argument["exclude"]]
                if "include" in argument and isinstance(argument["include"], str):
                    argument["include"] = [argument["include"]]
                if "instrument" in argument and argument["instrument"] == "None":
                    argument["instrument"] = None
                return argument

        if not {"inherits", "imports"} - loaded_yaml.keys():
            raise ValueError("Cannot define both inherits and imports sections, use imports")
        tmp_import = loaded_yaml.pop("inherits", None)
        if tmp_import is None:
            tmp_import = loaded_yaml.pop("imports", None)
        else:
            warnings.warn(
                "The 'inherits' key is deprecated, and will be "
                "removed around June 2021. Please use the key "
                "'imports' instead"
            )
        if tmp_import is None:
            self.imports: List[ImportIR] = []
        elif isinstance(tmp_import, list):
            self.imports = [ImportIR(**process_args(args)) for args in tmp_import]
        else:
            self.imports = [ImportIR(**process_args(tmp_import))]

        # integrate any imported pipelines
        accumulate_tasks: Dict[str, TaskIR] = {}
        accumulate_labeled_subsets: Dict[str, LabeledSubset] = {}
        accumulated_parameters = ParametersIR({})
        for other_pipeline in self.imports:
            tmp_IR = other_pipeline.toPipelineIR()
            if self.instrument is None:
                self.instrument = tmp_IR.instrument
            elif self.instrument != tmp_IR.instrument and tmp_IR.instrument is not None:
                msg = (
                    "Only one instrument can be declared in a pipeline or its imports. "
                    f"Top level pipeline defines {self.instrument} but {other_pipeline.location} "
                    f"defines {tmp_IR.instrument}."
                )
                raise ValueError(msg)
            if duplicate_labels := accumulate_tasks.keys() & tmp_IR.tasks.keys():
                msg = (
                    "Task labels in the imported pipelines must be unique. "
                    f"These labels appear multiple times: {duplicate_labels}"
                )
                raise ValueError(msg)
            accumulate_tasks.update(tmp_IR.tasks)
            self.contracts.extend(tmp_IR.contracts)
            # verify that tmp_IR has unique labels for named subsets among
            # existing labeled subsets, and with existing task labels.
            overlapping_subsets = accumulate_labeled_subsets.keys() & tmp_IR.labeled_subsets.keys()
            task_subset_overlap = (
                accumulate_labeled_subsets.keys() | tmp_IR.labeled_subsets.keys()
            ) & accumulate_tasks.keys()
            if overlapping_subsets or task_subset_overlap:
                raise ValueError(
                    "Labeled subset names must be unique amongst imports in both labels and "
                    f"named Subsets. Duplicate: {overlapping_subsets | task_subset_overlap}"
                )
            accumulate_labeled_subsets.update(tmp_IR.labeled_subsets)
            accumulated_parameters.update(tmp_IR.parameters)

        # verify that any accumulated labeled subsets don't clash with a label
        # from this pipeline
        if accumulate_labeled_subsets.keys() & self.tasks.keys():
            raise ValueError(
                "Labeled subset names must be unique amongst imports in both labels and named Subsets"
            )
        # merge in the named subsets for self so this document can override any
        # that have been declared
        accumulate_labeled_subsets.update(self.labeled_subsets)
        self.labeled_subsets = accumulate_labeled_subsets

        # merge the dict of label:TaskIR objects, preserving any configs in the
        # imported pipeline if the labels point to the same class
        for label, task in self.tasks.items():
            if label not in accumulate_tasks:
                accumulate_tasks[label] = task
            elif accumulate_tasks[label].klass == task.klass:
                if task.config is not None:
                    for config in task.config:
                        accumulate_tasks[label].add_or_update_config(config)
            else:
                accumulate_tasks[label] = task
        self.tasks: Dict[str, TaskIR] = accumulate_tasks
        accumulated_parameters.update(self.parameters)
        self.parameters = accumulated_parameters

    def _read_tasks(self, loaded_yaml: Dict[str, Any]) -> None:
        """Process the tasks portion of the loaded yaml document

        Parameters
        ----------
        loaded_yaml : `dict`
            A dictionary which matches the structure that would be produced by
            a yaml reader which parses a pipeline definition document
        """
        self.tasks = {}
        tmp_tasks = loaded_yaml.pop("tasks", None)
        if tmp_tasks is None:
            tmp_tasks = {}

        if "parameters" in tmp_tasks:
            raise ValueError("parameters is a reserved word and cannot be used as a task label")

        for label, definition in tmp_tasks.items():
            if isinstance(definition, str):
                definition = {"class": definition}
            config = definition.get("config", None)
            if config is None:
                task_config_ir = None
            else:
                if isinstance(config, dict):
                    config = [config]
                task_config_ir = []
                for c in config:
                    file = c.pop("file", None)
                    if file is None:
                        file = []
                    elif not isinstance(file, list):
                        file = [file]
                    task_config_ir.append(
                        ConfigIR(
                            python=c.pop("python", None), dataId=c.pop("dataId", None), file=file, rest=c
                        )
                    )
            self.tasks[label] = TaskIR(label, definition["class"], task_config_ir)
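
    # The tasks section accepts either a bare class name or a full mapping;
    # both of these illustrative entries end up as TaskIR instances:
    #
    #     tasks:
    #       taskA: module.TaskA
    #       taskB:
    #         class: module.TaskB
    #         config:
    #           field1: value1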

    def _remove_contracts(self, label: str) -> None:
        """Remove any contracts that contain the given label

        String comparison used in this way is not the most elegant and may
        have issues, but it is the only feasible way when users can specify
        contracts with generic strings.
        """
        new_contracts = []
        for contract in self.contracts:
            # match a label that is not preceded by an ASCII identifier
            # character, or is at the start of a line, and is followed by a
            # dot
            if re.match(f".*([^A-Za-z0-9_]|^){label}[.]", contract.contract):
                continue
            new_contracts.append(contract)
        self.contracts = new_contracts

    def subset_from_labels(self, labelSpecifier: Set[str]) -> PipelineIR:
        """Subset a pipelineIR to contain only labels specified in
        labelSpecifier.

        Parameters
        ----------
        labelSpecifier : `set` of `str`
            Set containing labels that describes how to subset a pipeline.

        Returns
        -------
        pipeline : `PipelineIR`
            A new pipelineIR object that is a subset of the old pipelineIR

        Raises
        ------
        ValueError
            Raised if there is an issue with specified labels

        Notes
        -----
        This method attempts to prune any contracts that contain labels which
        are not in the declared subset of labels. This pruning is done using
        string-based matching due to the nature of contracts, and may prune
        more than it should. Any labeled subsets defined that no longer have
        all members of the subset present in the pipeline will be removed from
        the resulting pipeline.
        """

        pipeline = copy.deepcopy(self)

        # update the label specifier to expand any named subsets
        toRemove = set()
        toAdd = set()
        for label in labelSpecifier:
            if label in pipeline.labeled_subsets:
                toRemove.add(label)
                toAdd.update(pipeline.labeled_subsets[label].subset)
        labelSpecifier.difference_update(toRemove)
        labelSpecifier.update(toAdd)
        # verify all the labels are in the pipeline
        if not labelSpecifier.issubset(pipeline.tasks.keys() | pipeline.labeled_subsets):
            difference = labelSpecifier.difference(pipeline.tasks.keys())
            raise ValueError(
                "Not all supplied labels (specified or named subsets) are in the pipeline "
                f"definition, extra labels: {difference}"
            )
        # copy needed so as to not modify while iterating
        pipeline_labels = set(pipeline.tasks.keys())
        # Remove the labels from the pipelineIR, and any contracts that
        # contain those labels (see docstring on _remove_contracts for why
        # this may cause issues)
        for label in pipeline_labels:
            if label not in labelSpecifier:
                pipeline.tasks.pop(label)
                pipeline._remove_contracts(label)

        # create a copy of the object to iterate over
        labeled_subsets = copy.copy(pipeline.labeled_subsets)
        # remove any labeled subsets that no longer have a complete set
        for label, labeled_subset in labeled_subsets.items():
            if labeled_subset.subset - pipeline.tasks.keys():
                pipeline.labeled_subsets.pop(label)

        return pipeline

    @classmethod
    def from_string(cls, pipeline_string: str) -> PipelineIR:
        """Create a `PipelineIR` object from a string formatted like a
        pipeline document

        Parameters
        ----------
        pipeline_string : `str`
            A string that is formatted like a pipeline document
        """
        loaded_yaml = yaml.load(pipeline_string, Loader=PipelineYamlLoader)
        return cls(loaded_yaml)
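
    # A minimal round trip, using an invented two-task document:
    #
    #     pipeline = PipelineIR.from_string(
    #         "description: demo\n"
    #         "tasks:\n"
    #         "  taskA: module.TaskA\n"
    #         "  taskB: module.TaskB\n"
    #     )
    #     print(pipeline)  # dumps the yaml form back out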

    @classmethod
    @deprecated(
        reason="This has been replaced with `from_uri`. Will be removed after v23.",
        version="v21.0",
        category=FutureWarning,
    )
    def from_file(cls, filename: str) -> PipelineIR:
        """Create a `PipelineIR` object from the document specified by the
        input path.

        Parameters
        ----------
        filename : `str`
            Location of document to use in creating a `PipelineIR` object.

        Returns
        -------
        pipelineIR : `PipelineIR`
            The loaded pipeline

        Notes
        -----
        This method is deprecated, please use from_uri
        """
        return cls.from_uri(filename)

    @classmethod
    def from_uri(cls, uri: ResourcePathExpression) -> PipelineIR:
        """Create a `PipelineIR` object from the document specified by the
        input uri.

        Parameters
        ----------
        uri : convertible to `ResourcePath`
            Location of document to use in creating a `PipelineIR` object.

        Returns
        -------
        pipelineIR : `PipelineIR`
            The loaded pipeline
        """
        loaded_uri = ResourcePath(uri)
        with loaded_uri.open("r") as buffer:
            loaded_yaml = yaml.load(buffer, Loader=PipelineYamlLoader)
            return cls(loaded_yaml)

    @deprecated(
        reason="This has been replaced with `write_to_uri`. Will be removed after v23.",
        version="v21.0",
        category=FutureWarning,
    )  # type: ignore
    def to_file(self, filename: str):
        """Serialize this `PipelineIR` object into a yaml formatted string and
        write the output to a file at the specified path.

        Parameters
        ----------
        filename : `str`
            Location of document to write a `PipelineIR` object.
        """
        self.write_to_uri(filename)

    def write_to_uri(
        self,
        uri: ResourcePathExpression,
    ) -> None:
        """Serialize this `PipelineIR` object into a yaml formatted string and
        write the output to a file at the specified uri.

        Parameters
        ----------
        uri : convertible to `ResourcePath`
            Location of document to write a `PipelineIR` object.
        """
        with ResourcePath(uri).open("w") as buffer:
            yaml.dump(self.to_primitives(), buffer, sort_keys=False, Dumper=MultilineStringDumper)

    def to_primitives(self) -> Dict[str, Any]:
        """Convert to a representation used in yaml serialization"""
        accumulate = {"description": self.description}
        if self.instrument is not None:
            accumulate["instrument"] = self.instrument
        if self.parameters:
            accumulate["parameters"] = self._sort_by_str(self.parameters.to_primitives())
        accumulate["tasks"] = {m: t.to_primitives() for m, t in self.tasks.items()}
        if len(self.contracts) > 0:
            # sort contracts in lexicographical order by the contract string,
            # in the absence of any other ordering principle
            contracts_list = [c.to_primitives() for c in self.contracts]
            contracts_list.sort(key=lambda x: x["contract"])
            accumulate["contracts"] = contracts_list
        if self.labeled_subsets:
            accumulate["subsets"] = self._sort_by_str(
                {k: v.to_primitives() for k, v in self.labeled_subsets.items()}
            )
        return accumulate

    def reorder_tasks(self, task_labels: List[str]) -> None:
        """Change the order in which tasks are stored internally. Useful for
        determining the order in which things will appear in the serialized
        (or printed) form.

        Parameters
        ----------
        task_labels : `list` of `str`
            A list containing all the labels in the pipeline, inserted in the
            order the tasks are to be stored.

        Raises
        ------
        KeyError
            Raised if labels are supplied that are not in the pipeline, or if
            not all labels in the pipeline were supplied in the task_labels
            input.
        """
        # verify that all labels are in the input
        _tmp_set = set(task_labels)
        if remainder := (self.tasks.keys() - _tmp_set):
            raise KeyError(f"Label(s) {remainder} are missing from the task label list")
        if extra := (_tmp_set - self.tasks.keys()):
            raise KeyError(f"Extra label(s) {extra} were in the input and are not in the pipeline")

        newTasks = {key: self.tasks[key] for key in task_labels}
        self.tasks = newTasks

    @staticmethod
    def _sort_by_str(arg: Mapping[str, Any]) -> Mapping[str, Any]:
        keys = sorted(arg.keys())
        return {key: arg[key] for key in keys}

    def __str__(self) -> str:
        """Instance formatting as how it would look in yaml representation"""
        return yaml.dump(self.to_primitives(), sort_keys=False, Dumper=MultilineStringDumper)

    def __repr__(self) -> str:
        """Instance formatting as how it would look in yaml representation"""
        return str(self)

    def __eq__(self, other: object) -> bool:
        if not isinstance(other, PipelineIR):
            return False
        # special case contracts because it is a list, but order is not
        # important
        elif (
            all(
                getattr(self, attr) == getattr(other, attr)
                for attr in ("tasks", "instrument", "labeled_subsets", "parameters")
            )
            and len(self.contracts) == len(other.contracts)
            and all(c in self.contracts for c in other.contracts)
        ):
            return True
        else:
            return False