# This file is part of pipe_base.
#
# Developed for the LSST Data Management System.
# This product includes software developed by the LSST Project
# (http://www.lsst.org).
# See the COPYRIGHT file at the top-level directory of this distribution
# for details of code ownership.
#
# This software is dual licensed under the GNU General Public License and also
# under a 3-clause BSD license. Recipients may choose which of these licenses
# to use; please see the files gpl-3.0.txt and/or bsd_license.txt,
# respectively. If you choose the GPL option then the following text applies
# (but note that there is still no warranty even if you opt for BSD instead):
#
# This program is free software: you can redistribute it and/or modify
# it under the terms of the GNU General Public License as published by
# the Free Software Foundation, either version 3 of the License, or
# (at your option) any later version.
#
# This program is distributed in the hope that it will be useful,
# but WITHOUT ANY WARRANTY; without even the implied warranty of
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
# GNU General Public License for more details.
#
# You should have received a copy of the GNU General Public License
# along with this program. If not, see <http://www.gnu.org/licenses/>.
from __future__ import annotations

__all__ = (
    "ConfigIR",
    "ContractError",
    "ContractIR",
    "ImportIR",
    "LabeledSubset",
    "ParametersIR",
    "PipelineIR",
    "TaskIR",
)

import copy
import enum
import os
import re
import warnings
from collections import Counter
from collections.abc import Generator, Hashable, Iterable, MutableMapping
from dataclasses import dataclass, field
from typing import Any, Literal

import yaml
from lsst.resources import ResourcePath, ResourcePathExpression
from lsst.utils.introspection import find_outside_stacklevel


class PipelineSubsetCtrl(enum.Enum):
    """An enumeration of the ways a pipeline subsetting operation can handle
    labeled subsets whose task labels are missing from the subsetted pipeline.
    """

    DROP = enum.auto()
    """Drop any subsets that contain labels which are no longer in the set of
    task labels when subsetting an entire pipeline.
    """
    EDIT = enum.auto()
    """Edit any subsets that contain labels which are no longer in the set of
    task labels to remove the missing label, but leave the subset when
    subsetting a pipeline.
    """


class _Tags(enum.Enum):
    KeepInstrument = enum.auto()


class PipelineYamlLoader(yaml.SafeLoader):
    """Specialized version of yaml's SafeLoader.

    It checks for, and raises an exception on, multiple instances of the same
    key at any given scope inside a pipeline file.
    """

    def construct_mapping(self, node: yaml.MappingNode, deep: bool = False) -> dict[Hashable, Any]:
        # Call super first so that it can do all its other forms of checking
        # on this node. Checking key uniqueness first would save super's work
        # in the failure case, but if the node were the wrong node due to a
        # parsing error, the resulting exception would be difficult to
        # understand.
        mapping = super().construct_mapping(node, deep)
        # Check if there are any duplicate keys.
        all_keys = Counter(key_node.value for key_node, _ in node.value)
        duplicates = {k for k, i in all_keys.items() if i != 1}
        if duplicates:
            raise KeyError(
                f"Pipeline files must not have duplicated keys, {duplicates} appeared multiple times"
            )
        return mapping
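
    # Illustrative usage (a sketch; this loader is normally invoked through
    # `PipelineIR.from_string` or `PipelineIR.from_uri` rather than directly):
    #
    #   yaml.load("a: 1\nb: 2\na: 3", Loader=PipelineYamlLoader)
    #   # KeyError: the duplicated key {'a'} appeared multiple times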


class MultilineStringDumper(yaml.Dumper):
    """Custom YAML dumper that makes multi-line strings use the '|'
    continuation style instead of unreadable newlines and tons of quotes.

    Basic approach is taken from
    https://stackoverflow.com/questions/8640959/how-can-i-control-what-scalar-form-pyyaml-uses-for-my-data,
    but is written as a Dumper subclass to make its effects non-global (vs
    `yaml.add_representer`).
    """

    def represent_scalar(self, tag: str, value: Any, style: str | None = None) -> yaml.ScalarNode:
        if style is None and tag == "tag:yaml.org,2002:str" and len(value.splitlines()) > 1:
            style = "|"
        return super().represent_scalar(tag, value, style)
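
    # Sketch of the effect (assumed values):
    #
    #   yaml.dump({"python": "a = 1\nb = 2\n"}, Dumper=MultilineStringDumper)
    #   # python: |
    #   #   a = 1
    #   #   b = 2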


class ContractError(Exception):
    """An exception that is raised when a pipeline contract is not
    satisfied.
    """

    pass


@dataclass
class ContractIR:
    """Intermediate representation of configuration contracts read from a
    pipeline yaml file.
    """

    contract: str
    """A string of python code representing one or more conditions on configs
    in a pipeline. This code-as-string should, once evaluated, be True if the
    configs are fine, and False otherwise.
    """
    msg: str | None = None
    """An optional message to be shown to the user if a contract fails.
    """

    def to_primitives(self) -> dict[str, str]:
        """Convert to a representation used in yaml serialization."""
        accumulate = {"contract": self.contract}
        if self.msg is not None:
            accumulate["msg"] = self.msg
        return accumulate

    def __eq__(self, other: object) -> bool:
        if not isinstance(other, ContractIR):
            return False
        return self.contract == other.contract and self.msg == other.msg
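
    # An assumed example of how a contract appears in a pipeline document
    # (the task labels and field are hypothetical):
    #
    #   contracts:
    #     - contract: "taskA.doWrite == taskB.doWrite"
    #       msg: "taskA and taskB must agree about writing outputs"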


@dataclass
class LabeledSubset:
    """Intermediate representation of a named subset of task labels read from
    a pipeline yaml file.
    """

    label: str
    """The label used to identify the subset of task labels.
    """
    subset: set[str]
    """A set of task labels contained in this subset.
    """
    description: str | None
    """A description of what this subset of tasks is intended to do.
    """

    @staticmethod
    def from_primitives(label: str, value: list[str] | dict) -> LabeledSubset:
        """Generate `LabeledSubset` objects given a properly formatted object
        that has been created by a yaml loader.

        Parameters
        ----------
        label : `str`
            The label that will be used to identify this labeled subset.
        value : `list` of `str` or `dict`
            Object returned from loading a labeled subset section from a yaml
            document.

        Returns
        -------
        labeledSubset : `LabeledSubset`
            A `LabeledSubset` object built from the inputs.

        Raises
        ------
        ValueError
            Raised if the value input is not properly formatted for parsing.
        """
        if isinstance(value, MutableMapping):
            subset = value.pop("subset", None)
            if subset is None:
                raise ValueError(
                    "If a labeled subset is specified as a mapping, it must contain the key 'subset'"
                )
            description = value.pop("description", None)
        elif isinstance(value, Iterable):
            subset = value
            description = None
        else:
            raise ValueError(
                f"There was a problem parsing the labeled subset {label}; make sure the "
                "definition is either a valid yaml list, or a mapping with keys "
                "(subset, description), where subset points to a yaml list and description "
                "is associated with a string"
            )
        return LabeledSubset(label, set(subset), description)

    def to_primitives(self) -> dict[str, list[str] | str]:
        """Convert to a representation used in yaml serialization."""
        accumulate: dict[str, list[str] | str] = {"subset": list(self.subset)}
        if self.description is not None:
            accumulate["description"] = self.description
        return accumulate
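
    # The two document forms accepted by `from_primitives`, sketched with
    # hypothetical labels:
    #
    #   subsets:
    #     bareListSubset:
    #       - taskA
    #       - taskB
    #     mappingSubset:
    #       subset:
    #         - taskA
    #       description: A subset declared with an explicit description.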


@dataclass
class ParametersIR:
    """Intermediate representation of parameters that are global to a pipeline.

    Attributes
    ----------
    mapping : `dict` [`str`, `str`]
        A mutable mapping of identifiers as keys, and shared configuration
        as values.

    Notes
    -----
    These parameters are specified under a top level key named ``parameters``
    and are declared as a yaml mapping. These entries can then be used inside
    task configuration blocks to specify configuration values. They may not be
    used in the special ``file`` or ``python`` blocks.

    Examples
    --------
    .. code-block:: yaml

        parameters:
          shared_value: 14
        tasks:
          taskA:
            class: modA
            config:
              field1: parameters.shared_value
          taskB:
            class: modB
            config:
              field2: parameters.shared_value
    """

    mapping: MutableMapping[str, Any]
    """A mutable mapping of identifiers as keys, and shared configuration
    as values.
    """

    def update(self, other: ParametersIR | None) -> None:
        if other is not None:
            self.mapping.update(other.mapping)

    def to_primitives(self) -> MutableMapping[str, str]:
        """Convert to a representation used in yaml serialization."""
        return self.mapping

    def __contains__(self, value: str) -> bool:
        return value in self.mapping

    def __getitem__(self, item: str) -> Any:
        return self.mapping[item]

    def __bool__(self) -> bool:
        return bool(self.mapping)


@dataclass
class ConfigIR:
    """Intermediate representation of configurations read from a pipeline yaml
    file.
    """

    python: str | None = None
    """A string of python code that is used to modify a configuration. This can
    also be None if there are no modifications to do.
    """
    dataId: dict | None = None
    """A dataId that is used to constrain these config overrides to only quanta
    with matching dataIds. This field can be None if there is no constraint.
    This is currently an unimplemented feature, and is placed here for future
    use.
    """
    file: list[str] = field(default_factory=list)
    """A list of paths which point to files containing config overrides to be
    applied. This value may be an empty list if there are no overrides to
    apply.
    """
    rest: dict = field(default_factory=dict)
    """This is a dictionary of key value pairs, where the keys are strings
    corresponding to qualified fields on a config to override, and the values
    are strings representing the values to apply.
    """

    def to_primitives(self) -> dict[str, str | dict | list[str]]:
        """Convert to a representation used in yaml serialization."""
        accumulate = {}
        for name in ("python", "dataId", "file"):
            # If this attribute is truthy, add it to the accumulation
            # dictionary.
            if getattr(self, name):
                accumulate[name] = getattr(self, name)
        # Add the dictionary containing the rest of the config keys to the
        # accumulated dictionary.
        accumulate.update(self.rest)
        return accumulate

    def formatted(self, parameters: ParametersIR) -> ConfigIR:
        """Return a new ConfigIR object that is formatted according to the
        specified parameters.

        Parameters
        ----------
        parameters : `ParametersIR`
            Object that contains variable mappings used in substitution.

        Returns
        -------
        config : `ConfigIR`
            A new ConfigIR object formatted with the input parameters.
        """
        new_config = copy.deepcopy(self)
        for key, value in new_config.rest.items():
            if not isinstance(value, str):
                continue
            match = re.match("parameters[.](.*)", value)
            if match and match.group(1) in parameters:
                new_config.rest[key] = parameters[match.group(1)]
            if match and match.group(1) not in parameters:
                warnings.warn(
                    f"config {key} contains value {match.group(0)}, which is formatted like a "
                    "Pipeline parameter but was not found within the Pipeline; if this was not "
                    "intentional, check for a typo",
                    stacklevel=find_outside_stacklevel("lsst.pipe.base"),
                )
        return new_config
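
    # Sketch of parameter substitution (assumed values):
    #
    #   params = ParametersIR({"shared_value": 14})
    #   config = ConfigIR(rest={"field1": "parameters.shared_value"})
    #   config.formatted(params).rest
    #   # {'field1': 14}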

    def maybe_merge(self, other_config: "ConfigIR") -> Generator["ConfigIR", None, None]:
        """Merge another instance of a `ConfigIR` into this instance if
        possible. This function returns a generator that yields only self
        if the configs were merged, or self and other_config if they could
        not be merged.

        Parameters
        ----------
        other_config : `ConfigIR`
            An instance of `ConfigIR` to merge into this instance.

        Yields
        ------
        config : `ConfigIR`
            Either just self, if the configs were merged, or self followed by
            other_config if they could not be.
        """
        # Verify that the config blocks can be merged.
        if (
            self.dataId != other_config.dataId
            or self.python
            or other_config.python
            or self.file
            or other_config.file
        ):
            yield from (self, other_config)
            return

        # Find the keys shared by both configs, and verify that no shared key
        # has different values.
        shared_keys = self.rest.keys() & other_config.rest.keys()
        for key in shared_keys:
            if self.rest[key] != other_config.rest[key]:
                yield from (self, other_config)
                return
        self.rest.update(other_config.rest)

        # Combine the lists of override files to load.
        self_file_set = set(self.file)
        other_file_set = set(other_config.file)
        self.file = list(self_file_set.union(other_file_set))

        yield self
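
    # Sketch of the merge behavior (assumed values):
    #
    #   a = ConfigIR(rest={"x": "1"})
    #   b = ConfigIR(rest={"y": "2"})
    #   list(a.maybe_merge(b))  # [a], with a.rest == {'x': '1', 'y': '2'}
    #
    #   c = ConfigIR(python="config.x = 1")
    #   list(a.maybe_merge(c))  # [a, c]; python blocks are never merged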

    def __eq__(self, other: object) -> bool:
        if not isinstance(other, ConfigIR):
            return False
        return all(
            getattr(self, attr) == getattr(other, attr) for attr in ("python", "dataId", "file", "rest")
        )


@dataclass
class TaskIR:
    """Intermediate representation of tasks read from a pipeline yaml file."""

    label: str
    """An identifier used to refer to a task.
    """
    klass: str
    """A string containing a fully qualified python class to be run in a
    pipeline.
    """
    config: list[ConfigIR] | None = None
    """List of all config overrides associated with this task; may be
    `None` if there are no config overrides.
    """

    def to_primitives(self) -> dict[str, str | list[dict]]:
        """Convert to a representation used in yaml serialization."""
        accumulate: dict[str, str | list[dict]] = {"class": self.klass}
        if self.config:
            accumulate["config"] = [c.to_primitives() for c in self.config]
        return accumulate

    def add_or_update_config(self, other_config: ConfigIR) -> None:
        """Add a `ConfigIR` to this task if one is not present. If a
        `ConfigIR` is present and the dataId keys of both configs match, the
        configs are merged; otherwise a new entry is added to the config
        list. The exception to the above is that if either the last config or
        other_config has a python block, then other_config is always added, as
        python blocks can modify configs in ways that cannot be predicted.

        Parameters
        ----------
        other_config : `ConfigIR`
            A `ConfigIR` instance to add or merge into the config attribute of
            this task.
        """
        if not self.config:
            self.config = [other_config]
            return
        self.config.extend(self.config.pop().maybe_merge(other_config))

    def __eq__(self, other: object) -> bool:
        if not isinstance(other, TaskIR):
            return False
        return all(getattr(self, attr) == getattr(other, attr) for attr in ("label", "klass", "config"))
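
    # An assumed example of the document form this represents (the label,
    # class path, and field are hypothetical):
    #
    #   tasks:
    #     demoTask:
    #       class: mypackage.tasks.DemoTask
    #       config:
    #         - field1: value1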


@dataclass
class ImportIR:
    """An intermediate representation of imported pipelines."""

    location: str
    """This is the location of the pipeline to inherit. The path should be
    specified as an absolute path. Environment variables may be used in the
    path and should be specified as a python string template, with the name of
    the environment variable inside braces.
    """
    include: list[str] | None = None
    """List of tasks that should be included when inheriting this pipeline.
    Either the include or exclude attributes may be specified, but not both.
    """
    exclude: list[str] | None = None
    """List of tasks that should be excluded when inheriting this pipeline.
    Either the include or exclude attributes may be specified, but not both.
    """
    importContracts: bool = True
    """Boolean attribute to dictate if contracts should be inherited with the
    pipeline or not.
    """
    importSteps: bool = True
    """Boolean attribute to dictate if steps should be inherited with the
    pipeline or not.
    """
    labeledSubsetModifyMode: PipelineSubsetCtrl = PipelineSubsetCtrl.DROP
    """Controls how labeled subsets are handled when an import ends up not
    including (either through an include or exclude list) a task label that
    is defined in the `Pipeline` being imported. DROP will remove any
    subsets which contain a missing label. EDIT will change any subsets to not
    include the missing label.
    """
    instrument: Literal[_Tags.KeepInstrument] | str | None = _Tags.KeepInstrument
    """Instrument to assign to the Pipeline at import. The default value of
    ``_Tags.KeepInstrument`` indicates that whatever instrument the pipeline is
    declared with will not be modified. Setting this value to None will drop
    any declared instrument prior to import.
    """

    def toPipelineIR(self) -> "PipelineIR":
        """Load in the Pipeline specified by this object, and turn it into a
        PipelineIR instance.

        Returns
        -------
        pipeline : `PipelineIR`
            A pipeline generated from the imported pipeline file.
        """
        if self.include and self.exclude:
            raise ValueError(
                "An include list and an exclude list cannot both be specified"
                " when declaring a pipeline import."
            )
        tmp_pipeline = PipelineIR.from_uri(os.path.expandvars(self.location))
        if self.instrument is not _Tags.KeepInstrument:
            tmp_pipeline.instrument = self.instrument

        included_labels = set()
        for label in tmp_pipeline.tasks:
            if (
                (self.include and label in self.include)
                or (self.exclude and label not in self.exclude)
                or (self.include is None and self.exclude is None)
            ):
                included_labels.add(label)

        # Handle labeled subsets being specified in the include or exclude
        # list, adding or removing labels.
        if self.include is not None:
            subsets_in_include = tmp_pipeline.labeled_subsets.keys() & self.include
            for label in subsets_in_include:
                included_labels.update(tmp_pipeline.labeled_subsets[label].subset)

        elif self.exclude is not None:
            subsets_in_exclude = tmp_pipeline.labeled_subsets.keys() & self.exclude
            for label in subsets_in_exclude:
                included_labels.difference_update(tmp_pipeline.labeled_subsets[label].subset)

        if not self.importSteps:
            tmp_pipeline.steps = []

        tmp_pipeline = tmp_pipeline.subset_from_labels(included_labels, self.labeledSubsetModifyMode)

        if not self.importContracts:
            tmp_pipeline.contracts = []

        return tmp_pipeline
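
    # An assumed example of an import declaration in a pipeline document (the
    # path and label are hypothetical):
    #
    #   imports:
    #     - location: ${PIPELINES_DIR}/base_pipeline.yaml
    #       exclude:
    #         - taskA
    #       labeledSubsetModifyMode: EDIT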

    def __eq__(self, other: object) -> bool:
        if not isinstance(other, ImportIR):
            return False
        return all(
            getattr(self, attr) == getattr(other, attr)
            for attr in ("location", "include", "exclude", "importContracts")
        )


@dataclass
class StepIR:
    """Intermediate representation of a step definition."""

    label: str
    """The label associated with this step."""
    sharding_dimensions: list[str]
    """The dimensions to use when sharding this step."""


class PipelineIR:
    """Intermediate representation of a pipeline definition.

    Parameters
    ----------
    loaded_yaml : `dict`
        A dictionary which matches the structure that would be produced by a
        yaml reader which parses a pipeline definition document.

    Raises
    ------
    ValueError
        Raised if:

        - a pipeline is declared without a description;
        - no tasks are declared in a pipeline, and no pipelines are to be
          inherited;
        - more than one instrument is specified;
        - more than one inherited pipeline share a label.
    """

    def __init__(self, loaded_yaml: dict[str, Any]):
        # Check that required fields are present.
        if "description" not in loaded_yaml:
            raise ValueError("A pipeline must be declared with a description")
        if "tasks" not in loaded_yaml and len({"imports", "inherits"} - loaded_yaml.keys()) == 2:
            raise ValueError("A pipeline must be declared with one or more tasks")

        # The steps below must happen in this call order.

        # Process pipeline description.
        self.description = loaded_yaml.pop("description")

        # Process tasks.
        self._read_tasks(loaded_yaml)

        # Process instrument keys.
        inst = loaded_yaml.pop("instrument", None)
        if isinstance(inst, list):
            raise ValueError("Only one top level instrument can be defined in a pipeline")
        self.instrument: str | None = inst

        # Process any contracts.
        self._read_contracts(loaded_yaml)

        # Process any defined parameters.
        self._read_parameters(loaded_yaml)

        # Process any named label subsets.
        self._read_labeled_subsets(loaded_yaml)

        # Process any declared steps.
        self._read_step_declaration(loaded_yaml)

        # Process any inherited pipelines.
        self._read_imports(loaded_yaml)

        # Verify named subsets; must be done after inheriting.
        self._verify_labeled_subsets()

        # Verify steps; must be done after inheriting.
        self._verify_steps()

    def _read_contracts(self, loaded_yaml: dict[str, Any]) -> None:
        """Process the contracts portion of the loaded yaml document

        Parameters
        ----------
        loaded_yaml : `dict`
            A dictionary which matches the structure that would be produced by
            a yaml reader which parses a pipeline definition document
        """
        loaded_contracts = loaded_yaml.pop("contracts", [])
        if isinstance(loaded_contracts, str):
            loaded_contracts = [loaded_contracts]
        self.contracts: list[ContractIR] = []
        for contract in loaded_contracts:
            if isinstance(contract, dict):
                self.contracts.append(ContractIR(**contract))
            if isinstance(contract, str):
                self.contracts.append(ContractIR(contract=contract))

    def _read_parameters(self, loaded_yaml: dict[str, Any]) -> None:
        """Process the parameters portion of the loaded yaml document

        Parameters
        ----------
        loaded_yaml : `dict`
            A dictionary which matches the structure that would be produced by
            a yaml reader which parses a pipeline definition document
        """
        loaded_parameters = loaded_yaml.pop("parameters", {})
        if not isinstance(loaded_parameters, dict):
            raise ValueError("The parameters section must be a yaml mapping")
        self.parameters = ParametersIR(loaded_parameters)

    def _read_labeled_subsets(self, loaded_yaml: dict[str, Any]) -> None:
        """Process the subsets portion of the loaded yaml document

        Parameters
        ----------
        loaded_yaml : `dict`
            A dictionary which matches the structure that would be produced
            by a yaml reader which parses a pipeline definition document
        """
        loaded_subsets = loaded_yaml.pop("subsets", {})
        self.labeled_subsets: dict[str, LabeledSubset] = {}
        if not loaded_subsets and "subset" in loaded_yaml:
            raise ValueError("Top level key should be subsets and not subset, add an s")
        for key, value in loaded_subsets.items():
            self.labeled_subsets[key] = LabeledSubset.from_primitives(key, value)

    def _read_step_declaration(self, loaded_yaml: dict[str, Any]) -> None:
        """Process the steps portion of the loaded yaml document

        Steps are subsets that are declared to be normal parts of the overall
        processing of the pipeline. Not all subsets need to be a step, as they
        can exist for certain targeted processing, such as debugging.

        Parameters
        ----------
        loaded_yaml : `dict`
            A dictionary which matches the structure that would be produced
            by a yaml reader which parses a pipeline definition document
        """
        loaded_steps = loaded_yaml.pop("steps", [])
        temp_steps: dict[str, StepIR] = {}
        for declaration in loaded_steps:
            new_step = StepIR(**declaration)
            existing = temp_steps.setdefault(new_step.label, new_step)
            if existing is not new_step:
                raise ValueError(f"Step {existing.label} was declared twice.")
        self.steps = list(temp_steps.values())

    def _verify_labeled_subsets(self) -> None:
        """Verify that all the labels in each named subset exist within the
        pipeline.
        """
        # Verify that all labels defined in a labeled subset are in the
        # Pipeline.
        for labeled_subset in self.labeled_subsets.values():
            if not labeled_subset.subset.issubset(self.tasks.keys()):
                raise ValueError(
                    f"Labels {labeled_subset.subset - self.tasks.keys()} were not found in the "
                    "declared pipeline"
                )
        # Verify that subset labels are not already task labels.
        label_intersection = self.labeled_subsets.keys() & self.tasks.keys()
        if label_intersection:
            raise ValueError(f"Labeled subsets cannot use the same label as a task: {label_intersection}")

    def _verify_steps(self) -> None:
        """Verify that all step definitions have a corresponding labeled
        subset.
        """
        for step in self.steps:
            if step.label not in self.labeled_subsets:
                raise ValueError(
                    f"{step.label} was declared to be a step, but was not declared to be a labeled subset"
                )

    def _read_imports(self, loaded_yaml: dict[str, Any]) -> None:
        """Process the imports portion of the loaded yaml document

        Parameters
        ----------
        loaded_yaml : `dict`
            A dictionary which matches the structure that would be produced by
            a yaml reader which parses a pipeline definition document
        """

        def process_args(argument: str | dict) -> dict:
            if isinstance(argument, str):
                return {"location": argument}
            elif isinstance(argument, dict):
                if "exclude" in argument and isinstance(argument["exclude"], str):
                    argument["exclude"] = [argument["exclude"]]
                if "include" in argument and isinstance(argument["include"], str):
                    argument["include"] = [argument["include"]]
                if "instrument" in argument and argument["instrument"] == "None":
                    argument["instrument"] = None
                if "labeledSubsetModifyMode" in argument:
                    match argument["labeledSubsetModifyMode"]:
                        case "DROP":
                            argument["labeledSubsetModifyMode"] = PipelineSubsetCtrl.DROP
                        case "EDIT":
                            argument["labeledSubsetModifyMode"] = PipelineSubsetCtrl.EDIT
                        case unknown:
                            raise ValueError(f"{unknown} is not a valid mode for labeledSubsetModifyMode")
            return argument

        if not {"inherits", "imports"} - loaded_yaml.keys():
            raise ValueError("Cannot define both inherits and imports sections, use imports")
        tmp_import = loaded_yaml.pop("inherits", None)
        if tmp_import is None:
            tmp_import = loaded_yaml.pop("imports", None)
        else:
            raise ValueError("The 'inherits' key is not supported. Please use the key 'imports' instead")
        if tmp_import is None:
            self.imports: list[ImportIR] = []
        elif isinstance(tmp_import, list):
            self.imports = [ImportIR(**process_args(args)) for args in tmp_import]
        else:
            self.imports = [ImportIR(**process_args(tmp_import))]

        self.merge_pipelines([fragment.toPipelineIR() for fragment in self.imports])

    def merge_pipelines(self, pipelines: Iterable[PipelineIR]) -> None:
        """Merge one or more other `PipelineIR` objects into this object.

        Parameters
        ----------
        pipelines : `~collections.abc.Iterable` of `PipelineIR` objects
            An `~collections.abc.Iterable` that contains one or more
            `PipelineIR` objects to merge into this object.

        Raises
        ------
        ValueError
            Raised if there is a conflict in instrument specifications.
            Raised if a task label appears in more than one of the input
            `PipelineIR` objects which are to be merged.
            Raised if a labeled subset appears in more than one of the input
            `PipelineIR` objects which are to be merged, or if it clashes
            with any subset existing in this object.
        """
        # Integrate any imported pipelines.
        accumulate_tasks: dict[str, TaskIR] = {}
        accumulate_labeled_subsets: dict[str, LabeledSubset] = {}
        accumulated_parameters = ParametersIR({})
        accumulated_steps: dict[str, StepIR] = {}

        for tmp_IR in pipelines:
            if self.instrument is None:
                self.instrument = tmp_IR.instrument
            elif self.instrument != tmp_IR.instrument and tmp_IR.instrument is not None:
                msg = (
                    "Only one instrument can be declared in a pipeline or its imports. "
                    f"Top level pipeline defines {self.instrument} but pipeline to merge "
                    f"defines {tmp_IR.instrument}."
                )
                raise ValueError(msg)
            if duplicate_labels := accumulate_tasks.keys() & tmp_IR.tasks.keys():
                msg = (
                    "Task labels in the imported pipelines must be unique. "
                    f"These labels appear multiple times: {duplicate_labels}"
                )
                raise ValueError(msg)
            accumulate_tasks.update(tmp_IR.tasks)
            self.contracts.extend(tmp_IR.contracts)
            # Verify that tmp_IR's labeled subset names are unique among the
            # existing labeled subsets and the existing task labels.
            overlapping_subsets = accumulate_labeled_subsets.keys() & tmp_IR.labeled_subsets.keys()
            task_subset_overlap = (
                accumulate_labeled_subsets.keys() | tmp_IR.labeled_subsets.keys()
            ) & accumulate_tasks.keys()
            if overlapping_subsets or task_subset_overlap:
                raise ValueError(
                    "Labeled subset names must be unique amongst imports in both labels and "
                    f"named subsets. Duplicate: {overlapping_subsets | task_subset_overlap}"
                )
            accumulate_labeled_subsets.update(tmp_IR.labeled_subsets)
            accumulated_parameters.update(tmp_IR.parameters)
            for tmp_step in tmp_IR.steps:
                existing = accumulated_steps.setdefault(tmp_step.label, tmp_step)
                if existing != tmp_step:
                    raise ValueError(
                        f"There were conflicting step definitions in import {tmp_step}, {existing}"
                    )

        for tmp_step in self.steps:
            existing = accumulated_steps.setdefault(tmp_step.label, tmp_step)
            if existing != tmp_step:
                raise ValueError(f"There were conflicting step definitions in import {tmp_step}, {existing}")

        # Verify that any accumulated labeled subsets don't clash with a label
        # from this pipeline.
        if accumulate_labeled_subsets.keys() & self.tasks.keys():
            raise ValueError(
                "Labeled subset names must be unique amongst imports in both labels and named subsets"
            )
        # Merge in the named subsets for self, so this document can override
        # any that have been declared.
        accumulate_labeled_subsets.update(self.labeled_subsets)
        self.labeled_subsets = accumulate_labeled_subsets

        # Merge the dict of label:TaskIR objects, preserving any configs in the
        # imported pipeline if the labels point to the same class.
        for label, task in self.tasks.items():
            if label not in accumulate_tasks:
                accumulate_tasks[label] = task
            elif accumulate_tasks[label].klass == task.klass:
                if task.config is not None:
                    for config in task.config:
                        accumulate_tasks[label].add_or_update_config(config)
            else:
                accumulate_tasks[label] = task
        self.tasks: dict[str, TaskIR] = accumulate_tasks
        accumulated_parameters.update(self.parameters)
        self.parameters = accumulated_parameters
        self.steps = list(accumulated_steps.values())

    def _read_tasks(self, loaded_yaml: dict[str, Any]) -> None:
        """Process the tasks portion of the loaded yaml document

        Parameters
        ----------
        loaded_yaml : `dict`
            A dictionary which matches the structure that would be produced by
            a yaml reader which parses a pipeline definition document
        """
        self.tasks = {}
        tmp_tasks = loaded_yaml.pop("tasks", None)
        if tmp_tasks is None:
            tmp_tasks = {}

        if "parameters" in tmp_tasks:
            raise ValueError("parameters is a reserved word and cannot be used as a task label")

        for label, definition in tmp_tasks.items():
            if isinstance(definition, str):
                definition = {"class": definition}
            config = definition.get("config", None)
            if config is None:
                task_config_ir = None
            else:
                if isinstance(config, dict):
                    config = [config]
                task_config_ir = []
                for c in config:
                    file = c.pop("file", None)
                    if file is None:
                        file = []
                    elif not isinstance(file, list):
                        file = [file]
                    task_config_ir.append(
                        ConfigIR(
                            python=c.pop("python", None), dataId=c.pop("dataId", None), file=file, rest=c
                        )
                    )
            self.tasks[label] = TaskIR(label, definition["class"], task_config_ir)

    def _remove_contracts(self, label: str) -> None:
        """Remove any contracts that contain the given label

        String comparison used in this way is not the most elegant and may
        have issues, but it is the only feasible way when users can specify
        contracts with generic strings.
        """
        new_contracts = []
        for contract in self.contracts:
            # Match the label when it is not preceded by an ASCII identifier
            # character (or is at the start of the string) and is followed by
            # a dot.
            if re.match(f".*([^A-Za-z0-9_]|^){label}[.]", contract.contract):
                continue
            new_contracts.append(contract)
        self.contracts = new_contracts
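
    # Sketch of the matching behavior for label "taskA" (hypothetical
    # contract strings):
    #
    #   "taskA.doWrite == taskB.doWrite"  -> removed (label followed by a dot)
    #   "mytaskA.doWrite == True"         -> kept ("taskA" is only part of a
    #                                        longer identifier)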

    def subset_from_labels(
        self, labelSpecifier: set[str], subsetCtrl: PipelineSubsetCtrl = PipelineSubsetCtrl.DROP
    ) -> PipelineIR:
        """Subset a pipelineIR to contain only labels specified in
        labelSpecifier.

        Parameters
        ----------
        labelSpecifier : `set` of `str`
            Set containing labels that describes how to subset a pipeline.
        subsetCtrl : `PipelineSubsetCtrl`
            Control object which decides how subsets with missing labels are
            handled. Setting to `PipelineSubsetCtrl.DROP` (the default) will
            cause any subsets that have labels which are not in the set of all
            task labels to be dropped. Setting to `PipelineSubsetCtrl.EDIT`
            will cause the subset to instead be edited to remove the
            nonexistent label.

        Returns
        -------
        pipeline : `PipelineIR`
            A new pipelineIR object that is a subset of the old pipelineIR.

        Raises
        ------
        ValueError
            Raised if there is an issue with specified labels.

        Notes
        -----
        This method attempts to prune any contracts that contain labels which
        are not in the declared subset of labels. This pruning is done using a
        string based matching due to the nature of contracts and may prune more
        than it should.
        """
        pipeline = copy.deepcopy(self)

        # Update the label specifier to expand any named subsets.
        toRemove = set()
        toAdd = set()
        for label in labelSpecifier:
            if label in pipeline.labeled_subsets:
                toRemove.add(label)
                toAdd.update(pipeline.labeled_subsets[label].subset)
        labelSpecifier.difference_update(toRemove)
        labelSpecifier.update(toAdd)
        # Verify that all the labels are in the pipeline.
        if not labelSpecifier.issubset(pipeline.tasks.keys() | pipeline.labeled_subsets):
            difference = labelSpecifier.difference(pipeline.tasks.keys())
            raise ValueError(
                "Not all supplied labels (specified or named subsets) are in the pipeline "
                f"definition, extra labels: {difference}"
            )
        # Copy needed so as to not modify while iterating.
        pipeline_labels = set(pipeline.tasks.keys())
        # Remove the labels from the pipelineIR, and any contracts that contain
        # those labels (see docstring on _remove_contracts for why this may
        # cause issues).
        for label in pipeline_labels:
            if label not in labelSpecifier:
                pipeline.tasks.pop(label)
                pipeline._remove_contracts(label)

        # Create a copy of the object to iterate over.
        labeled_subsets = copy.copy(pipeline.labeled_subsets)
        # Remove or edit any labeled subsets that no longer have a complete
        # set of labels.
        for label, labeled_subset in labeled_subsets.items():
            if extraTaskLabels := (labeled_subset.subset - pipeline.tasks.keys()):
                match subsetCtrl:
                    case PipelineSubsetCtrl.DROP:
                        pipeline.labeled_subsets.pop(label)
                    case PipelineSubsetCtrl.EDIT:
                        for extra in extraTaskLabels:
                            labeled_subset.subset.discard(extra)

        # Remove any steps that correspond to removed subsets.
        new_steps = []
        for step in pipeline.steps:
            if step.label not in pipeline.labeled_subsets:
                continue
            new_steps.append(step)
        pipeline.steps = new_steps

        return pipeline
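
    # Sketch of a subsetting call (the label is hypothetical):
    #
    #   subset = pipeline_ir.subset_from_labels({"taskA"}, PipelineSubsetCtrl.EDIT)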

    @classmethod
    def from_string(cls, pipeline_string: str) -> PipelineIR:
        """Create a `PipelineIR` object from a string formatted like a
        pipeline document.

        Parameters
        ----------
        pipeline_string : `str`
            A string that is formatted like a pipeline document.

        Returns
        -------
        pipeline : `PipelineIR`
            The pipeline parsed from the input string.
        """
        loaded_yaml = yaml.load(pipeline_string, Loader=PipelineYamlLoader)
        return cls(loaded_yaml)
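
    # Sketch of a minimal valid document (label and class are hypothetical):
    #
    #   PipelineIR.from_string(
    #       "description: demo\n"
    #       "tasks:\n"
    #       "  demoTask: mypackage.tasks.DemoTask\n"
    #   )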

    @classmethod
    def from_uri(cls, uri: ResourcePathExpression) -> PipelineIR:
        """Create a `PipelineIR` object from the document specified by the
        input uri.

        Parameters
        ----------
        uri : convertible to `~lsst.resources.ResourcePath`
            Location of document to use in creating a `PipelineIR` object.

        Returns
        -------
        pipelineIR : `PipelineIR`
            The loaded pipeline.
        """
        loaded_uri = ResourcePath(uri)
        with loaded_uri.open("r") as buffer:
            loaded_yaml = yaml.load(buffer, Loader=PipelineYamlLoader)
            return cls(loaded_yaml)

    def write_to_uri(self, uri: ResourcePathExpression) -> None:
        """Serialize this `PipelineIR` object into a yaml formatted string and
        write the output to a file at the specified uri.

        Parameters
        ----------
        uri : convertible to `~lsst.resources.ResourcePath`
            Location where the serialized pipeline document will be written.
        """
        with ResourcePath(uri).open("w") as buffer:
            yaml.dump(self.to_primitives(), buffer, sort_keys=False, Dumper=MultilineStringDumper)

    def to_primitives(self) -> dict[str, Any]:
        """Convert to a representation used in yaml serialization.

        Returns
        -------
        primitives : `dict`
            Dictionary that maps directly to the serialized YAML form.
        """
        accumulate = {"description": self.description}
        if self.instrument is not None:
            accumulate["instrument"] = self.instrument
        if self.parameters:
            accumulate["parameters"] = self.parameters.to_primitives()
        accumulate["tasks"] = {m: t.to_primitives() for m, t in self.tasks.items()}
        if len(self.contracts) > 0:
            # Sort contracts into lexicographic order by the contract string,
            # in the absence of any other ordering principle.
            contracts_list = [c.to_primitives() for c in self.contracts]
            contracts_list.sort(key=lambda x: x["contract"])
            accumulate["contracts"] = contracts_list
        if self.labeled_subsets:
            accumulate["subsets"] = {k: v.to_primitives() for k, v in self.labeled_subsets.items()}
        return accumulate

    def __str__(self) -> str:
        """Instance formatting as how it would look in yaml representation."""
        return yaml.dump(self.to_primitives(), sort_keys=False, Dumper=MultilineStringDumper)

    def __repr__(self) -> str:
        """Instance formatting as how it would look in yaml representation."""
        return str(self)

    def __eq__(self, other: object) -> bool:
        if not isinstance(other, PipelineIR):
            return False
        # Special-case contracts because it is a list, but order is not
        # important.
        return (
            all(
                getattr(self, attr) == getattr(other, attr)
                for attr in ("tasks", "instrument", "labeled_subsets", "parameters")
            )
            and len(self.contracts) == len(other.contracts)
            and all(c in self.contracts for c in other.contracts)
        )