Coverage for python/lsst/pipe/base/pipelineIR.py: 21%
407 statements
# This file is part of pipe_base.
#
# Developed for the LSST Data Management System.
# This product includes software developed by the LSST Project
# (http://www.lsst.org).
# See the COPYRIGHT file at the top-level directory of this distribution
# for details of code ownership.
#
# This software is dual licensed under the GNU General Public License and also
# under a 3-clause BSD license. Recipients may choose which of these licenses
# to use; please see the files gpl-3.0.txt and/or bsd_license.txt,
# respectively. If you choose the GPL option then the following text applies
# (but note that there is still no warranty even if you opt for BSD instead):
#
# This program is free software: you can redistribute it and/or modify
# it under the terms of the GNU General Public License as published by
# the Free Software Foundation, either version 3 of the License, or
# (at your option) any later version.
#
# This program is distributed in the hope that it will be useful,
# but WITHOUT ANY WARRANTY; without even the implied warranty of
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
# GNU General Public License for more details.
#
# You should have received a copy of the GNU General Public License
# along with this program. If not, see <http://www.gnu.org/licenses/>.
from __future__ import annotations

__all__ = (
    "ConfigIR",
    "ContractError",
    "ContractIR",
    "ImportIR",
    "LabeledSubset",
    "ParametersIR",
    "PipelineIR",
    "TaskIR",
)

import copy
import enum
import os
import re
import warnings
from collections import Counter
from collections.abc import Generator, Hashable, Iterable, MutableMapping
from dataclasses import dataclass, field
from typing import Any, Literal

import yaml
from lsst.resources import ResourcePath, ResourcePathExpression
from lsst.utils.introspection import find_outside_stacklevel


class PipelineSubsetCtrl(enum.Enum):
    """An enumeration of the ways a pipeline subsetting operation may handle
    labeled subsets when the task labels they reference are missing.
    """

    DROP = enum.auto()
    """Drop any subsets that contain labels which are no longer in the set of
    task labels when subsetting an entire pipeline.
    """
    EDIT = enum.auto()
    """Edit any subsets that contain labels which are no longer in the set of
    task labels to remove the missing label, but keep the subset itself when
    subsetting a pipeline.
    """


class _Tags(enum.Enum):
    KeepInstrument = enum.auto()


class PipelineYamlLoader(yaml.SafeLoader):
    """Specialized version of yaml's SafeLoader.

    It checks for, and raises an exception on, multiple instances of the same
    key at a given scope in a pipeline file.
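
    Examples
    --------
    Duplicate keys at the same scope are rejected; the document below is a
    minimal illustration::

        >>> yaml.load("{a: 1, a: 2}", Loader=PipelineYamlLoader)
        Traceback (most recent call last):
            ...
        KeyError: "Pipeline files must not have duplicated keys, {'a'} appeared multiple times"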
80 """
82 def construct_mapping(self, node: yaml.MappingNode, deep: bool = False) -> dict[Hashable, Any]:
83 # do the call to super first so that it can do all the other forms of
84 # checking on this node. If you check the uniqueness of keys first
85 # it would save the work that super does in the case of a failure, but
86 # it might fail in the case that the node was the incorrect node due
87 # to a parsing error, and the resulting exception would be difficult to
88 # understand.
89 mapping = super().construct_mapping(node, deep)
90 # Check if there are any duplicate keys
91 all_keys = Counter(key_node.value for key_node, _ in node.value)
92 duplicates = {k for k, i in all_keys.items() if i != 1}
93 if duplicates:
94 raise KeyError(
95 f"Pipeline files must not have duplicated keys, {duplicates} appeared multiple times"
96 )
97 return mapping


class MultilineStringDumper(yaml.Dumper):
    """Custom YAML dumper that makes multi-line strings use the '|'
    continuation style instead of unreadable newlines and tons of quotes.

    The basic approach is taken from
    https://stackoverflow.com/questions/8640959/how-can-i-control-what-scalar-form-pyyaml-uses-for-my-data,
    but is written as a Dumper subclass to keep its effects non-global
    (unlike `yaml.add_representer`).
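
    Examples
    --------
    A minimal sketch; the mapping is illustrative::

        >>> out = yaml.dump({"code": "a = 1\\nb = 2\\n"}, Dumper=MultilineStringDumper)
        >>> print(out, end="")
        code: |
          a = 1
          b = 2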
108 """
110 def represent_scalar(self, tag: str, value: Any, style: str | None = None) -> yaml.ScalarNode:
111 if style is None and tag == "tag:yaml.org,2002:str" and len(value.splitlines()) > 1:
112 style = "|"
113 return super().represent_scalar(tag, value, style)


class ContractError(Exception):
    """An exception that is raised when a pipeline contract is not
    satisfied.
    """

    pass


@dataclass
class ContractIR:
    """Intermediate representation of configuration contracts read from a
    pipeline yaml file.
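
    Examples
    --------
    A minimal sketch; the contract expression is illustrative::

        >>> c = ContractIR(contract="taskA.field1 == taskB.field2", msg="fields must match")
        >>> c.to_primitives()
        {'contract': 'taskA.field1 == taskB.field2', 'msg': 'fields must match'}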
128 """
130 contract: str
131 """A string of python code representing one or more conditions on configs
132 in a pipeline. This code-as-string should, once evaluated, should be True
133 if the configs are fine, and False otherwise.
134 """
135 msg: str | None = None
136 """An optional message to be shown to the user if a contract fails
137 """
139 def to_primitives(self) -> dict[str, str]:
140 """Convert to a representation used in yaml serialization."""
141 accumulate = {"contract": self.contract}
142 if self.msg is not None:
143 accumulate["msg"] = self.msg
144 return accumulate
146 def __eq__(self, other: object) -> bool:
147 if not isinstance(other, ContractIR):
148 return False
149 return self.contract == other.contract and self.msg == other.msg


@dataclass
class LabeledSubset:
    """Intermediate representation of a named subset of task labels read from
    a pipeline yaml file.
    """

    label: str
    """The label used to identify the subset of task labels.
    """
    subset: set[str]
    """A set of task labels contained in this subset.
    """
    description: str | None
    """A description of what this subset of tasks is intended to do.
    """

    @staticmethod
    def from_primitives(label: str, value: list[str] | dict) -> LabeledSubset:
        """Generate `LabeledSubset` objects given a properly formatted object
        that has been created by a yaml loader.

        Parameters
        ----------
        label : `str`
            The label that will be used to identify this labeled subset.
        value : `list` of `str` or `dict`
            Object returned from loading a labeled subset section from a yaml
            document.

        Returns
        -------
        labeledSubset : `LabeledSubset`
            A `LabeledSubset` object built from the inputs.

        Raises
        ------
        ValueError
            Raised if the value input is not properly formatted for parsing.
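
        Examples
        --------
        A minimal sketch; the labels are illustrative::

            >>> ls = LabeledSubset.from_primitives(
            ...     "sub", {"subset": ["a", "b"], "description": "demo"}
            ... )
            >>> sorted(ls.subset), ls.description
            (['a', 'b'], 'demo')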
190 """
191 if isinstance(value, MutableMapping):
192 subset = value.pop("subset", None)
193 if subset is None:
194 raise ValueError(
195 "If a labeled subset is specified as a mapping, it must contain the key 'subset'"
196 )
197 description = value.pop("description", None)
198 elif isinstance(value, Iterable):
199 subset = value
200 description = None
201 else:
202 raise ValueError(
203 f"There was a problem parsing the labeled subset {label}, make sure the "
204 "definition is either a valid yaml list, or a mapping with keys "
205 "(subset, description) where subset points to a yaml list, and description is "
206 "associated with a string"
207 )
208 return LabeledSubset(label, set(subset), description)
210 def to_primitives(self) -> dict[str, list[str] | str]:
211 """Convert to a representation used in yaml serialization."""
212 accumulate: dict[str, list[str] | str] = {"subset": list(self.subset)}
213 if self.description is not None:
214 accumulate["description"] = self.description
215 return accumulate


@dataclass
class ParametersIR:
    """Intermediate representation of parameters that are global to a pipeline.

    Attributes
    ----------
    mapping : `dict` [`str`, `str`]
        A mutable mapping of identifiers as keys, and shared configuration
        as values.

    Notes
    -----
    These parameters are specified under a top level key named ``parameters``
    and are declared as a yaml mapping. These entries can then be used inside
    task configuration blocks to specify configuration values. They may not be
    used in the special ``file`` or ``python`` blocks.

    Examples
    --------
    .. code-block:: yaml

        parameters:
          shared_value: 14
        tasks:
          taskA:
            class: modA
            config:
              field1: parameters.shared_value
          taskB:
            class: modB
            config:
              field2: parameters.shared_value
    """

    mapping: MutableMapping[str, Any]
    """A mutable mapping of identifiers as keys, and shared configuration
    as values.
    """

    def update(self, other: ParametersIR | None) -> None:
        if other is not None:
            self.mapping.update(other.mapping)

    def to_primitives(self) -> MutableMapping[str, str]:
        """Convert to a representation used in yaml serialization."""
        return self.mapping

    def __contains__(self, value: str) -> bool:
        return value in self.mapping

    def __getitem__(self, item: str) -> Any:
        return self.mapping[item]

    def __bool__(self) -> bool:
        return bool(self.mapping)


@dataclass
class ConfigIR:
    """Intermediate representation of configurations read from a pipeline yaml
    file.
    """

    python: str | None = None
    """A string of python code that is used to modify a configuration. This
    can also be None if there are no modifications to do.
    """
    dataId: dict | None = None
    """A dataId that is used to constrain these config overrides to only
    quanta with matching dataIds. This field can be None if there is no
    constraint. This is currently an unimplemented feature, and is placed here
    for future use.
    """
    file: list[str] = field(default_factory=list)
    """A list of paths which point to files containing config overrides to be
    applied. This value may be an empty list if there are no overrides to
    apply.
    """
    rest: dict = field(default_factory=dict)
    """This is a dictionary of key value pairs, where the keys are strings
    corresponding to qualified fields on a config to override, and the values
    are strings representing the values to apply.
    """

    def to_primitives(self) -> dict[str, str | dict | list[str]]:
        """Convert to a representation used in yaml serialization."""
        accumulate = {}
        for name in ("python", "dataId", "file"):
            # If this attribute is truthy, add it to the accumulation
            # dictionary.
            if getattr(self, name):
                accumulate[name] = getattr(self, name)
        # Add the dictionary containing the rest of the config keys to the
        # accumulated dictionary.
        accumulate.update(self.rest)
        return accumulate

    def formatted(self, parameters: ParametersIR) -> ConfigIR:
        """Return a new ConfigIR object that is formatted according to the
        specified parameters.

        Parameters
        ----------
        parameters : `ParametersIR`
            Object that contains variable mappings used in substitution.

        Returns
        -------
        config : `ConfigIR`
            A new ConfigIR object formatted with the input parameters.
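
        Examples
        --------
        A minimal sketch; the parameter and field names are illustrative::

            >>> params = ParametersIR({"shared_value": 14})
            >>> cfg = ConfigIR(rest={"field1": "parameters.shared_value"})
            >>> cfg.formatted(params).rest
            {'field1': 14}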
328 """
329 new_config = copy.deepcopy(self)
330 for key, value in new_config.rest.items():
331 if not isinstance(value, str):
332 continue
333 match = re.match("parameters[.](.*)", value)
334 if match and match.group(1) in parameters:
335 new_config.rest[key] = parameters[match.group(1)]
336 if match and match.group(1) not in parameters:
337 warnings.warn(
338 f"config {key} contains value {match.group(0)} which is formatted like a "
339 "Pipeline parameter but was not found within the Pipeline, if this was not "
340 "intentional, check for a typo",
341 stacklevel=find_outside_stacklevel("lsst.pipe.base"),
342 )
343 return new_config

    def maybe_merge(self, other_config: "ConfigIR") -> Generator["ConfigIR", None, None]:
        """Merge another instance of a `ConfigIR` into this instance if
        possible. This function returns a generator that yields only self
        if the configs were merged, or self and other_config if they could
        not be merged.

        Parameters
        ----------
        other_config : `ConfigIR`
            An instance of `ConfigIR` to merge into this instance.

        Yields
        ------
        Generator : `ConfigIR`
            A generator containing either just self (if the configs could be
            merged) or self and other_config (if they could not be).
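
        Examples
        --------
        A minimal sketch; the override keys are illustrative::

            >>> a = ConfigIR(rest={"x": 1})
            >>> b = ConfigIR(rest={"y": 2})
            >>> merged = list(a.maybe_merge(b))
            >>> len(merged), merged[0].rest == {"x": 1, "y": 2}
            (1, True)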
361 """
362 # Verify that the config blocks can be merged
363 if (
364 self.dataId != other_config.dataId
365 or self.python
366 or other_config.python
367 or self.file
368 or other_config.file
369 ):
370 yield from (self, other_config)
371 return
373 # create a set of all keys, and verify two keys do not have different
374 # values
375 key_union = self.rest.keys() & other_config.rest.keys()
376 for key in key_union:
377 if self.rest[key] != other_config.rest[key]:
378 yield from (self, other_config)
379 return
380 self.rest.update(other_config.rest)
382 # Combine the lists of override files to load
383 self_file_set = set(self.file)
384 other_file_set = set(other_config.file)
385 self.file = list(self_file_set.union(other_file_set))
387 yield self

    def __eq__(self, other: object) -> bool:
        if not isinstance(other, ConfigIR):
            return False
        return all(
            getattr(self, attr) == getattr(other, attr) for attr in ("python", "dataId", "file", "rest")
        )


@dataclass
class TaskIR:
    """Intermediate representation of tasks read from a pipeline yaml file."""

    label: str
    """An identifier used to refer to a task.
    """
    klass: str
    """A string containing a fully qualified python class to be run in a
    pipeline.
    """
    config: list[ConfigIR] | None = None
    """A list of all config overrides associated with this task; may be
    `None` if there are no config overrides.
    """

    def to_primitives(self) -> dict[str, str | list[dict]]:
        """Convert to a representation used in yaml serialization."""
        accumulate: dict[str, str | list[dict]] = {"class": self.klass}
        if self.config:
            accumulate["config"] = [c.to_primitives() for c in self.config]
        return accumulate

    def add_or_update_config(self, other_config: ConfigIR) -> None:
        """Add a `ConfigIR` to this task if one is not present. Merge configs
        if there is a `ConfigIR` present and the dataId keys of both configs
        match; otherwise add a new entry to the config list. The exception to
        the above is that if either the last config or other_config has a
        python block, then other_config is always added, as python blocks can
        modify configs in ways that cannot be predicted.

        Parameters
        ----------
        other_config : `ConfigIR`
            A `ConfigIR` instance to add or merge into the config attribute of
            this task.
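
        Examples
        --------
        A minimal sketch; the label, class path, and override keys are
        illustrative::

            >>> t = TaskIR("t", "mod.Cls")
            >>> t.add_or_update_config(ConfigIR(rest={"a": 1}))
            >>> t.add_or_update_config(ConfigIR(rest={"b": 2}))
            >>> len(t.config), t.config[0].rest == {"a": 1, "b": 2}
            (1, True)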
433 """
434 if not self.config:
435 self.config = [other_config]
436 return
437 self.config.extend(self.config.pop().maybe_merge(other_config))
439 def __eq__(self, other: object) -> bool:
440 if not isinstance(other, TaskIR):
441 return False
442 return all(getattr(self, attr) == getattr(other, attr) for attr in ("label", "klass", "config"))


@dataclass
class ImportIR:
    """An intermediate representation of imported pipelines."""

    location: str
    """This is the location of the pipeline to inherit. The path should be
    specified as an absolute path. Environment variables may be used in the
    path and should be specified as a python string template, with the name of
    the environment variable inside braces.
    """
    include: list[str] | None = None
    """A list of tasks that should be included when inheriting this pipeline.
    Either the include or exclude attributes may be specified, but not both.
    """
    exclude: list[str] | None = None
    """A list of tasks that should be excluded when inheriting this pipeline.
    Either the include or exclude attributes may be specified, but not both.
    """
    importContracts: bool = True
    """Boolean attribute to dictate if contracts should be inherited with the
    pipeline or not.
    """
    labeledSubsetModifyMode: PipelineSubsetCtrl = PipelineSubsetCtrl.DROP
    """Controls how labeled subsets are handled when an import ends up not
    including (through either an include or an exclude list) a task label that
    is defined in the `Pipeline` being imported. DROP will remove any
    subsets which contain a missing label. EDIT will change any subsets to not
    include the missing label.
    """
    instrument: Literal[_Tags.KeepInstrument] | str | None = _Tags.KeepInstrument
    """Instrument to assign to the Pipeline at import. The default value of
    `_Tags.KeepInstrument` indicates that whatever instrument the pipeline is
    declared with will not be modified. Setting this value to None will drop
    any declared instrument prior to import.
    """

    def toPipelineIR(self) -> "PipelineIR":
        """Load the Pipeline specified by this object and turn it into a
        PipelineIR instance.

        Returns
        -------
        pipeline : `PipelineIR`
            A pipeline generated from the imported pipeline file.
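
        Examples
        --------
        A sketch of typical use; the location and label are hypothetical::

            ImportIR(
                location="${SOME_DIR}/pipeline.yaml",
                exclude=["taskA"],
            ).toPipelineIR()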
489 """
490 if self.include and self.exclude:
491 raise ValueError(
492 "An include list and an exclude list cannot both be specified"
493 " when declaring a pipeline import."
494 )
495 tmp_pipeline = PipelineIR.from_uri(os.path.expandvars(self.location))
496 if self.instrument is not _Tags.KeepInstrument:
497 tmp_pipeline.instrument = self.instrument
499 included_labels = set()
500 for label in tmp_pipeline.tasks:
501 if (
502 (self.include and label in self.include)
503 or (self.exclude and label not in self.exclude)
504 or (self.include is None and self.exclude is None)
505 ):
506 included_labels.add(label)
508 # Handle labeled subsets being specified in the include or exclude
509 # list, adding or removing labels.
510 if self.include is not None:
511 subsets_in_include = tmp_pipeline.labeled_subsets.keys() & self.include
512 for label in subsets_in_include:
513 included_labels.update(tmp_pipeline.labeled_subsets[label].subset)
515 elif self.exclude is not None:
516 subsets_in_exclude = tmp_pipeline.labeled_subsets.keys() & self.exclude
517 for label in subsets_in_exclude:
518 included_labels.difference_update(tmp_pipeline.labeled_subsets[label].subset)
520 tmp_pipeline = tmp_pipeline.subset_from_labels(included_labels, self.labeledSubsetModifyMode)
522 if not self.importContracts:
523 tmp_pipeline.contracts = []
525 return tmp_pipeline

    def __eq__(self, other: object) -> bool:
        if not isinstance(other, ImportIR):
            return False
        return all(
            getattr(self, attr) == getattr(other, attr)
            for attr in ("location", "include", "exclude", "importContracts")
        )


class PipelineIR:
    """Intermediate representation of a pipeline definition.

    Parameters
    ----------
    loaded_yaml : `dict`
        A dictionary which matches the structure that would be produced by a
        yaml reader which parses a pipeline definition document.

    Raises
    ------
    ValueError
        Raised if:

        - a pipeline is declared without a description;
        - no tasks are declared in a pipeline, and no pipelines are to be
          inherited;
        - more than one instrument is specified;
        - more than one inherited pipeline shares a label.
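
    Examples
    --------
    A minimal sketch; the task class path is illustrative::

        >>> p = PipelineIR.from_string("{description: demo, tasks: {taskA: test.TaskA}}")
        >>> list(p.tasks)
        ['taskA']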
555 """
557 def __init__(self, loaded_yaml: dict[str, Any]):
558 # Check required fields are present
559 if "description" not in loaded_yaml:
560 raise ValueError("A pipeline must be declared with a description")
561 if "tasks" not in loaded_yaml and len({"imports", "inherits"} - loaded_yaml.keys()) == 2:
562 raise ValueError("A pipeline must be declared with one or more tasks")
564 # These steps below must happen in this call order
566 # Process pipeline description
567 self.description = loaded_yaml.pop("description")
569 # Process tasks
570 self._read_tasks(loaded_yaml)
572 # Process instrument keys
573 inst = loaded_yaml.pop("instrument", None)
574 if isinstance(inst, list):
575 raise ValueError("Only one top level instrument can be defined in a pipeline")
576 self.instrument: str | None = inst
578 # Process any contracts
579 self._read_contracts(loaded_yaml)
581 # Process any defined parameters
582 self._read_parameters(loaded_yaml)
584 # Process any named label subsets
585 self._read_labeled_subsets(loaded_yaml)
587 # Process any inherited pipelines
588 self._read_imports(loaded_yaml)
590 # verify named subsets, must be done after inheriting
591 self._verify_labeled_subsets()

    def _read_contracts(self, loaded_yaml: dict[str, Any]) -> None:
        """Process the contracts portion of the loaded yaml document.

        Parameters
        ----------
        loaded_yaml : `dict`
            A dictionary which matches the structure that would be produced by
            a yaml reader which parses a pipeline definition document.
        """
        loaded_contracts = loaded_yaml.pop("contracts", [])
        if isinstance(loaded_contracts, str):
            loaded_contracts = [loaded_contracts]
        self.contracts: list[ContractIR] = []
        for contract in loaded_contracts:
            if isinstance(contract, dict):
                self.contracts.append(ContractIR(**contract))
            if isinstance(contract, str):
                self.contracts.append(ContractIR(contract=contract))

    def _read_parameters(self, loaded_yaml: dict[str, Any]) -> None:
        """Process the parameters portion of the loaded yaml document.

        Parameters
        ----------
        loaded_yaml : `dict`
            A dictionary which matches the structure that would be produced by
            a yaml reader which parses a pipeline definition document.
        """
        loaded_parameters = loaded_yaml.pop("parameters", {})
        if not isinstance(loaded_parameters, dict):
            raise ValueError("The parameters section must be a yaml mapping")
        self.parameters = ParametersIR(loaded_parameters)

    def _read_labeled_subsets(self, loaded_yaml: dict[str, Any]) -> None:
        """Process the subsets portion of the loaded yaml document.

        Parameters
        ----------
        loaded_yaml : `MutableMapping`
            A dictionary which matches the structure that would be produced
            by a yaml reader which parses a pipeline definition document.
        """
        loaded_subsets = loaded_yaml.pop("subsets", {})
        self.labeled_subsets: dict[str, LabeledSubset] = {}
        if not loaded_subsets and "subset" in loaded_yaml:
            raise ValueError("Top level key should be subsets and not subset, add an s")
        for key, value in loaded_subsets.items():
            self.labeled_subsets[key] = LabeledSubset.from_primitives(key, value)

    def _verify_labeled_subsets(self) -> None:
        """Verify that all the labels in each named subset exist within the
        pipeline.
        """
        # Verify that all labels defined in a labeled subset are in the
        # Pipeline.
        for labeled_subset in self.labeled_subsets.values():
            if not labeled_subset.subset.issubset(self.tasks.keys()):
                raise ValueError(
                    f"Labels {labeled_subset.subset - self.tasks.keys()} were not found in the "
                    "declared pipeline"
                )
        # Verify subset labels are not already task labels.
        label_intersection = self.labeled_subsets.keys() & self.tasks.keys()
        if label_intersection:
            raise ValueError(f"Labeled subsets can not use the same label as a task: {label_intersection}")

    def _read_imports(self, loaded_yaml: dict[str, Any]) -> None:
        """Process the imports portion of the loaded yaml document.

        Parameters
        ----------
        loaded_yaml : `dict`
            A dictionary which matches the structure that would be produced by
            a yaml reader which parses a pipeline definition document.
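
        Examples
        --------
        An ``imports`` section that this method accepts (the path and labels
        are hypothetical)::

            imports:
              - location: ${OTHER_DIR}/pipeline.yaml
                exclude: taskA
                labeledSubsetModifyMode: EDIT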
667 """
669 def process_args(argument: str | dict) -> dict:
670 if isinstance(argument, str):
671 return {"location": argument}
672 elif isinstance(argument, dict):
673 if "exclude" in argument and isinstance(argument["exclude"], str):
674 argument["exclude"] = [argument["exclude"]]
675 if "include" in argument and isinstance(argument["include"], str):
676 argument["include"] = [argument["include"]]
677 if "instrument" in argument and argument["instrument"] == "None":
678 argument["instrument"] = None
679 if "labeledSubsetModifyMode" in argument:
680 match argument["labeledSubsetModifyMode"]:
681 case "DROP":
682 argument["labeledSubsetModifyMode"] = PipelineSubsetCtrl.DROP
683 case "EDIT":
684 argument["labeledSubsetModifyMode"] = PipelineSubsetCtrl.EDIT
685 case unknown:
686 raise ValueError(f"{unknown} is not a valid mode for labeledSubsetModifyMode")
687 return argument
689 if not {"inherits", "imports"} - loaded_yaml.keys():
690 raise ValueError("Cannot define both inherits and imports sections, use imports")
691 tmp_import = loaded_yaml.pop("inherits", None)
692 if tmp_import is None:
693 tmp_import = loaded_yaml.pop("imports", None)
694 else:
695 raise ValueError("The 'inherits' key is not supported. Please use the key 'imports' instead")
696 if tmp_import is None:
697 self.imports: list[ImportIR] = []
698 elif isinstance(tmp_import, list):
699 self.imports = [ImportIR(**process_args(args)) for args in tmp_import]
700 else:
701 self.imports = [ImportIR(**process_args(tmp_import))]
703 self.merge_pipelines([fragment.toPipelineIR() for fragment in self.imports])

    def merge_pipelines(self, pipelines: Iterable[PipelineIR]) -> None:
        """Merge one or more other `PipelineIR` objects into this object.

        Parameters
        ----------
        pipelines : `~collections.abc.Iterable` of `PipelineIR` objects
            An `~collections.abc.Iterable` that contains one or more
            `PipelineIR` objects to merge into this object.

        Raises
        ------
        ValueError
            Raised if there is a conflict in instrument specifications.
            Raised if a task label appears in more than one of the input
            `PipelineIR` objects which are to be merged.
            Raised if a labeled subset appears in more than one of the input
            `PipelineIR` objects which are to be merged, or clashes with any
            subset existing in this object.
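
        Examples
        --------
        A minimal sketch; the class paths are illustrative::

            >>> base = PipelineIR.from_string("{description: a, tasks: {t1: mod.T1}}")
            >>> other = PipelineIR.from_string("{description: b, tasks: {t2: mod.T2}}")
            >>> base.merge_pipelines([other])
            >>> sorted(base.tasks)
            ['t1', 't2']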
723 """
724 # integrate any imported pipelines
725 accumulate_tasks: dict[str, TaskIR] = {}
726 accumulate_labeled_subsets: dict[str, LabeledSubset] = {}
727 accumulated_parameters = ParametersIR({})
729 for tmp_IR in pipelines:
730 if self.instrument is None:
731 self.instrument = tmp_IR.instrument
732 elif self.instrument != tmp_IR.instrument and tmp_IR.instrument is not None:
733 msg = (
734 "Only one instrument can be declared in a pipeline or its imports. "
735 f"Top level pipeline defines {self.instrument} but pipeline to merge "
736 f"defines {tmp_IR.instrument}."
737 )
738 raise ValueError(msg)
739 if duplicate_labels := accumulate_tasks.keys() & tmp_IR.tasks.keys():
740 msg = (
741 "Task labels in the imported pipelines must be unique. "
742 f"These labels appear multiple times: {duplicate_labels}"
743 )
744 raise ValueError(msg)
745 accumulate_tasks.update(tmp_IR.tasks)
746 self.contracts.extend(tmp_IR.contracts)
747 # verify that tmp_IR has unique labels for named subset among
748 # existing labeled subsets, and with existing task labels.
749 overlapping_subsets = accumulate_labeled_subsets.keys() & tmp_IR.labeled_subsets.keys()
750 task_subset_overlap = (
751 accumulate_labeled_subsets.keys() | tmp_IR.labeled_subsets.keys()
752 ) & accumulate_tasks.keys()
753 if overlapping_subsets or task_subset_overlap:
754 raise ValueError(
755 "Labeled subset names must be unique amongst imports in both labels and "
756 f" named Subsets. Duplicate: {overlapping_subsets | task_subset_overlap}"
757 )
758 accumulate_labeled_subsets.update(tmp_IR.labeled_subsets)
759 accumulated_parameters.update(tmp_IR.parameters)
761 # verify that any accumulated labeled subsets dont clash with a label
762 # from this pipeline
763 if accumulate_labeled_subsets.keys() & self.tasks.keys():
764 raise ValueError(
765 "Labeled subset names must be unique amongst imports in both labels and named Subsets"
766 )
767 # merge in the named subsets for self so this document can override any
768 # that have been delcared
769 accumulate_labeled_subsets.update(self.labeled_subsets)
770 self.labeled_subsets = accumulate_labeled_subsets
772 # merge the dict of label:TaskIR objects, preserving any configs in the
773 # imported pipeline if the labels point to the same class
774 for label, task in self.tasks.items():
775 if label not in accumulate_tasks:
776 accumulate_tasks[label] = task
777 elif accumulate_tasks[label].klass == task.klass:
778 if task.config is not None:
779 for config in task.config:
780 accumulate_tasks[label].add_or_update_config(config)
781 else:
782 accumulate_tasks[label] = task
783 self.tasks: dict[str, TaskIR] = accumulate_tasks
784 accumulated_parameters.update(self.parameters)
785 self.parameters = accumulated_parameters

    def _read_tasks(self, loaded_yaml: dict[str, Any]) -> None:
        """Process the tasks portion of the loaded yaml document.

        Parameters
        ----------
        loaded_yaml : `dict`
            A dictionary which matches the structure that would be produced by
            a yaml reader which parses a pipeline definition document.
        """
        self.tasks = {}
        tmp_tasks = loaded_yaml.pop("tasks", None)
        if tmp_tasks is None:
            tmp_tasks = {}

        if "parameters" in tmp_tasks:
            raise ValueError("parameters is a reserved word and cannot be used as a task label")

        for label, definition in tmp_tasks.items():
            if isinstance(definition, str):
                definition = {"class": definition}
            config = definition.get("config", None)
            if config is None:
                task_config_ir = None
            else:
                if isinstance(config, dict):
                    config = [config]
                task_config_ir = []
                for c in config:
                    file = c.pop("file", None)
                    if file is None:
                        file = []
                    elif not isinstance(file, list):
                        file = [file]
                    task_config_ir.append(
                        ConfigIR(
                            python=c.pop("python", None), dataId=c.pop("dataId", None), file=file, rest=c
                        )
                    )
            self.tasks[label] = TaskIR(label, definition["class"], task_config_ir)

    def _remove_contracts(self, label: str) -> None:
        """Remove any contracts that contain the given label.

        String comparison used in this way is not the most elegant and may
        have issues, but it is the only feasible way when users can specify
        contracts with generic strings.
        """
        new_contracts = []
        for contract in self.contracts:
            # Match a label that is not preceded by an ASCII identifier
            # character, or is at the start of a line, and is followed by a
            # dot.
            if re.match(f".*([^A-Za-z0-9_]|^){label}[.]", contract.contract):
                continue
            new_contracts.append(contract)
        self.contracts = new_contracts

    def subset_from_labels(
        self, labelSpecifier: set[str], subsetCtrl: PipelineSubsetCtrl = PipelineSubsetCtrl.DROP
    ) -> PipelineIR:
        """Subset a pipelineIR to contain only labels specified in
        labelSpecifier.

        Parameters
        ----------
        labelSpecifier : `set` of `str`
            Set containing labels that describe how to subset a pipeline.
        subsetCtrl : `PipelineSubsetCtrl`
            Control object which decides how subsets with missing labels are
            handled. Setting to `PipelineSubsetCtrl.DROP` (the default) will
            cause any subsets that have labels which are not in the set of all
            task labels to be dropped. Setting to `PipelineSubsetCtrl.EDIT`
            will cause the subset to instead be edited to remove the
            nonexistent label.

        Returns
        -------
        pipeline : `PipelineIR`
            A new pipelineIR object that is a subset of the old pipelineIR.

        Raises
        ------
        ValueError
            Raised if there is an issue with specified labels.

        Notes
        -----
        This method attempts to prune any contracts that contain labels which
        are not in the declared subset of labels. This pruning is done using
        string based matching due to the nature of contracts, and may prune
        more than it should.
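
        Examples
        --------
        A minimal sketch; the labels and class paths are illustrative::

            >>> p = PipelineIR.from_string(
            ...     "{description: demo, tasks: {a: mod.A, b: mod.B}, subsets: {group: [a, b]}}"
            ... )
            >>> sub = p.subset_from_labels({"a"}, PipelineSubsetCtrl.EDIT)
            >>> list(sub.tasks), sub.labeled_subsets["group"].subset
            (['a'], {'a'})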
877 """
878 pipeline = copy.deepcopy(self)
880 # update the label specifier to expand any named subsets
881 toRemove = set()
882 toAdd = set()
883 for label in labelSpecifier:
884 if label in pipeline.labeled_subsets:
885 toRemove.add(label)
886 toAdd.update(pipeline.labeled_subsets[label].subset)
887 labelSpecifier.difference_update(toRemove)
888 labelSpecifier.update(toAdd)
889 # verify all the labels are in the pipeline
890 if not labelSpecifier.issubset(pipeline.tasks.keys() | pipeline.labeled_subsets):
891 difference = labelSpecifier.difference(pipeline.tasks.keys())
892 raise ValueError(
893 "Not all supplied labels (specified or named subsets) are in the pipeline "
894 f"definition, extra labels: {difference}"
895 )
896 # copy needed so as to not modify while iterating
897 pipeline_labels = set(pipeline.tasks.keys())
898 # Remove the labels from the pipelineIR, and any contracts that contain
899 # those labels (see docstring on _remove_contracts for why this may
900 # cause issues)
901 for label in pipeline_labels:
902 if label not in labelSpecifier:
903 pipeline.tasks.pop(label)
904 pipeline._remove_contracts(label)
906 # create a copy of the object to iterate over
907 labeled_subsets = copy.copy(pipeline.labeled_subsets)
908 # remove any labeled subsets that no longer have a complete set
909 for label, labeled_subset in labeled_subsets.items():
910 if extraTaskLabels := (labeled_subset.subset - pipeline.tasks.keys()):
911 match subsetCtrl:
912 case PipelineSubsetCtrl.DROP:
913 pipeline.labeled_subsets.pop(label)
914 case PipelineSubsetCtrl.EDIT:
915 for extra in extraTaskLabels:
916 labeled_subset.subset.discard(extra)
918 return pipeline

    @classmethod
    def from_string(cls, pipeline_string: str) -> PipelineIR:
        """Create a `PipelineIR` object from a string formatted like a
        pipeline document.

        Parameters
        ----------
        pipeline_string : `str`
            A string that is formatted like a pipeline document.
        """
        loaded_yaml = yaml.load(pipeline_string, Loader=PipelineYamlLoader)
        return cls(loaded_yaml)

    @classmethod
    def from_uri(cls, uri: ResourcePathExpression) -> PipelineIR:
        """Create a `PipelineIR` object from the document specified by the
        input uri.

        Parameters
        ----------
        uri : convertible to `~lsst.resources.ResourcePath`
            Location of document to use in creating a `PipelineIR` object.

        Returns
        -------
        pipelineIR : `PipelineIR`
            The loaded pipeline.
        """
        loaded_uri = ResourcePath(uri)
        with loaded_uri.open("r") as buffer:
            loaded_yaml = yaml.load(buffer, Loader=PipelineYamlLoader)
            return cls(loaded_yaml)

    def write_to_uri(self, uri: ResourcePathExpression) -> None:
        """Serialize this `PipelineIR` object into a yaml formatted string and
        write the output to a file at the specified uri.

        Parameters
        ----------
        uri : convertible to `~lsst.resources.ResourcePath`
            Location of document to write a `PipelineIR` object.
        """
        with ResourcePath(uri).open("w") as buffer:
            yaml.dump(self.to_primitives(), buffer, sort_keys=False, Dumper=MultilineStringDumper)

    def to_primitives(self) -> dict[str, Any]:
        """Convert to a representation used in yaml serialization.

        Returns
        -------
        primitives : `dict`
            Dictionary that maps directly to the serialized YAML form.
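
        Examples
        --------
        A minimal sketch; the class path is illustrative::

            >>> p = PipelineIR.from_string("{description: demo, tasks: {a: mod.A}}")
            >>> p.to_primitives()["description"]
            'demo'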
972 """
973 accumulate = {"description": self.description}
974 if self.instrument is not None:
975 accumulate["instrument"] = self.instrument
976 if self.parameters:
977 accumulate["parameters"] = self.parameters.to_primitives()
978 accumulate["tasks"] = {m: t.to_primitives() for m, t in self.tasks.items()}
979 if len(self.contracts) > 0:
980 # sort contracts lexicographical order by the contract string in
981 # absence of any other ordering principle
982 contracts_list = [c.to_primitives() for c in self.contracts]
983 contracts_list.sort(key=lambda x: x["contract"])
984 accumulate["contracts"] = contracts_list
985 if self.labeled_subsets:
986 accumulate["subsets"] = {k: v.to_primitives() for k, v in self.labeled_subsets.items()}
987 return accumulate

    def __str__(self) -> str:
        """Instance formatting as how it would look in yaml representation."""
        return yaml.dump(self.to_primitives(), sort_keys=False, Dumper=MultilineStringDumper)

    def __repr__(self) -> str:
        """Instance formatting as how it would look in yaml representation."""
        return str(self)

    def __eq__(self, other: object) -> bool:
        if not isinstance(other, PipelineIR):
            return False
        # Special-case contracts because it is a list whose order is not
        # important.
        return (
            all(
                getattr(self, attr) == getattr(other, attr)
                for attr in ("tasks", "instrument", "labeled_subsets", "parameters")
            )
            and len(self.contracts) == len(other.contracts)
            and all(c in self.contracts for c in other.contracts)
        )