Coverage for python/lsst/pipe/base/pipeline.py: 21%
435 statements
coverage.py v6.5.0, created at 2023-06-06 10:05 +0000
1# This file is part of pipe_base.
2#
3# Developed for the LSST Data Management System.
4# This product includes software developed by the LSST Project
5# (http://www.lsst.org).
6# See the COPYRIGHT file at the top-level directory of this distribution
7# for details of code ownership.
8#
9# This program is free software: you can redistribute it and/or modify
10# it under the terms of the GNU General Public License as published by
11# the Free Software Foundation, either version 3 of the License, or
12# (at your option) any later version.
13#
14# This program is distributed in the hope that it will be useful,
15# but WITHOUT ANY WARRANTY; without even the implied warranty of
16# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
17# GNU General Public License for more details.
18#
19# You should have received a copy of the GNU General Public License
20# along with this program. If not, see <http://www.gnu.org/licenses/>.
21from __future__ import annotations
23"""Module defining Pipeline class and related methods.
24"""
26__all__ = ["Pipeline", "TaskDef", "TaskDatasetTypes", "PipelineDatasetTypes", "LabelSpecifier"]
28import copy
29import logging
30import re
31import urllib.parse
33# -------------------------------
34# Imports of standard modules --
35# -------------------------------
36from dataclasses import dataclass
37from types import MappingProxyType
38from typing import (
39 TYPE_CHECKING,
40 AbstractSet,
41 Callable,
42 ClassVar,
43 Dict,
44 Generator,
45 Iterable,
46 Iterator,
47 Mapping,
48 Optional,
49 Set,
50 Tuple,
51 Type,
52 Union,
53 cast,
54)
56# -----------------------------
57# Imports for other modules --
58from lsst.daf.butler import DatasetType, NamedValueSet, Registry, SkyPixDimension
59from lsst.resources import ResourcePath, ResourcePathExpression
60from lsst.utils import doImportType
61from lsst.utils.introspection import get_full_type_name
63from . import automatic_connection_constants as acc
64from . import pipelineIR, pipeTools
65from ._instrument import Instrument as PipeBaseInstrument
66from ._task_metadata import TaskMetadata
67from .config import PipelineTaskConfig
68from .connections import iterConnections
69from .connectionTypes import Input
70from .pipelineTask import PipelineTask
71from .task import _TASK_METADATA_TYPE
73if TYPE_CHECKING: # Imports needed only for type annotations; may be circular.
74 from lsst.obs.base import Instrument
75 from lsst.pex.config import Config
77# ----------------------------------
78# Local non-exported definitions --
79# ----------------------------------
81_LOG = logging.getLogger(__name__)
83# ------------------------
84# Exported definitions --
85# ------------------------
88@dataclass
89class LabelSpecifier:
90 """A structure to specify a subset of labels to load
92 This structure may contain a set of labels to be used in subsetting a
93 pipeline, or a beginning and end point. Beginning or end may be empty,
94 in which case the range will be a half open interval. Unlike python
95 iteration bounds, end bounds are *INCLUDED*. Note that range based
96 selection is not well defined for pipelines that are not linear in nature,
97 and correct behavior is not guaranteed, or may vary from run to run.
98 """
100 labels: Optional[Set[str]] = None
101 begin: Optional[str] = None
102 end: Optional[str] = None
104 def __post_init__(self) -> None:
105 if self.labels is not None and (self.begin or self.end):
106 raise ValueError(
107 "This struct can only be initialized with a labels set or a begin (and/or) end specifier"
108 )
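# A minimal usage sketch, not part of the library itself: constructing
# LabelSpecifier instances. The task labels "isr" and "calibrate" are
# hypothetical; exactly one of the two styles (an explicit label set, or a
# begin/end range) may be used per instance.
def _example_label_specifiers() -> tuple[LabelSpecifier, LabelSpecifier]:
    # Keep only the explicitly named labels.
    by_set = LabelSpecifier(labels={"isr", "calibrate"})
    # Keep the inclusive range of labels; either bound may be None.
    by_range = LabelSpecifier(begin="isr", end="calibrate")
    return by_set, by_range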
111class TaskDef:
112 """TaskDef is a collection of information about task needed by Pipeline.
114 The information includes task name, configuration object and optional
115 task class. This class is just a collection of attributes and it exposes
116 all of them so that attributes could potentially be modified in place
117 (e.g. if configuration needs extra overrides).
119 Attributes
120 ----------
121 taskName : `str`, optional
122 The fully-qualified `PipelineTask` class name. If not provided,
123 ``taskClass`` must be.
124 config : `lsst.pipe.base.config.PipelineTaskConfig`, optional
125 Instance of the configuration class corresponding to this task class,
126 usually with all overrides applied. This config will be frozen. If
127 not provided, ``taskClass`` must be provided and
128 ``taskClass.ConfigClass()`` will be used.
129 taskClass : `type`, optional
130 `PipelineTask` class object; if provided and ``taskName`` is as well,
131 the caller guarantees that they are consistent. If not provided,
132 ``taskName`` is used to import the type.
133 label : `str`, optional
134 Task label, usually a short string unique in a pipeline. If not
135 provided, ``taskClass`` must be, and ``taskClass._DefaultName`` will
136 be used.
137 """
139 def __init__(
140 self,
141 taskName: Optional[str] = None,
142 config: Optional[PipelineTaskConfig] = None,
143 taskClass: Optional[Type[PipelineTask]] = None,
144 label: Optional[str] = None,
145 ):
146 if taskName is None:
147 if taskClass is None:
148 raise ValueError("At least one of `taskName` and `taskClass` must be provided.")
149 taskName = get_full_type_name(taskClass)
150 elif taskClass is None:
151 taskClass = doImportType(taskName)
152 if config is None:
153 if taskClass is None:
154 raise ValueError("`taskClass` must be provided if `config` is not.")
155 config = taskClass.ConfigClass()
156 if label is None:
157 if taskClass is None:
158 raise ValueError("`taskClass` must be provided if `label` is not.")
159 label = taskClass._DefaultName
160 self.taskName = taskName
161 try:
162 config.validate()
163 except Exception:
164 _LOG.error("Configuration validation failed for task %s (%s)", label, taskName)
165 raise
166 config.freeze()
167 self.config = config
168 self.taskClass = taskClass
169 self.label = label
170 self.connections = config.connections.ConnectionsClass(config=config)
172 @property
173 def configDatasetName(self) -> str:
174 """Name of a dataset type for configuration of this task (`str`)"""
175 return acc.CONFIG_INIT_OUTPUT_TEMPLATE.format(label=self.label)
177 @property
178 def metadataDatasetName(self) -> str:
179 """Name of a dataset type for metadata of this task (`str`)"""
180 return self.makeMetadataDatasetName(self.label)
182 @classmethod
183 def makeMetadataDatasetName(cls, label: str) -> str:
184 """Construct the name of the dataset type for metadata for a task.
186 Parameters
187 ----------
188 label : `str`
189 Label for the task within its pipeline.
191 Returns
192 -------
193 name : `str`
194 Name of the task's metadata dataset type.
195 """
196 return acc.METADATA_OUTPUT_TEMPLATE.format(label=label)
198 @property
199 def logOutputDatasetName(self) -> Optional[str]:
200 """Name of a dataset type for log output from this task, `None` if
201 logs are not to be saved (`str`)
202 """
203 if cast(PipelineTaskConfig, self.config).saveLogOutput:
204 return acc.LOG_OUTPUT_TEMPLATE.format(label=self.label)
205 else:
206 return None
208 def __str__(self) -> str:
209 rep = "TaskDef(" + self.taskName
210 if self.label:
211 rep += ", label=" + self.label
212 rep += ")"
213 return rep
215 def __eq__(self, other: object) -> bool:
216 if not isinstance(other, TaskDef):
217 return False
218 # This does not consider equality of configs when determining equality
219 # as config equality is a difficult thing to define. Should be updated
220 # after DM-27847
221 return self.taskClass == other.taskClass and self.label == other.label
223 def __hash__(self) -> int:
224 return hash((self.taskClass, self.label))
226 @classmethod
227 def _unreduce(cls, taskName: str, config: PipelineTaskConfig, label: str) -> TaskDef:
228 """Custom callable for unpickling.
230 All arguments are forwarded directly to the constructor; this
231 trampoline is only needed because ``__reduce__`` callables can't be
232 called with keyword arguments.
233 """
234 return cls(taskName=taskName, config=config, label=label)
236 def __reduce__(self) -> Tuple[Callable[[str, PipelineTaskConfig, str], TaskDef], Tuple[str, Config, str]]:
237 return (self._unreduce, (self.taskName, self.config, self.label))
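# A minimal sketch, not part of the library itself, of building a TaskDef
# directly from a PipelineTask subclass supplied by the caller; TaskDef would
# derive any omitted arguments the same way.
def _example_task_def(task_class: Type[PipelineTask]) -> TaskDef:
    config = task_class.ConfigClass()  # default config; TaskDef validates and freezes it
    return TaskDef(
        taskName=get_full_type_name(task_class),  # optional; derived from taskClass if omitted
        config=config,
        taskClass=task_class,
        label=task_class._DefaultName,  # optional; this is the default label
    )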
240class Pipeline:
241 """A `Pipeline` is a representation of a series of tasks to run, and the
242 configuration for those tasks.
244 Parameters
245 ----------
246 description : `str`
247 A description of what this pipeline does.
248 """
250 def __init__(self, description: str):
251 pipeline_dict = {"description": description, "tasks": {}}
252 self._pipelineIR = pipelineIR.PipelineIR(pipeline_dict)
254 @classmethod
255 def fromFile(cls, filename: str) -> Pipeline:
256 """Load a pipeline defined in a pipeline yaml file.
258 Parameters
259 ----------
260 filename: `str`
261 A path that points to a pipeline defined in yaml format. This
262 filename may also supply additional labels to be used in
263 subsetting the loaded Pipeline. These labels are separated from
264 the path by a \\#, and may be specified as a comma separated
265 list, or a range denoted as beginning..end. Beginning or end may
266 be empty, in which case the range will be a half open interval.
267 Unlike python iteration bounds, end bounds are *INCLUDED*. Note
268 that range based selection is not well defined for pipelines that
269 are not linear in nature, and correct behavior is not guaranteed,
270 or may vary from run to run.
272 Returns
273 -------
274 pipeline: `Pipeline`
275 The pipeline loaded from the specified location with appropriate (if
276 any) subsetting.
278 Notes
279 -----
280 This method attempts to prune any contracts that contain labels which
281 are not in the declared subset of labels. This pruning is done using a
282 string based matching due to the nature of contracts and may prune more
283 than it should.
284 """
285 return cls.from_uri(filename)
287 @classmethod
288 def from_uri(cls, uri: ResourcePathExpression) -> Pipeline:
289 """Load a pipeline defined in a pipeline yaml file at a location
290 specified by a URI.
292 Parameters
293 ----------
294 uri : convertible to `ResourcePath`
295 If a string is supplied this should be a URI path that points to a
296 pipeline defined in yaml format, either as a direct path to the
297 yaml file, or as a directory containing a "pipeline.yaml" file (the
298 form used by `write_to_uri` with ``expand=True``). This uri may
299 also supply additional labels to be used in subsetting the loaded
300 Pipeline. These labels are separated from the path by a \\#, and
301 may be specified as a comma separated list, or a range denoted as
302 beginning..end. Beginning or end may be empty, in which case the
303 range will be a half open interval. Unlike python iteration bounds,
304 end bounds are *INCLUDED*. Note that range based selection is not
305 well defined for pipelines that are not linear in nature, and
306 correct behavior is not guaranteed, or may vary from run to run.
307 The same specifiers can be used with a `ResourcePath` object, by
308 being the sole contents of the fragment attribute.
310 Returns
311 -------
312 pipeline : `Pipeline`
313 The pipeline loaded from the specified location with appropriate (if
314 any) subsetting.
316 Notes
317 -----
318 This method attempts to prune any contracts that contain labels which
319 are not in the declared subset of labels. This pruning is done using a
320 string based matching due to the nature of contracts and may prune more
321 than it should.
322 """
323 # Split up the uri and any labels that were supplied
324 uri, label_specifier = cls._parse_file_specifier(uri)
325 pipeline: Pipeline = cls.fromIR(pipelineIR.PipelineIR.from_uri(uri))
327 # If there are labels supplied, only keep those
328 if label_specifier is not None:
329 pipeline = pipeline.subsetFromLabels(label_specifier)
330 return pipeline
332 def subsetFromLabels(self, labelSpecifier: LabelSpecifier) -> Pipeline:
333 """Subset a pipeline to contain only labels specified in labelSpecifier
335 Parameters
336 ----------
337 labelSpecifier : `LabelSpecifier`
338 Object containing labels that describes how to subset a pipeline.
340 Returns
341 -------
342 pipeline : `Pipeline`
343 A new pipeline object that is a subset of the old pipeline
345 Raises
346 ------
347 ValueError
348 Raised if there is an issue with specified labels
350 Notes
351 -----
352 This method attempts to prune any contracts that contain labels which
353 are not in the declared subset of labels. This pruning is done using a
354 string based matching due to the nature of contracts and may prune more
355 than it should.
356 """
357 # Labels supplied as a set
358 if labelSpecifier.labels:
359 labelSet = labelSpecifier.labels
360 # Labels supplied as a range, first create a list of all the labels
361 # in the pipeline sorted according to task dependency. Then only
362 # keep labels that lie between the supplied bounds
363 else:
364 # Create a copy of the pipeline to use when assessing the label
365 # ordering. Use a dict for fast searching while preserving order.
366 # Remove contracts so they do not fail in the expansion step. This
367 # is needed because a user may only configure the tasks they intend
368 # to run, which may cause some contracts to fail if they will later
369 # be dropped
370 pipeline = copy.deepcopy(self)
371 pipeline._pipelineIR.contracts = []
372 labels = {taskdef.label: True for taskdef in pipeline.toExpandedPipeline()}
374 # Verify the bounds are in the labels
375 if labelSpecifier.begin is not None:
376 if labelSpecifier.begin not in labels:
377 raise ValueError(
378 f"Beginning of range subset, {labelSpecifier.begin}, not found in pipeline definition"
379 )
380 if labelSpecifier.end is not None:
381 if labelSpecifier.end not in labels:
382 raise ValueError(
383 f"End of range subset, {labelSpecifier.end}, not found in pipeline definition"
384 )
386 labelSet = set()
387 for label in labels:
388 if labelSpecifier.begin is not None:
389 if label != labelSpecifier.begin:
390 continue
391 else:
392 labelSpecifier.begin = None
393 labelSet.add(label)
394 if labelSpecifier.end is not None and label == labelSpecifier.end:
395 break
396 return Pipeline.fromIR(self._pipelineIR.subset_from_labels(labelSet))
398 @staticmethod
399 def _parse_file_specifier(uri: ResourcePathExpression) -> Tuple[ResourcePath, Optional[LabelSpecifier]]:
400 """Split appart a uri and any possible label subsets"""
401 if isinstance(uri, str):
402 # This is to support legacy pipelines during transition
403 uri, num_replace = re.subn("[:](?!\\/\\/)", "#", uri)
404 if num_replace:
405 raise ValueError(
406 f"The pipeline file {uri} seems to use the legacy :"
407 " to separate labels, please use # instead."
408 )
409 if uri.count("#") > 1:
410 raise ValueError("Only one set of labels is allowed when specifying a pipeline to load")
411 # Everything else can be converted directly to ResourcePath.
412 uri = ResourcePath(uri)
413 label_subset = uri.fragment or None
415 specifier: Optional[LabelSpecifier]
416 if label_subset is not None:
417 label_subset = urllib.parse.unquote(label_subset)
418 args: Dict[str, Union[Set[str], str, None]]
419 # labels supplied as a list
420 if "," in label_subset:
421 if ".." in label_subset:
422 raise ValueError(
423 "Can only specify a list of labels or a rangewhen loading a Pipline not both"
424 )
425 args = {"labels": set(label_subset.split(","))}
426 # labels supplied as a range
427 elif ".." in label_subset:
428 # Try to de-structure the labelSubset, this will fail if more
429 # than one range is specified
430 begin, end, *rest = label_subset.split("..")
431 if rest:
432 raise ValueError("Only one range can be specified when loading a pipeline")
433 args = {"begin": begin if begin else None, "end": end if end else None}
434 # Assume anything else is a single label
435 else:
436 args = {"labels": {label_subset}}
438 # MyPy doesn't like how cavalier kwarg construction is with types.
439 specifier = LabelSpecifier(**args) # type: ignore
440 else:
441 specifier = None
443 return uri, specifier
445 @classmethod
446 def fromString(cls, pipeline_string: str) -> Pipeline:
447 """Create a pipeline from string formatted as a pipeline document.
449 Parameters
450 ----------
451 pipeline_string : `str`
452 A string that is formatted like a pipeline document
454 Returns
455 -------
456 pipeline: `Pipeline`
457 """
458 pipeline = cls.fromIR(pipelineIR.PipelineIR.from_string(pipeline_string))
459 return pipeline
461 @classmethod
462 def fromIR(cls, deserialized_pipeline: pipelineIR.PipelineIR) -> Pipeline:
463 """Create a pipeline from an already created `PipelineIR` object.
465 Parameters
466 ----------
467 deserialized_pipeline: `PipelineIR`
468 An already created pipeline intermediate representation object
470 Returns
471 -------
472 pipeline: `Pipeline`
473 """
474 pipeline = cls.__new__(cls)
475 pipeline._pipelineIR = deserialized_pipeline
476 return pipeline
478 @classmethod
479 def fromPipeline(cls, pipeline: Pipeline) -> Pipeline:
480 """Create a new pipeline by copying an already existing `Pipeline`.
482 Parameters
483 ----------
484 pipeline: `Pipeline`
485 An already created `Pipeline` object to copy.
487 Returns
488 -------
489 pipeline: `Pipeline`
490 """
491 return cls.fromIR(copy.deepcopy(pipeline._pipelineIR))
493 def __str__(self) -> str:
494 return str(self._pipelineIR)
496 def mergePipeline(self, pipeline: Pipeline) -> None:
497 """Merge another in-memory `Pipeline` object into this one.
499 This merges another pipeline into this object, as if it were declared
500 in the import block of the yaml definition of this pipeline. This
501 modifies this pipeline in place.
503 Parameters
504 ----------
505 pipeline : `Pipeline`
506 The `Pipeline` object that is to be merged into this object.
507 """
508 self._pipelineIR.merge_pipelines((pipeline._pipelineIR,))
510 def addLabelToSubset(self, subset: str, label: str) -> None:
511 """Add a task label from the specified subset.
513 Parameters
514 ----------
515 subset : `str`
516 The labeled subset to modify
517 label : `str`
518 The task label to add to the specified subset.
520 Raises
521 ------
522 ValueError
523 Raised if the specified subset does not exist within the pipeline.
524 Raised if the specified label does not exist within the pipeline.
525 """
526 if label not in self._pipelineIR.tasks:
527 raise ValueError(f"Label {label} does not appear within the pipeline")
528 if subset not in self._pipelineIR.labeled_subsets:
529 raise ValueError(f"Subset {subset} does not appear within the pipeline")
530 self._pipelineIR.labeled_subsets[subset].subset.add(label)
532 def removeLabelFromSubset(self, subset: str, label: str) -> None:
533 """Remove a task label from the specified subset.
535 Parameters
536 ----------
537 subset : `str`
538 The labeled subset to modify
539 label : `str`
540 The task label to remove from the specified subset.
542 Raises
543 ------
544 ValueError
545 Raised if the specified subset does not exist in the pipeline.
546 Raised if the specified label does not exist within the specified
547 subset.
548 """
549 if subset not in self._pipelineIR.labeled_subsets:
550 raise ValueError(f"Subset {subset} does not appear within the pipeline")
551 if label not in self._pipelineIR.labeled_subsets[subset].subset:
552 raise ValueError(f"Label {label} does not appear within the pipeline")
553 self._pipelineIR.labeled_subsets[subset].subset.remove(label)
555 def findSubsetsWithLabel(self, label: str) -> set[str]:
556 """Find any subsets which may contain the specified label.
558 This function returns the names of subsets which contain the specified
559 label. It may return an empty set if there are no subsets, or no subsets
560 containing the specified label.
562 Parameters
563 ----------
564 label : `str`
565 The task label to use in membership check
567 Returns
568 -------
569 subsets : `set` of `str`
570 Returns a set (possibly empty) of subset names which contain the
571 specified label.
573 Raises
574 ------
575 ValueError
576 Raised if the specified label does not exist within this pipeline.
577 """
578 results = set()
579 if label not in self._pipelineIR.tasks:
580 raise ValueError(f"Label {label} does not appear within the pipeline")
581 for subset in self._pipelineIR.labeled_subsets.values():
582 if label in subset.subset:
583 results.add(subset.label)
584 return results
586 def addInstrument(self, instrument: Union[Instrument, str]) -> None:
587 """Add an instrument to the pipeline, or replace an instrument that is
588 already defined.
590 Parameters
591 ----------
592 instrument : `~lsst.obs.base.Instrument` or `str`
593 Either an instance of an `lsst.obs.base.Instrument` subclass or
594 a string containing the fully qualified name of an
595 `lsst.obs.base.Instrument` subclass.
596 """
597 if isinstance(instrument, str):
598 pass
599 else:
600 # TODO: assume that this is a subclass of Instrument, no type
601 # checking
602 instrument = get_full_type_name(instrument)
603 self._pipelineIR.instrument = instrument
605 def getInstrument(self) -> Optional[str]:
606 """Get the instrument from the pipeline.
608 Returns
609 -------
610 instrument : `str` or `None`
611 The fully qualified name of a `lsst.obs.base.Instrument` subclass,
612 or `None` if the pipeline does not have an instrument.
613 """
614 return self._pipelineIR.instrument
616 def addTask(self, task: Union[Type[PipelineTask], str], label: str) -> None:
617 """Add a new task to the pipeline, or replace a task that is already
618 associated with the supplied label.
620 Parameters
621 ----------
622 task: `PipelineTask` or `str`
623 Either a derived class object of a `PipelineTask` or a string
624 corresponding to a fully qualified `PipelineTask` name.
625 label: `str`
626 A label that is used to identify the `PipelineTask` being added
627 """
628 if isinstance(task, str):
629 taskName = task
630 elif issubclass(task, PipelineTask):
631 taskName = get_full_type_name(task)
632 else:
633 raise ValueError(
634 "task must be either a child class of PipelineTask or a string containing"
635 " a fully qualified name to one"
636 )
637 if not label:
638 # In some cases (e.g. a command line-generated pipeline) tasks can
639 # be defined without a label, which is not acceptable; use the task
640 # _DefaultName in that case.
641 if isinstance(task, str):
642 task_class = doImportType(task)
643 label = task_class._DefaultName
644 self._pipelineIR.tasks[label] = pipelineIR.TaskIR(label, taskName)
646 def removeTask(self, label: str) -> None:
647 """Remove a task from the pipeline.
649 Parameters
650 ----------
651 label : `str`
652 The label used to identify the task that is to be removed
654 Raises
655 ------
656 KeyError
657 If no task with that label exists in the pipeline
659 """
660 self._pipelineIR.tasks.pop(label)
662 def addConfigOverride(self, label: str, key: str, value: object) -> None:
663 """Apply single config override.
665 Parameters
666 ----------
667 label : `str`
668 Label of the task.
669 key: `str`
670 Fully-qualified field name.
671 value : object
672 Value to be given to a field.
673 """
674 self._addConfigImpl(label, pipelineIR.ConfigIR(rest={key: value}))
676 def addConfigFile(self, label: str, filename: str) -> None:
677 """Add overrides from a specified file.
679 Parameters
680 ----------
681 label : `str`
682 The label used to identify the task associated with config to
683 modify
684 filename : `str`
685 Path to the override file.
686 """
687 self._addConfigImpl(label, pipelineIR.ConfigIR(file=[filename]))
689 def addConfigPython(self, label: str, pythonString: str) -> None:
690 """Add Overrides by running a snippet of python code against a config.
692 Parameters
693 ----------
694 label : `str`
695 The label used to identify the task associated with the config to
696 modify.
697 pythonString: `str`
698 A string which is valid Python code to be executed. This is done
699 with ``config`` as the only accessible local value.
700 """
701 self._addConfigImpl(label, pipelineIR.ConfigIR(python=pythonString))
703 def _addConfigImpl(self, label: str, newConfig: pipelineIR.ConfigIR) -> None:
704 if label == "parameters":
705 self._pipelineIR.parameters.mapping.update(newConfig.rest)
706 if newConfig.file:
707 raise ValueError("Setting parameters section with config file is not supported")
708 if newConfig.python:
709 raise ValueError("Setting parameters section using python block in unsupported")
710 return
711 if label not in self._pipelineIR.tasks:
712 raise LookupError(f"There are no tasks labeled '{label}' in the pipeline")
713 self._pipelineIR.tasks[label].add_or_update_config(newConfig)
715 def write_to_uri(self, uri: ResourcePathExpression) -> None:
716 """Write the pipeline to a file or directory.
718 Parameters
719 ----------
720 uri : convertible to `ResourcePath`
721 URI to write to; may have any scheme with `ResourcePath` write
722 support or no scheme for a local file/directory. Should have a
723 ``.yaml`` extension.
724 """
725 self._pipelineIR.write_to_uri(uri)
727 def toExpandedPipeline(self) -> Generator[TaskDef, None, None]:
728 """Returns a generator of TaskDefs which can be used to create quantum
729 graphs.
731 Returns
732 -------
733 generator : generator of `TaskDef`
734 The generator returned will be the sorted iterator of tasks which
735 are to be used in constructing a quantum graph.
737 Raises
738 ------
739 NotImplementedError
740 If a dataId is supplied in a config block. This is in place for
741 future use
742 """
743 taskDefs = []
744 for label in self._pipelineIR.tasks:
745 taskDefs.append(self._buildTaskDef(label))
747 # lets evaluate the contracts
748 if self._pipelineIR.contracts is not None:
749 label_to_config = {x.label: x.config for x in taskDefs}
750 for contract in self._pipelineIR.contracts:
751 # execute this on its own line so it can raise a good error
752 # message if there were problems with the eval
753 success = eval(contract.contract, None, label_to_config)
754 if not success:
755 extra_info = f": {contract.msg}" if contract.msg is not None else ""
756 raise pipelineIR.ContractError(
757 f"Contract(s) '{contract.contract}' were not satisfied{extra_info}"
758 )
760 taskDefs = sorted(taskDefs, key=lambda x: x.label)
761 yield from pipeTools.orderPipeline(taskDefs)
763 def _buildTaskDef(self, label: str) -> TaskDef:
764 if (taskIR := self._pipelineIR.tasks.get(label)) is None:
765 raise NameError(f"Label {label} does not appear in this pipeline")
766 taskClass: Type[PipelineTask] = doImportType(taskIR.klass)
767 taskName = get_full_type_name(taskClass)
768 config = taskClass.ConfigClass()
769 instrument: PipeBaseInstrument | None = None
770 if (instrumentName := self._pipelineIR.instrument) is not None:
771 instrument_cls: type = doImportType(instrumentName)
772 instrument = instrument_cls()
773 config.applyConfigOverrides(
774 instrument,
775 getattr(taskClass, "_DefaultName", ""),
776 taskIR.config,
777 self._pipelineIR.parameters,
778 label,
779 )
780 return TaskDef(taskName=taskName, config=config, taskClass=taskClass, label=label)
782 def __iter__(self) -> Generator[TaskDef, None, None]:
783 return self.toExpandedPipeline()
785 def __getitem__(self, item: str) -> TaskDef:
786 return self._buildTaskDef(item)
788 def __len__(self) -> int:
789 return len(self._pipelineIR.tasks)
791 def __eq__(self, other: object) -> bool:
792 if not isinstance(other, Pipeline):
793 return False
794 elif self._pipelineIR == other._pipelineIR:
795 # Shortcut: if the IR is the same, the expanded pipeline must be
796 # the same as well. But the converse is not true.
797 return True
798 else:
799 self_expanded = {td.label: (td.taskClass,) for td in self}
800 other_expanded = {td.label: (td.taskClass,) for td in other}
801 if self_expanded != other_expanded:
802 return False
803 # After DM-27847, we should compare configuration here, or better,
804 # delegate to TaskDef.__eq__ after making that compare configurations.
805 raise NotImplementedError(
806 "Pipelines cannot be compared because config instances cannot be compared; see DM-27847."
807 )
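# A minimal sketch, not part of the library itself, of assembling a Pipeline
# programmatically. The task class path "lsst.example.tasks.ExampleTask", the
# label "example", and the config field "someField" are hypothetical.
def _example_build_pipeline() -> Pipeline:
    pipeline = Pipeline("A demonstration pipeline")
    pipeline.addTask("lsst.example.tasks.ExampleTask", "example")
    pipeline.addConfigOverride("example", "someField", 42)
    # The equivalent could be loaded from YAML, optionally subset by label:
    #     Pipeline.from_uri("pipelines/Example.yaml#example")
    return pipeline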
810@dataclass(frozen=True)
811class TaskDatasetTypes:
812 """An immutable struct that extracts and classifies the dataset types used
813 by a `PipelineTask`
814 """
816 initInputs: NamedValueSet[DatasetType]
817 """Dataset types that are needed as inputs in order to construct this Task.
819 Task-level `initInputs` may be classified as either
820 `~PipelineDatasetTypes.initInputs` or
821 `~PipelineDatasetTypes.initIntermediates` at the Pipeline level.
822 """
824 initOutputs: NamedValueSet[DatasetType]
825 """Dataset types that may be written after constructing this Task.
827 Task-level `initOutputs` may be classified as either
828 `~PipelineDatasetTypes.initOutputs` or
829 `~PipelineDatasetTypes.initIntermediates` at the Pipeline level.
830 """
832 inputs: NamedValueSet[DatasetType]
833 """Dataset types that are regular inputs to this Task.
835 If an input dataset needed for a Quantum cannot be found in the input
836 collection(s) or produced by another Task in the Pipeline, that Quantum
837 (and all dependent Quanta) will not be produced.
839 Task-level `inputs` may be classified as either
840 `~PipelineDatasetTypes.inputs` or `~PipelineDatasetTypes.intermediates`
841 at the Pipeline level.
842 """
844 queryConstraints: NamedValueSet[DatasetType]
845 """Regular inputs that should not be used as constraints on the initial
846 QuantumGraph generation data ID query, according to their tasks
847 (`NamedValueSet`).
848 """
850 prerequisites: NamedValueSet[DatasetType]
851 """Dataset types that are prerequisite inputs to this Task.
853 Prerequisite inputs must exist in the input collection(s) before the
854 pipeline is run, but do not constrain the graph - if a prerequisite is
855 missing for a Quantum, `PrerequisiteMissingError` is raised.
857 Prerequisite inputs are not resolved until the second stage of
858 QuantumGraph generation.
859 """
861 outputs: NamedValueSet[DatasetType]
862 """Dataset types that are produced by this Task.
864 Task-level `outputs` may be classified as either
865 `~PipelineDatasetTypes.outputs` or `~PipelineDatasetTypes.intermediates`
866 at the Pipeline level.
867 """
869 @classmethod
870 def fromTaskDef(
871 cls,
872 taskDef: TaskDef,
873 *,
874 registry: Registry,
875 include_configs: bool = True,
876 storage_class_mapping: Optional[Mapping[str, str]] = None,
877 ) -> TaskDatasetTypes:
878 """Extract and classify the dataset types from a single `PipelineTask`.
880 Parameters
881 ----------
882 taskDef: `TaskDef`
883 An instance of a `TaskDef` class for a particular `PipelineTask`.
884 registry: `Registry`
885 Registry used to construct normalized `DatasetType` objects and
886 retrieve those that are incomplete.
887 include_configs : `bool`, optional
888 If `True` (default) include config dataset types as
889 ``initOutputs``.
890 storage_class_mapping : `Mapping` of `str` to `StorageClass`, optional
891 If a taskdef contains a component dataset type that is unknown
892 to the registry, its parent StorageClass will be looked up in this
893 mapping if it is supplied. If the mapping does not contain the
894 composite dataset type, or the mapping is not supplied, an exception
895 will be raised.
897 Returns
898 -------
899 types: `TaskDatasetTypes`
900 The dataset types used by this task.
902 Raises
903 ------
904 ValueError
905 Raised if dataset type connection definition differs from
906 registry definition.
907 LookupError
908 Raised if component parent StorageClass could not be determined
909 and storage_class_mapping does not contain the composite type, or
910 is set to None.
911 """
913 def makeDatasetTypesSet(
914 connectionType: str,
915 is_input: bool,
916 freeze: bool = True,
917 ) -> NamedValueSet[DatasetType]:
918 """Constructs a set of true `DatasetType` objects
920 Parameters
921 ----------
922 connectionType : `str`
923 Name of the connection type to produce a set for, corresponds
924 to an attribute of type `list` on the connection class instance
925 is_input : `bool`
926 If `True`, these are input dataset types; otherwise they are
927 output dataset types.
928 freeze : `bool`, optional
929 If `True`, call `NamedValueSet.freeze` on the object returned.
931 Returns
932 -------
933 datasetTypes : `NamedValueSet`
934 A set of all datasetTypes which correspond to the input
935 connection type specified in the connection class of this
936 `PipelineTask`
938 Raises
939 ------
940 ValueError
941 Raised if dataset type connection definition differs from
942 registry definition.
943 LookupError
944 Raised if component parent StorageClass could not be determined
945 and storage_class_mapping does not contain the composite type,
946 or is set to None.
948 Notes
949 -----
950 This function is a closure over the variables ``registry``,
951 ``taskDef``, and ``storage_class_mapping``.
952 """
953 datasetTypes = NamedValueSet[DatasetType]()
954 for c in iterConnections(taskDef.connections, connectionType):
955 dimensions = set(getattr(c, "dimensions", set()))
956 if "skypix" in dimensions:
957 try:
958 datasetType = registry.getDatasetType(c.name)
959 except LookupError as err:
960 raise LookupError(
961 f"DatasetType '{c.name}' referenced by "
962 f"{type(taskDef.connections).__name__} uses 'skypix' as a dimension "
963 "placeholder, but does not already exist in the registry. "
964 "Note that reference catalog names are now used as the dataset "
965 "type name instead of 'ref_cat'."
966 ) from err
967 rest1 = set(registry.dimensions.extract(dimensions - set(["skypix"])).names)
968 rest2 = set(
969 dim.name for dim in datasetType.dimensions if not isinstance(dim, SkyPixDimension)
970 )
971 if rest1 != rest2:
972 raise ValueError(
973 f"Non-skypix dimensions for dataset type {c.name} declared in "
974 f"connections ({rest1}) are inconsistent with those in "
975 f"registry's version of this dataset ({rest2})."
976 )
977 else:
978 # Component dataset types are not explicitly in the
979 # registry. This complicates consistency checks with
980 # registry and requires we work out the composite storage
981 # class.
982 registryDatasetType = None
983 try:
984 registryDatasetType = registry.getDatasetType(c.name)
985 except KeyError:
986 compositeName, componentName = DatasetType.splitDatasetTypeName(c.name)
987 if componentName:
988 if storage_class_mapping is None or compositeName not in storage_class_mapping:
989 raise LookupError(
990 "Component parent class cannot be determined, and "
991 "composite name was not in storage class mapping, or no "
992 "storage_class_mapping was supplied"
993 )
994 else:
995 parentStorageClass = storage_class_mapping[compositeName]
996 else:
997 parentStorageClass = None
998 datasetType = c.makeDatasetType(
999 registry.dimensions, parentStorageClass=parentStorageClass
1000 )
1001 registryDatasetType = datasetType
1002 else:
1003 datasetType = c.makeDatasetType(
1004 registry.dimensions, parentStorageClass=registryDatasetType.parentStorageClass
1005 )
1007 if registryDatasetType and datasetType != registryDatasetType:
1008 # The dataset types differ but first check to see if
1009 # they are compatible before raising.
1010 if is_input:
1011 # This DatasetType must be compatible on get.
1012 is_compatible = datasetType.is_compatible_with(registryDatasetType)
1013 else:
1014 # Has to be able to be converted to the expected type
1015 # on put.
1016 is_compatible = registryDatasetType.is_compatible_with(datasetType)
1017 if is_compatible:
1018 # For inputs we want the pipeline to use the
1019 # pipeline definition, for outputs it should use
1020 # the registry definition.
1021 if not is_input:
1022 datasetType = registryDatasetType
1023 _LOG.debug(
1024 "Dataset types differ (task %s != registry %s) but are compatible"
1025 " for %s in %s.",
1026 datasetType,
1027 registryDatasetType,
1028 "input" if is_input else "output",
1029 taskDef.label,
1030 )
1031 else:
1032 try:
1033 # Explicitly check for storage class just to
1034 # make more specific message.
1035 _ = datasetType.storageClass
1036 except KeyError:
1037 raise ValueError(
1038 "Storage class does not exist for supplied dataset type "
1039 f"{datasetType} for {taskDef.label}."
1040 ) from None
1041 raise ValueError(
1042 f"Supplied dataset type ({datasetType}) inconsistent with "
1043 f"registry definition ({registryDatasetType}) "
1044 f"for {taskDef.label}."
1045 )
1046 datasetTypes.add(datasetType)
1047 if freeze:
1048 datasetTypes.freeze()
1049 return datasetTypes
1051 # optionally add initOutput dataset for config
1052 initOutputs = makeDatasetTypesSet("initOutputs", is_input=False, freeze=False)
1053 if include_configs:
1054 initOutputs.add(
1055 DatasetType(
1056 taskDef.configDatasetName,
1057 registry.dimensions.empty,
1058 storageClass="Config",
1059 )
1060 )
1061 initOutputs.freeze()
1063 # optionally add output dataset for metadata
1064 outputs = makeDatasetTypesSet("outputs", is_input=False, freeze=False)
1066 # Metadata is supposed to be of the TaskMetadata type, its dimensions
1067 # correspond to a task quantum.
1068 dimensions = registry.dimensions.extract(taskDef.connections.dimensions)
1070 # Allow the storage class definition to be read from the existing
1071 # dataset type definition if present.
1072 try:
1073 current = registry.getDatasetType(taskDef.metadataDatasetName)
1074 except KeyError:
1075 # No previous definition so use the default.
1076 storageClass = "TaskMetadata" if _TASK_METADATA_TYPE is TaskMetadata else "PropertySet"
1077 else:
1078 storageClass = current.storageClass.name
1079 outputs.update({DatasetType(taskDef.metadataDatasetName, dimensions, storageClass)})
1081 if taskDef.logOutputDatasetName is not None:
1082 # Log output dimensions correspond to a task quantum.
1083 dimensions = registry.dimensions.extract(taskDef.connections.dimensions)
1084 outputs.update({DatasetType(taskDef.logOutputDatasetName, dimensions, "ButlerLogRecords")})
1086 outputs.freeze()
1088 inputs = makeDatasetTypesSet("inputs", is_input=True)
1089 queryConstraints = NamedValueSet(
1090 inputs[c.name]
1091 for c in cast(Iterable[Input], iterConnections(taskDef.connections, "inputs"))
1092 if not c.deferGraphConstraint
1093 )
1095 return cls(
1096 initInputs=makeDatasetTypesSet("initInputs", is_input=True),
1097 initOutputs=initOutputs,
1098 inputs=inputs,
1099 queryConstraints=queryConstraints,
1100 prerequisites=makeDatasetTypesSet("prerequisiteInputs", is_input=True),
1101 outputs=outputs,
1102 )
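# A minimal sketch, not part of the library itself, of classifying the dataset
# types of a single task. ``task_def`` and ``registry`` are assumed to be
# supplied by the caller (e.g. ``registry`` from ``butler.registry``).
def _example_task_dataset_types(task_def: TaskDef, registry: Registry) -> TaskDatasetTypes:
    types = TaskDatasetTypes.fromTaskDef(task_def, registry=registry)
    _LOG.debug(
        "Task %s reads %s and writes %s", task_def.label, types.inputs.names, types.outputs.names
    )
    return types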
1105@dataclass(frozen=True)
1106class PipelineDatasetTypes:
1107 """An immutable struct that classifies the dataset types used in a
1108 `Pipeline`.
1109 """
1111 packagesDatasetName: ClassVar[str] = "packages"
1112 """Name of a dataset type used to save package versions.
1113 """
1115 initInputs: NamedValueSet[DatasetType]
1116 """Dataset types that are needed as inputs in order to construct the Tasks
1117 in this Pipeline.
1119 This does not include dataset types that are produced when constructing
1120 other Tasks in the Pipeline (these are classified as `initIntermediates`).
1121 """
1123 initOutputs: NamedValueSet[DatasetType]
1124 """Dataset types that may be written after constructing the Tasks in this
1125 Pipeline.
1127 This does not include dataset types that are also used as inputs when
1128 constructing other Tasks in the Pipeline (these are classified as
1129 `initIntermediates`).
1130 """
1132 initIntermediates: NamedValueSet[DatasetType]
1133 """Dataset types that are both used when constructing one or more Tasks
1134 in the Pipeline and produced as a side-effect of constructing another
1135 Task in the Pipeline.
1136 """
1138 inputs: NamedValueSet[DatasetType]
1139 """Dataset types that are regular inputs for the full pipeline.
1141 If an input dataset needed for a Quantum cannot be found in the input
1142 collection(s), that Quantum (and all dependent Quanta) will not be
1143 produced.
1144 """
1146 queryConstraints: NamedValueSet[DatasetType]
1147 """Regular inputs that should be used as constraints on the initial
1148 QuantumGraph generation data ID query, according to their tasks
1149 (`NamedValueSet`).
1150 """
1152 prerequisites: NamedValueSet[DatasetType]
1153 """Dataset types that are prerequisite inputs for the full Pipeline.
1155 Prerequisite inputs must exist in the input collection(s) before the
1156 pipeline is run, but do not constrain the graph - if a prerequisite is
1157 missing for a Quantum, `PrerequisiteMissingError` is raised.
1159 Prerequisite inputs are not resolved until the second stage of
1160 QuantumGraph generation.
1161 """
1163 intermediates: NamedValueSet[DatasetType]
1164 """Dataset types that are output by one Task in the Pipeline and consumed
1165 as inputs by one or more other Tasks in the Pipeline.
1166 """
1168 outputs: NamedValueSet[DatasetType]
1169 """Dataset types that are output by a Task in the Pipeline and not consumed
1170 by any other Task in the Pipeline.
1171 """
1173 byTask: Mapping[str, TaskDatasetTypes]
1174 """Per-Task dataset types, keyed by label in the `Pipeline`.
1176 This is guaranteed to be zip-iterable with the `Pipeline` itself (assuming
1177 neither has been modified since the dataset types were extracted, of
1178 course).
1179 """
1181 @classmethod
1182 def fromPipeline(
1183 cls,
1184 pipeline: Union[Pipeline, Iterable[TaskDef]],
1185 *,
1186 registry: Registry,
1187 include_configs: bool = True,
1188 include_packages: bool = True,
1189 ) -> PipelineDatasetTypes:
1190 """Extract and classify the dataset types from all tasks in a
1191 `Pipeline`.
1193 Parameters
1194 ----------
1195 pipeline: `Pipeline` or `Iterable` [ `TaskDef` ]
1196 A collection of tasks that can be run together.
1197 registry: `Registry`
1198 Registry used to construct normalized `DatasetType` objects and
1199 retrieve those that are incomplete.
1200 include_configs : `bool`, optional
1201 If `True` (default) include config dataset types as
1202 ``initOutputs``.
1203 include_packages : `bool`, optional
1204 If `True` (default) include the dataset type for software package
1205 versions in ``initOutputs``.
1207 Returns
1208 -------
1209 types: `PipelineDatasetTypes`
1210 The dataset types used by this `Pipeline`.
1212 Raises
1213 ------
1214 ValueError
1215 Raised if Tasks are inconsistent about which datasets are marked
1216 prerequisite. This indicates that the Tasks cannot be run as part
1217 of the same `Pipeline`.
1218 """
1219 allInputs = NamedValueSet[DatasetType]()
1220 allOutputs = NamedValueSet[DatasetType]()
1221 allInitInputs = NamedValueSet[DatasetType]()
1222 allInitOutputs = NamedValueSet[DatasetType]()
1223 prerequisites = NamedValueSet[DatasetType]()
1224 queryConstraints = NamedValueSet[DatasetType]()
1225 byTask = dict()
1226 if include_packages:
1227 allInitOutputs.add(
1228 DatasetType(
1229 cls.packagesDatasetName,
1230 registry.dimensions.empty,
1231 storageClass="Packages",
1232 )
1233 )
1234 # create a list of TaskDefs in case the input is a generator
1235 pipeline = list(pipeline)
1237 # collect all the output dataset types
1238 typeStorageclassMap: Dict[str, str] = {}
1239 for taskDef in pipeline:
1240 for outConnection in iterConnections(taskDef.connections, "outputs"):
1241 typeStorageclassMap[outConnection.name] = outConnection.storageClass
1243 for taskDef in pipeline:
1244 thisTask = TaskDatasetTypes.fromTaskDef(
1245 taskDef,
1246 registry=registry,
1247 include_configs=include_configs,
1248 storage_class_mapping=typeStorageclassMap,
1249 )
1250 allInitInputs.update(thisTask.initInputs)
1251 allInitOutputs.update(thisTask.initOutputs)
1252 allInputs.update(thisTask.inputs)
1253 # Inputs are query constraints if any task considers them a query
1254 # constraint.
1255 queryConstraints.update(thisTask.queryConstraints)
1256 prerequisites.update(thisTask.prerequisites)
1257 allOutputs.update(thisTask.outputs)
1258 byTask[taskDef.label] = thisTask
1259 if not prerequisites.isdisjoint(allInputs):
1260 raise ValueError(
1261 "{} marked as both prerequisites and regular inputs".format(
1262 {dt.name for dt in allInputs & prerequisites}
1263 )
1264 )
1265 if not prerequisites.isdisjoint(allOutputs):
1266 raise ValueError(
1267 "{} marked as both prerequisites and outputs".format(
1268 {dt.name for dt in allOutputs & prerequisites}
1269 )
1270 )
1271 # Make sure that components which are marked as inputs get treated as
1272 # intermediates if there is an output which produces the composite
1273 # containing the component
1274 intermediateComponents = NamedValueSet[DatasetType]()
1275 intermediateComposites = NamedValueSet[DatasetType]()
1276 outputNameMapping = {dsType.name: dsType for dsType in allOutputs}
1277 for dsType in allInputs:
1278 # get the name of a possible component
1279 name, component = dsType.nameAndComponent()
1280 # if there is a component name, that means this is a component
1281 # DatasetType; if there is an output which produces the parent of
1282 # this component, treat this input as an intermediate
1283 if component is not None:
1284 # This needs to be in this if block, because someone might have
1285 # a composite that is a pure input from existing data
1286 if name in outputNameMapping:
1287 intermediateComponents.add(dsType)
1288 intermediateComposites.add(outputNameMapping[name])
1290 def checkConsistency(a: NamedValueSet, b: NamedValueSet) -> None:
1291 common = a.names & b.names
1292 for name in common:
1293 # Any compatibility is allowed. This function does not know
1294 # if a dataset type is to be used for input or output.
1295 if not (a[name].is_compatible_with(b[name]) or b[name].is_compatible_with(a[name])):
1296 raise ValueError(f"Conflicting definitions for dataset type: {a[name]} != {b[name]}.")
1298 checkConsistency(allInitInputs, allInitOutputs)
1299 checkConsistency(allInputs, allOutputs)
1300 checkConsistency(allInputs, intermediateComposites)
1301 checkConsistency(allOutputs, intermediateComposites)
1303 def frozen(s: AbstractSet[DatasetType]) -> NamedValueSet[DatasetType]:
1304 assert isinstance(s, NamedValueSet)
1305 s.freeze()
1306 return s
1308 inputs = frozen(allInputs - allOutputs - intermediateComponents)
1310 return cls(
1311 initInputs=frozen(allInitInputs - allInitOutputs),
1312 initIntermediates=frozen(allInitInputs & allInitOutputs),
1313 initOutputs=frozen(allInitOutputs - allInitInputs),
1314 inputs=inputs,
1315 queryConstraints=frozen(queryConstraints & inputs),
1316 # If there are storage class differences in inputs and outputs
1317 # the intermediates have to choose priority. Here choose that
1318 # inputs to tasks must match the requested storage class by
1319 # applying the inputs over the top of the outputs.
1320 intermediates=frozen(allOutputs & allInputs | intermediateComponents),
1321 outputs=frozen(allOutputs - allInputs - intermediateComposites),
1322 prerequisites=frozen(prerequisites),
1323 byTask=MappingProxyType(byTask), # MappingProxyType -> frozen view of dict for immutability
1324 )
1326 @classmethod
1327 def initOutputNames(
1328 cls,
1329 pipeline: Union[Pipeline, Iterable[TaskDef]],
1330 *,
1331 include_configs: bool = True,
1332 include_packages: bool = True,
1333 ) -> Iterator[str]:
1334 """Return the names of dataset types ot task initOutputs, Configs,
1335 and package versions for a pipeline.
1337 Parameters
1338 ----------
1339 pipeline: `Pipeline` or `Iterable` [ `TaskDef` ]
1340 A `Pipeline` instance or collection of `TaskDef` instances.
1341 include_configs : `bool`, optional
1342 If `True` (default) include config dataset types.
1343 include_packages : `bool`, optional
1344 If `True` (default) include the dataset type for package versions.
1346 Yields
1347 ------
1348 datasetTypeName : `str`
1349 Name of the dataset type.
1350 """
1351 if include_packages:
1352 # Package versions dataset type
1353 yield cls.packagesDatasetName
1355 if isinstance(pipeline, Pipeline):
1356 pipeline = pipeline.toExpandedPipeline()
1358 for taskDef in pipeline:
1359 # all task InitOutputs
1360 for name in taskDef.connections.initOutputs:
1361 attribute = getattr(taskDef.connections, name)
1362 yield attribute.name
1364 # config dataset name
1365 if include_configs:
1366 yield taskDef.configDatasetName
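# A minimal sketch, not part of the library itself, of classifying dataset
# types for a whole pipeline and listing init-output dataset type names.
# ``pipeline`` and ``registry`` are assumed to be supplied by the caller.
def _example_pipeline_dataset_types(pipeline: Pipeline, registry: Registry) -> None:
    dataset_types = PipelineDatasetTypes.fromPipeline(pipeline, registry=registry)
    _LOG.debug("Overall inputs: %s", dataset_types.inputs.names)
    _LOG.debug("Intermediates: %s", dataset_types.intermediates.names)
    # Init-output names (task initOutputs, configs, packages) need no registry:
    for name in PipelineDatasetTypes.initOutputNames(pipeline):
        _LOG.debug("Init-output dataset type: %s", name)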