Coverage for python/lsst/pipe/base/pipeline.py: 21% (438 statements)
1# This file is part of pipe_base.
2#
3# Developed for the LSST Data Management System.
4# This product includes software developed by the LSST Project
5# (http://www.lsst.org).
6# See the COPYRIGHT file at the top-level directory of this distribution
7# for details of code ownership.
8#
9# This program is free software: you can redistribute it and/or modify
10# it under the terms of the GNU General Public License as published by
11# the Free Software Foundation, either version 3 of the License, or
12# (at your option) any later version.
13#
14# This program is distributed in the hope that it will be useful,
15# but WITHOUT ANY WARRANTY; without even the implied warranty of
16# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
17# GNU General Public License for more details.
18#
19# You should have received a copy of the GNU General Public License
20# along with this program. If not, see <http://www.gnu.org/licenses/>.
21from __future__ import annotations
23"""Module defining Pipeline class and related methods.
24"""
26__all__ = ["Pipeline", "TaskDef", "TaskDatasetTypes", "PipelineDatasetTypes", "LabelSpecifier"]
28import copy
29import logging
30import re
31import urllib.parse
33# -------------------------------
34# Imports of standard modules --
35# -------------------------------
36from dataclasses import dataclass
37from types import MappingProxyType
38from typing import (
39 TYPE_CHECKING,
40 AbstractSet,
41 Callable,
42 ClassVar,
43 Dict,
44 Generator,
45 Iterable,
46 Iterator,
47 Mapping,
48 Optional,
49 Set,
50 Tuple,
51 Type,
52 Union,
53 cast,
54)
56# -----------------------------
57# Imports for other modules --
58from lsst.daf.butler import DatasetType, NamedValueSet, Registry, SkyPixDimension
59from lsst.resources import ResourcePath, ResourcePathExpression
60from lsst.utils import doImportType
61from lsst.utils.introspection import get_full_type_name
63from . import automatic_connection_constants as acc
64from . import pipelineIR, pipeTools
65from ._instrument import Instrument as PipeBaseInstrument
66from ._task_metadata import TaskMetadata
67from .config import PipelineTaskConfig
68from .connections import iterConnections
69from .connectionTypes import Input
70from .pipelineTask import PipelineTask
71from .task import _TASK_METADATA_TYPE
73if TYPE_CHECKING: # Imports needed only for type annotations; may be circular.
74 from lsst.obs.base import Instrument
75 from lsst.pex.config import Config
77# ----------------------------------
78# Local non-exported definitions --
79# ----------------------------------
81_LOG = logging.getLogger(__name__)
83# ------------------------
84# Exported definitions --
85# ------------------------
88@dataclass
89class LabelSpecifier:
90 """A structure to specify a subset of labels to load
92 This structure may contain a set of labels to be used in subsetting a
93 pipeline, or a beginning and end point. Beginning or end may be empty,
94 in which case the range will be a half open interval. Unlike python
95 iteration bounds, end bounds are *INCLUDED*. Note that range based
96 selection is not well defined for pipelines that are not linear in nature,
97 and correct behavior is not guaranteed, or may vary from run to run.
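Examples
--------
A minimal sketch; the label names below are purely illustrative and must
correspond to labels actually present in the pipeline being subset:

>>> by_labels = LabelSpecifier(labels={"isr", "calibrate"})
>>> by_range = LabelSpecifier(begin="isr", end="calibrate")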
98 """
100 labels: Optional[Set[str]] = None
101 begin: Optional[str] = None
102 end: Optional[str] = None
104 def __post_init__(self) -> None:
105 if self.labels is not None and (self.begin or self.end):
106 raise ValueError(
107 "This struct can only be initialized with a labels set or a begin (and/or) end specifier"
108 )
111class TaskDef:
112 """TaskDef is a collection of information about task needed by Pipeline.
114 The information includes task name, configuration object and optional
115 task class. This class is just a collection of attributes and it exposes
116 all of them so that attributes could potentially be modified in place
117 (e.g. if configuration needs extra overrides).
119 Attributes
120 ----------
121 taskName : `str`, optional
122 The fully-qualified `PipelineTask` class name. If not provided,
123 ``taskClass`` must be.
124 config : `lsst.pipe.base.config.PipelineTaskConfig`, optional
125 Instance of the configuration class corresponding to this task class,
126 usually with all overrides applied. This config will be frozen. If
127 not provided, ``taskClass`` must be provided and
128 ``taskClass.ConfigClass()`` will be used.
129 taskClass : `type`, optional
130 `PipelineTask` class object; if provided and ``taskName`` is as well,
131 the caller guarantees that they are consistent. If not provided,
132 ``taskName`` is used to import the type.
133 label : `str`, optional
134 Task label, usually a short string unique in a pipeline. If not
135 provided, ``taskClass`` must be, and ``taskClass._DefaultName`` will
136 be used.
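Examples
--------
A sketch of typical construction; ``mypackage.ExampleTask`` is a
hypothetical `PipelineTask` and would have to be importable, so the line
is not run as a doctest:

>>> task_def = TaskDef(
...     taskName="mypackage.ExampleTask", label="example"
... )  # doctest: +SKIP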
137 """
139 def __init__(
140 self,
141 taskName: Optional[str] = None,
142 config: Optional[PipelineTaskConfig] = None,
143 taskClass: Optional[Type[PipelineTask]] = None,
144 label: Optional[str] = None,
145 ):
146 if taskName is None:
147 if taskClass is None:
148 raise ValueError("At least one of `taskName` and `taskClass` must be provided.")
149 taskName = get_full_type_name(taskClass)
150 elif taskClass is None:
151 taskClass = doImportType(taskName)
152 if config is None:
153 if taskClass is None:
154 raise ValueError("`taskClass` must be provided if `config` is not.")
155 config = taskClass.ConfigClass()
156 if label is None:
157 if taskClass is None:
158 raise ValueError("`taskClass` must be provided if `label` is not.")
159 label = taskClass._DefaultName
160 self.taskName = taskName
161 try:
162 config.validate()
163 except Exception:
164 _LOG.error("Configuration validation failed for task %s (%s)", label, taskName)
165 raise
166 config.freeze()
167 self.config = config
168 self.taskClass = taskClass
169 self.label = label
170 self.connections = config.connections.ConnectionsClass(config=config)
172 @property
173 def configDatasetName(self) -> str:
174 """Name of a dataset type for configuration of this task (`str`)"""
175 return acc.CONFIG_INIT_OUTPUT_TEMPLATE.format(label=self.label)
177 @property
178 def metadataDatasetName(self) -> Optional[str]:
179 """Name of a dataset type for metadata of this task, `None` if
180 metadata is not to be saved (`str`)
181 """
182 if self.config.saveMetadata:
183 return self.makeMetadataDatasetName(self.label)
184 else:
185 return None
187 @classmethod
188 def makeMetadataDatasetName(cls, label: str) -> str:
189 """Construct the name of the dataset type for metadata for a task.
191 Parameters
192 ----------
193 label : `str`
194 Label for the task within its pipeline.
196 Returns
197 -------
198 name : `str`
199 Name of the task's metadata dataset type.
200 """
201 return acc.METADATA_OUTPUT_TEMPLATE.format(label=label)
203 @property
204 def logOutputDatasetName(self) -> Optional[str]:
205 """Name of a dataset type for log output from this task, `None` if
206 logs are not to be saved (`str`)
207 """
208 if cast(PipelineTaskConfig, self.config).saveLogOutput:
209 return acc.LOG_OUTPUT_TEMPLATE.format(label=self.label)
210 else:
211 return None
213 def __str__(self) -> str:
214 rep = "TaskDef(" + self.taskName
215 if self.label:
216 rep += ", label=" + self.label
217 rep += ")"
218 return rep
220 def __eq__(self, other: object) -> bool:
221 if not isinstance(other, TaskDef):
222 return False
223 # This does not consider equality of configs when determining equality
224 # as config equality is a difficult thing to define. Should be updated
225 # after DM-27847
226 return self.taskClass == other.taskClass and self.label == other.label
228 def __hash__(self) -> int:
229 return hash((self.taskClass, self.label))
231 @classmethod
232 def _unreduce(cls, taskName: str, config: PipelineTaskConfig, label: str) -> TaskDef:
233 """Custom callable for unpickling.
235 All arguments are forwarded directly to the constructor; this
236 trampoline is only needed because ``__reduce__`` callables can't be
237 called with keyword arguments.
238 """
239 return cls(taskName=taskName, config=config, label=label)
241 def __reduce__(self) -> Tuple[Callable[[str, PipelineTaskConfig, str], TaskDef], Tuple[str, Config, str]]:
242 return (self._unreduce, (self.taskName, self.config, self.label))
245class Pipeline:
246 """A `Pipeline` is a representation of a series of tasks to run, and the
247 configuration for those tasks.
249 Parameters
250 ----------
251 description : `str`
252 A description of what this pipeline does.
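Examples
--------
A minimal sketch of programmatic construction; ``mypackage.ExampleTask``
and ``someField`` are hypothetical names, and the task class is only
imported when the pipeline is later expanded:

>>> pipeline = Pipeline("An example pipeline")
>>> pipeline.addTask("mypackage.ExampleTask", "example")
>>> pipeline.addConfigOverride("example", "someField", 42)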
253 """
255 def __init__(self, description: str):
256 pipeline_dict = {"description": description, "tasks": {}}
257 self._pipelineIR = pipelineIR.PipelineIR(pipeline_dict)
259 @classmethod
260 def fromFile(cls, filename: str) -> Pipeline:
261 """Load a pipeline defined in a pipeline yaml file.
263 Parameters
264 ----------
265 filename: `str`
266 A path that points to a pipeline defined in yaml format. This
267 filename may also supply additional labels to be used in
268 subsetting the loaded Pipeline. These labels are separated from
269 the path by a \\#, and may be specified as a comma separated
270 list, or a range denoted as beginning..end. Beginning or end may
271 be empty, in which case the range will be a half open interval.
272 Unlike python iteration bounds, end bounds are *INCLUDED*. Note
273 that range based selection is not well defined for pipelines that
274 are not linear in nature, and correct behavior is not guaranteed,
275 or may vary from run to run.
277 Returns
278 -------
279 pipeline: `Pipeline`
280 The pipeline loaded from the specified location with appropriate
281 (if any) subsetting.
283 Notes
284 -----
285 This method attempts to prune any contracts that contain labels which
286 are not in the declared subset of labels. This pruning is done using a
287 string based matching due to the nature of contracts and may prune more
288 than it should.
289 """
290 return cls.from_uri(filename)
292 @classmethod
293 def from_uri(cls, uri: ResourcePathExpression) -> Pipeline:
294 """Load a pipeline defined in a pipeline yaml file at a location
295 specified by a URI.
297 Parameters
298 ----------
299 uri : convertible to `ResourcePath`
300 If a string is supplied this should be a URI path that points to a
301 pipeline defined in yaml format, either as a direct path to the
302 yaml file, or as a directory containing a "pipeline.yaml" file (the
303 form used by `write_to_uri` with ``expand=True``). This uri may
304 also supply additional labels to be used in subsetting the loaded
305 Pipeline. These labels are separated from the path by a \\#, and
306 may be specified as a comma separated list, or a range denoted as
307 beginning..end. Beginning or end may be empty, in which case the
308 range will be a half open interval. Unlike python iteration bounds,
309 end bounds are *INCLUDED*. Note that range based selection is not
310 well defined for pipelines that are not linear in nature, and
311 correct behavior is not guaranteed, or may vary from run to run.
312 The same specifiers can be used with a `ResourcePath` object, by
313 being the sole contents in the fragments attribute.
315 Returns
316 -------
317 pipeline : `Pipeline`
318 The pipeline loaded from the specified location with appropriate
319 (if any) subsetting.
321 Notes
322 -----
323 This method attempts to prune any contracts that contain labels which
324 are not in the declared subset of labels. This pruning is done using a
325 string based matching due to the nature of contracts and may prune more
326 than it should.
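Examples
--------
Illustrative only; the paths and labels shown are hypothetical, so these
lines are not run as doctests:

>>> full = Pipeline.from_uri("/path/to/pipeline.yaml")  # doctest: +SKIP
>>> subset = Pipeline.from_uri(
...     "/path/to/pipeline.yaml#isr..calibrate"
... )  # doctest: +SKIP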
327 """
328 # Split up the uri and any labels that were supplied
329 uri, label_specifier = cls._parse_file_specifier(uri)
330 pipeline: Pipeline = cls.fromIR(pipelineIR.PipelineIR.from_uri(uri))
332 # If there are labels supplied, only keep those
333 if label_specifier is not None:
334 pipeline = pipeline.subsetFromLabels(label_specifier)
335 return pipeline
337 def subsetFromLabels(self, labelSpecifier: LabelSpecifier) -> Pipeline:
338 """Subset a pipeline to contain only labels specified in labelSpecifier
340 Parameters
341 ----------
342 labelSpecifier : `LabelSpecifier`
343 Object containing the labels that describe how to subset a pipeline.
345 Returns
346 -------
347 pipeline : `Pipeline`
348 A new pipeline object that is a subset of the old pipeline
350 Raises
351 ------
352 ValueError
353 Raised if there is an issue with specified labels
355 Notes
356 -----
357 This method attempts to prune any contracts that contain labels which
358 are not in the declared subset of labels. This pruning is done using a
359 string based matching due to the nature of contracts and may prune more
360 than it should.
361 """
362 # Labels supplied as a set
363 if labelSpecifier.labels:
364 labelSet = labelSpecifier.labels
365 # Labels supplied as a range, first create a list of all the labels
366 # in the pipeline sorted according to task dependency. Then only
367 # keep labels that lie between the supplied bounds
368 else:
369 # Create a copy of the pipeline to use when assessing the label
370 # ordering. Use a dict for fast searching while preserving order.
371 # Remove contracts so they do not fail in the expansion step. This
372 # is needed because a user may only configure the tasks they intend
373 # to run, which may cause some contracts to fail if they will later
374 # be dropped
375 pipeline = copy.deepcopy(self)
376 pipeline._pipelineIR.contracts = []
377 labels = {taskdef.label: True for taskdef in pipeline.toExpandedPipeline()}
379 # Verify the bounds are in the labels
380 if labelSpecifier.begin is not None:
381 if labelSpecifier.begin not in labels:
382 raise ValueError(
383 f"Beginning of range subset, {labelSpecifier.begin}, not found in pipeline definition"
384 )
385 if labelSpecifier.end is not None:
386 if labelSpecifier.end not in labels:
387 raise ValueError(
388 f"End of range subset, {labelSpecifier.end}, not found in pipeline definition"
389 )
391 labelSet = set()
392 for label in labels:
393 if labelSpecifier.begin is not None:
394 if label != labelSpecifier.begin:
395 continue
396 else:
397 labelSpecifier.begin = None
398 labelSet.add(label)
399 if labelSpecifier.end is not None and label == labelSpecifier.end:
400 break
401 return Pipeline.fromIR(self._pipelineIR.subset_from_labels(labelSet))
403 @staticmethod
404 def _parse_file_specifier(uri: ResourcePathExpression) -> Tuple[ResourcePath, Optional[LabelSpecifier]]:
405 """Split appart a uri and any possible label subsets"""
406 if isinstance(uri, str):
407 # This is to support legacy pipelines during transition
408 uri, num_replace = re.subn("[:](?!\\/\\/)", "#", uri)
409 if num_replace:
410 raise ValueError(
411 f"The pipeline file {uri} seems to use the legacy :"
412 " to separate labels, please use # instead."
413 )
414 if uri.count("#") > 1:
415 raise ValueError("Only one set of labels is allowed when specifying a pipeline to load")
416 # Everything else can be converted directly to ResourcePath.
417 uri = ResourcePath(uri)
418 label_subset = uri.fragment or None
420 specifier: Optional[LabelSpecifier]
421 if label_subset is not None:
422 label_subset = urllib.parse.unquote(label_subset)
423 args: Dict[str, Union[Set[str], str, None]]
424 # labels supplied as a list
425 if "," in label_subset:
426 if ".." in label_subset:
427 raise ValueError(
428 "Can only specify a list of labels or a rangewhen loading a Pipline not both"
429 )
430 args = {"labels": set(label_subset.split(","))}
431 # labels supplied as a range
432 elif ".." in label_subset:
433 # Try to de-structure the labelSubset, this will fail if more
434 # than one range is specified
435 begin, end, *rest = label_subset.split("..")
436 if rest:
437 raise ValueError("Only one range can be specified when loading a pipeline")
438 args = {"begin": begin if begin else None, "end": end if end else None}
439 # Assume anything else is a single label
440 else:
441 args = {"labels": {label_subset}}
443 # MyPy doesn't like how cavalier kwarg construction is with types.
444 specifier = LabelSpecifier(**args) # type: ignore
445 else:
446 specifier = None
448 return uri, specifier
450 @classmethod
451 def fromString(cls, pipeline_string: str) -> Pipeline:
452 """Create a pipeline from string formatted as a pipeline document.
454 Parameters
455 ----------
456 pipeline_string : `str`
457 A string that is formatted like a pipeline document.
459 Returns
460 -------
461 pipeline: `Pipeline`
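Examples
--------
A sketch of the document format; the task class named here is
hypothetical and is not imported until the pipeline is expanded::

    description: An example pipeline
    tasks:
      example:
        class: mypackage.ExampleTask

A document like this, held in a string ``yaml_str``, can be loaded with
``Pipeline.fromString(yaml_str)``.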
462 """
463 pipeline = cls.fromIR(pipelineIR.PipelineIR.from_string(pipeline_string))
464 return pipeline
466 @classmethod
467 def fromIR(cls, deserialized_pipeline: pipelineIR.PipelineIR) -> Pipeline:
468 """Create a pipeline from an already created `PipelineIR` object.
470 Parameters
471 ----------
472 deserialized_pipeline: `PipelineIR`
473 An already created pipeline intermediate representation object
475 Returns
476 -------
477 pipeline: `Pipeline`
478 """
479 pipeline = cls.__new__(cls)
480 pipeline._pipelineIR = deserialized_pipeline
481 return pipeline
483 @classmethod
484 def fromPipeline(cls, pipeline: Pipeline) -> Pipeline:
485 """Create a new pipeline by copying an already existing `Pipeline`.
487 Parameters
488 ----------
489 pipeline: `Pipeline`
490 The existing `Pipeline` object to copy.
492 Returns
493 -------
494 pipeline: `Pipeline`
495 """
496 return cls.fromIR(copy.deepcopy(pipeline._pipelineIR))
498 def __str__(self) -> str:
499 return str(self._pipelineIR)
501 def mergePipeline(self, pipeline: Pipeline) -> None:
502 """Merge another in-memory `Pipeline` object into this one.
504 This merges another pipeline into this object, as if it were declared
505 in the import block of the yaml definition of this pipeline. This
506 modifies this pipeline in place.
508 Parameters
509 ----------
510 pipeline : `Pipeline`
511 The `Pipeline` object that is to be merged into this object.
512 """
513 self._pipelineIR.merge_pipelines((pipeline._pipelineIR,))
515 def addLabelToSubset(self, subset: str, label: str) -> None:
516 """Add a task label from the specified subset.
518 Parameters
519 ----------
520 subset : `str`
521 The labeled subset to modify
522 label : `str`
523 The task label to add to the specified subset.
525 Raises
526 ------
527 ValueError
528 Raised if the specified subset does not exist within the pipeline.
529 Raised if the specified label does not exist within the pipeline.
530 """
531 if label not in self._pipelineIR.tasks:
532 raise ValueError(f"Label {label} does not appear within the pipeline")
533 if subset not in self._pipelineIR.labeled_subsets:
534 raise ValueError(f"Subset {subset} does not appear within the pipeline")
535 self._pipelineIR.labeled_subsets[subset].subset.add(label)
537 def removeLabelFromSubset(self, subset: str, label: str) -> None:
538 """Remove a task label from the specified subset.
540 Parameters
541 ----------
542 subset : `str`
543 The labeled subset to modify
544 label : `str`
545 The task label to remove from the specified subset.
547 Raises
548 ------
549 ValueError
550 Raised if the specified subset does not exist in the pipeline.
551 Raised if the specified label does not exist within the specified
552 subset.
553 """
554 if subset not in self._pipelineIR.labeled_subsets:
555 raise ValueError(f"Subset {subset} does not appear within the pipeline")
556 if label not in self._pipelineIR.labeled_subsets[subset].subset:
557 raise ValueError(f"Label {label} does not appear within the pipeline")
558 self._pipelineIR.labeled_subsets[subset].subset.remove(label)
560 def findSubsetsWithLabel(self, label: str) -> set[str]:
561 """Find any subsets which may contain the specified label.
563 This function returns the names of subsets which contain the specified
564 label. May return an empty set if there are no subsets, or no subsets
565 containing the specified label.
567 Parameters
568 ----------
569 label : `str`
570 The task label to use in membership check
572 Returns
573 -------
574 subsets : `set` of `str`
575 Returns a set (possibly empty) of subset names which contain the
576 specified label.
578 Raises
579 ------
580 ValueError
581 Raised if the specified label does not exist within this pipeline.
582 """
583 results = set()
584 if label not in self._pipelineIR.tasks:
585 raise ValueError(f"Label {label} does not appear within the pipeline")
586 for subset in self._pipelineIR.labeled_subsets.values():
587 if label in subset.subset:
588 results.add(subset.label)
589 return results
591 def addInstrument(self, instrument: Union[Instrument, str]) -> None:
592 """Add an instrument to the pipeline, or replace an instrument that is
593 already defined.
595 Parameters
596 ----------
597 instrument : `~lsst.obs.base.Instrument` or `str`
598 Either a derived class object of `lsst.obs.base.Instrument` or
599 a string corresponding to a fully qualified
600 `lsst.obs.base.Instrument` subclass name.
601 """
602 if isinstance(instrument, str):
603 pass
604 else:
605 # TODO: assume that this is a subclass of Instrument, no type
606 # checking
607 instrument = get_full_type_name(instrument)
608 self._pipelineIR.instrument = instrument
610 def getInstrument(self) -> Optional[str]:
611 """Get the instrument from the pipeline.
613 Returns
614 -------
615 instrument : `str`, or `None`
616 The fully qualified name of a `lsst.obs.base.Instrument` subclass
617 name, or `None` if the pipeline does not have an instrument.
618 """
619 return self._pipelineIR.instrument
621 def addTask(self, task: Union[Type[PipelineTask], str], label: str) -> None:
622 """Add a new task to the pipeline, or replace a task that is already
623 associated with the supplied label.
625 Parameters
626 ----------
627 task: `PipelineTask` or `str`
628 Either a derived class object of a `PipelineTask` or a string
629 corresponding to a fully qualified `PipelineTask` name.
630 label: `str`
631 A label that is used to identify the `PipelineTask` being added
632 """
633 if isinstance(task, str):
634 taskName = task
635 elif issubclass(task, PipelineTask):
636 taskName = get_full_type_name(task)
637 else:
638 raise ValueError(
639 "task must be either a child class of PipelineTask or a string containing"
640 " a fully qualified name to one"
641 )
642 if not label:
643 # in some cases (with command line-generated pipeline) tasks can
644 # be defined without label which is not acceptable, use task
645 # _DefaultName in that case
646 if isinstance(task, str):
647 task_class = doImportType(task)
648 label = task_class._DefaultName
649 self._pipelineIR.tasks[label] = pipelineIR.TaskIR(label, taskName)
651 def removeTask(self, label: str) -> None:
652 """Remove a task from the pipeline.
654 Parameters
655 ----------
656 label : `str`
657 The label used to identify the task that is to be removed
659 Raises
660 ------
661 KeyError
662 Raised if no task with that label exists in the pipeline.
664 """
665 self._pipelineIR.tasks.pop(label)
667 def addConfigOverride(self, label: str, key: str, value: object) -> None:
668 """Apply single config override.
670 Parameters
671 ----------
672 label : `str`
673 Label of the task.
674 key: `str`
675 Fully-qualified field name.
676 value : object
677 Value to be given to a field.
678 """
679 self._addConfigImpl(label, pipelineIR.ConfigIR(rest={key: value}))
681 def addConfigFile(self, label: str, filename: str) -> None:
682 """Add overrides from a specified file.
684 Parameters
685 ----------
686 label : `str`
687 The label used to identify the task associated with config to
688 modify
689 filename : `str`
690 Path to the override file.
691 """
692 self._addConfigImpl(label, pipelineIR.ConfigIR(file=[filename]))
694 def addConfigPython(self, label: str, pythonString: str) -> None:
695 """Add Overrides by running a snippet of python code against a config.
697 Parameters
698 ----------
699 label : `str`
700 The label used to identify the task associated with config to
701 modify.
702 pythonString: `str`
703 A string which is valid python code to be executed. This is done
704 with config as the only local accessible value.
705 """
706 self._addConfigImpl(label, pipelineIR.ConfigIR(python=pythonString))
708 def _addConfigImpl(self, label: str, newConfig: pipelineIR.ConfigIR) -> None:
709 if label == "parameters":
710 self._pipelineIR.parameters.mapping.update(newConfig.rest)
711 if newConfig.file:
712 raise ValueError("Setting parameters section with config file is not supported")
713 if newConfig.python:
714 raise ValueError("Setting parameters section using python block in unsupported")
715 return
716 if label not in self._pipelineIR.tasks:
717 raise LookupError(f"There are no tasks labeled '{label}' in the pipeline")
718 self._pipelineIR.tasks[label].add_or_update_config(newConfig)
720 def write_to_uri(self, uri: ResourcePathExpression) -> None:
721 """Write the pipeline to a file or directory.
723 Parameters
724 ----------
725 uri : convertible to `ResourcePath`
726 URI to write to; may have any scheme with `ResourcePath` write
727 support or no scheme for a local file/directory. Should have a
728 ``.yaml`` extension.
729 """
730 self._pipelineIR.write_to_uri(uri)
732 def toExpandedPipeline(self) -> Generator[TaskDef, None, None]:
733 """Returns a generator of TaskDefs which can be used to create quantum
734 graphs.
736 Returns
737 -------
738 generator : generator of `TaskDef`
739 The generator returned will be the sorted iterator of tasks which
740 are to be used in constructing a quantum graph.
742 Raises
743 ------
744 NotImplementedError
745 If a dataId is supplied in a config block. This is in place for
746 future use.
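Examples
--------
Typical use is plain iteration; ``pipeline`` is assumed to be an existing
`Pipeline` whose task classes are importable, so this is not run as a
doctest:

>>> for taskDef in pipeline.toExpandedPipeline():  # doctest: +SKIP
...     print(taskDef.label)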
747 """
748 taskDefs = []
749 for label in self._pipelineIR.tasks:
750 taskDefs.append(self._buildTaskDef(label))
752 # lets evaluate the contracts
753 if self._pipelineIR.contracts is not None:
754 label_to_config = {x.label: x.config for x in taskDefs}
755 for contract in self._pipelineIR.contracts:
756 # execute this in its own line so it can raise a good error
757 # message if there were problems with the eval
758 success = eval(contract.contract, None, label_to_config)
759 if not success:
760 extra_info = f": {contract.msg}" if contract.msg is not None else ""
761 raise pipelineIR.ContractError(
762 f"Contract(s) '{contract.contract}' were not satisfied{extra_info}"
763 )
765 taskDefs = sorted(taskDefs, key=lambda x: x.label)
766 yield from pipeTools.orderPipeline(taskDefs)
768 def _buildTaskDef(self, label: str) -> TaskDef:
769 if (taskIR := self._pipelineIR.tasks.get(label)) is None:
770 raise NameError(f"Label {label} does not appear in this pipeline")
771 taskClass: Type[PipelineTask] = doImportType(taskIR.klass)
772 taskName = get_full_type_name(taskClass)
773 config = taskClass.ConfigClass()
774 instrument: PipeBaseInstrument | None = None
775 if (instrumentName := self._pipelineIR.instrument) is not None:
776 instrument_cls: type = doImportType(instrumentName)
777 instrument = instrument_cls()
778 config.applyConfigOverrides(
779 instrument,
780 getattr(taskClass, "_DefaultName", ""),
781 taskIR.config,
782 self._pipelineIR.parameters,
783 label,
784 )
785 return TaskDef(taskName=taskName, config=config, taskClass=taskClass, label=label)
787 def __iter__(self) -> Generator[TaskDef, None, None]:
788 return self.toExpandedPipeline()
790 def __getitem__(self, item: str) -> TaskDef:
791 return self._buildTaskDef(item)
793 def __len__(self) -> int:
794 return len(self._pipelineIR.tasks)
796 def __eq__(self, other: object) -> bool:
797 if not isinstance(other, Pipeline):
798 return False
799 elif self._pipelineIR == other._pipelineIR:
800 # Shortcut: if the IR is the same, the expanded pipeline must be
801 # the same as well. But the converse is not true.
802 return True
803 else:
804 self_expanded = {td.label: (td.taskClass,) for td in self}
805 other_expanded = {td.label: (td.taskClass,) for td in other}
806 if self_expanded != other_expanded:
807 return False
808 # After DM-27847, we should compare configuration here, or better,
809 # delegated to TaskDef.__eq__ after making that compare configurations.
810 raise NotImplementedError(
811 "Pipelines cannot be compared because config instances cannot be compared; see DM-27847."
812 )
815@dataclass(frozen=True)
816class TaskDatasetTypes:
817 """An immutable struct that extracts and classifies the dataset types used
818 by a `PipelineTask`
819 """
821 initInputs: NamedValueSet[DatasetType]
822 """Dataset types that are needed as inputs in order to construct this Task.
824 Task-level `initInputs` may be classified as either
825 `~PipelineDatasetTypes.initInputs` or
826 `~PipelineDatasetTypes.initIntermediates` at the Pipeline level.
827 """
829 initOutputs: NamedValueSet[DatasetType]
830 """Dataset types that may be written after constructing this Task.
832 Task-level `initOutputs` may be classified as either
833 `~PipelineDatasetTypes.initOutputs` or
834 `~PipelineDatasetTypes.initIntermediates` at the Pipeline level.
835 """
837 inputs: NamedValueSet[DatasetType]
838 """Dataset types that are regular inputs to this Task.
840 If an input dataset needed for a Quantum cannot be found in the input
841 collection(s) or produced by another Task in the Pipeline, that Quantum
842 (and all dependent Quanta) will not be produced.
844 Task-level `inputs` may be classified as either
845 `~PipelineDatasetTypes.inputs` or `~PipelineDatasetTypes.intermediates`
846 at the Pipeline level.
847 """
849 queryConstraints: NamedValueSet[DatasetType]
850 """Regular inputs that should not be used as constraints on the initial
851 QuantumGraph generation data ID query, according to their tasks
852 (`NamedValueSet`).
853 """
855 prerequisites: NamedValueSet[DatasetType]
856 """Dataset types that are prerequisite inputs to this Task.
858 Prerequisite inputs must exist in the input collection(s) before the
859 pipeline is run, but do not constrain the graph - if a prerequisite is
860 missing for a Quantum, `PrerequisiteMissingError` is raised.
862 Prerequisite inputs are not resolved until the second stage of
863 QuantumGraph generation.
864 """
866 outputs: NamedValueSet[DatasetType]
867 """Dataset types that are produced by this Task.
869 Task-level `outputs` may be classified as either
870 `~PipelineDatasetTypes.outputs` or `~PipelineDatasetTypes.intermediates`
871 at the Pipeline level.
872 """
874 @classmethod
875 def fromTaskDef(
876 cls,
877 taskDef: TaskDef,
878 *,
879 registry: Registry,
880 include_configs: bool = True,
881 storage_class_mapping: Optional[Mapping[str, str]] = None,
882 ) -> TaskDatasetTypes:
883 """Extract and classify the dataset types from a single `PipelineTask`.
885 Parameters
886 ----------
887 taskDef: `TaskDef`
888 An instance of a `TaskDef` class for a particular `PipelineTask`.
889 registry: `Registry`
890 Registry used to construct normalized `DatasetType` objects and
891 retrieve those that are incomplete.
892 include_configs : `bool`, optional
893 If `True` (default) include config dataset types as
894 ``initOutputs``.
895 storage_class_mapping : `Mapping` of `str` to `StorageClass`, optional
896 If a taskdef contains a component dataset type that is unknown
897 to the registry, its parent StorageClass will be looked up in this
898 mapping if it is supplied. If the mapping does not contain the
899 composite dataset type, or the mapping is not supplied, an exception
900 will be raised.
902 Returns
903 -------
904 types: `TaskDatasetTypes`
905 The dataset types used by this task.
907 Raises
908 ------
909 ValueError
910 Raised if dataset type connection definition differs from
911 registry definition.
912 LookupError
913 Raised if component parent StorageClass could not be determined
914 and storage_class_mapping does not contain the composite type, or
915 is set to None.
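Examples
--------
A hedged sketch; ``taskDef`` and ``registry`` are assumed to be an
existing `TaskDef` and `~lsst.daf.butler.Registry`:

>>> types = TaskDatasetTypes.fromTaskDef(
...     taskDef, registry=registry
... )  # doctest: +SKIP
>>> sorted(types.inputs.names)  # doctest: +SKIP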
916 """
918 def makeDatasetTypesSet(
919 connectionType: str,
920 is_input: bool,
921 freeze: bool = True,
922 ) -> NamedValueSet[DatasetType]:
923 """Constructs a set of true `DatasetType` objects
925 Parameters
926 ----------
927 connectionType : `str`
928 Name of the connection type to produce a set for, corresponds
929 to an attribute of type `list` on the connection class instance.
930 is_input : `bool`
931 If `True`, these are input dataset types; otherwise they are
932 output dataset types.
933 freeze : `bool`, optional
934 If `True`, call `NamedValueSet.freeze` on the object returned.
936 Returns
937 -------
938 datasetTypes : `NamedValueSet`
939 A set of all datasetTypes which correspond to the input
940 connection type specified in the connection class of this
941 `PipelineTask`
943 Raises
944 ------
945 ValueError
946 Raised if dataset type connection definition differs from
947 registry definition.
948 LookupError
949 Raised if component parent StorageClass could not be determined
950 and storage_class_mapping does not contain the composite type,
951 or is set to None.
953 Notes
954 -----
955 This function is a closure over the variables ``registry``,
956 ``taskDef``, and ``storage_class_mapping``.
957 """
958 datasetTypes = NamedValueSet[DatasetType]()
959 for c in iterConnections(taskDef.connections, connectionType):
960 dimensions = set(getattr(c, "dimensions", set()))
961 if "skypix" in dimensions:
962 try:
963 datasetType = registry.getDatasetType(c.name)
964 except LookupError as err:
965 raise LookupError(
966 f"DatasetType '{c.name}' referenced by "
967 f"{type(taskDef.connections).__name__} uses 'skypix' as a dimension "
968 "placeholder, but does not already exist in the registry. "
969 "Note that reference catalog names are now used as the dataset "
970 "type name instead of 'ref_cat'."
971 ) from err
972 rest1 = set(registry.dimensions.extract(dimensions - set(["skypix"])).names)
973 rest2 = set(
974 dim.name for dim in datasetType.dimensions if not isinstance(dim, SkyPixDimension)
975 )
976 if rest1 != rest2:
977 raise ValueError(
978 f"Non-skypix dimensions for dataset type {c.name} declared in "
979 f"connections ({rest1}) are inconsistent with those in "
980 f"registry's version of this dataset ({rest2})."
981 )
982 else:
983 # Component dataset types are not explicitly in the
984 # registry. This complicates consistency checks with
985 # registry and requires we work out the composite storage
986 # class.
987 registryDatasetType = None
988 try:
989 registryDatasetType = registry.getDatasetType(c.name)
990 except KeyError:
991 compositeName, componentName = DatasetType.splitDatasetTypeName(c.name)
992 if componentName:
993 if storage_class_mapping is None or compositeName not in storage_class_mapping:
994 raise LookupError(
995 "Component parent class cannot be determined, and "
996 "composite name was not in storage class mapping, or no "
997 "storage_class_mapping was supplied"
998 )
999 else:
1000 parentStorageClass = storage_class_mapping[compositeName]
1001 else:
1002 parentStorageClass = None
1003 datasetType = c.makeDatasetType(
1004 registry.dimensions, parentStorageClass=parentStorageClass
1005 )
1006 registryDatasetType = datasetType
1007 else:
1008 datasetType = c.makeDatasetType(
1009 registry.dimensions, parentStorageClass=registryDatasetType.parentStorageClass
1010 )
1012 if registryDatasetType and datasetType != registryDatasetType:
1013 # The dataset types differ but first check to see if
1014 # they are compatible before raising.
1015 if is_input:
1016 # This DatasetType must be compatible on get.
1017 is_compatible = datasetType.is_compatible_with(registryDatasetType)
1018 else:
1019 # Has to be able to be converted to the expected type
1020 # on put.
1021 is_compatible = registryDatasetType.is_compatible_with(datasetType)
1022 if is_compatible:
1023 # For inputs we want the pipeline to use the
1024 # pipeline definition, for outputs it should use
1025 # the registry definition.
1026 if not is_input:
1027 datasetType = registryDatasetType
1028 _LOG.debug(
1029 "Dataset types differ (task %s != registry %s) but are compatible"
1030 " for %s in %s.",
1031 datasetType,
1032 registryDatasetType,
1033 "input" if is_input else "output",
1034 taskDef.label,
1035 )
1036 else:
1037 try:
1038 # Explicitly check for storage class just to
1039 # make more specific message.
1040 _ = datasetType.storageClass
1041 except KeyError:
1042 raise ValueError(
1043 "Storage class does not exist for supplied dataset type "
1044 f"{datasetType} for {taskDef.label}."
1045 ) from None
1046 raise ValueError(
1047 f"Supplied dataset type ({datasetType}) inconsistent with "
1048 f"registry definition ({registryDatasetType}) "
1049 f"for {taskDef.label}."
1050 )
1051 datasetTypes.add(datasetType)
1052 if freeze:
1053 datasetTypes.freeze()
1054 return datasetTypes
1056 # optionally add initOutput dataset for config
1057 initOutputs = makeDatasetTypesSet("initOutputs", is_input=False, freeze=False)
1058 if include_configs:
1059 initOutputs.add(
1060 DatasetType(
1061 taskDef.configDatasetName,
1062 registry.dimensions.empty,
1063 storageClass="Config",
1064 )
1065 )
1066 initOutputs.freeze()
1068 # optionally add output dataset for metadata
1069 outputs = makeDatasetTypesSet("outputs", is_input=False, freeze=False)
1070 if taskDef.metadataDatasetName is not None:
1071 # Metadata is supposed to be of the TaskMetadata type, its
1072 # dimensions correspond to a task quantum.
1073 dimensions = registry.dimensions.extract(taskDef.connections.dimensions)
1075 # Allow the storage class definition to be read from the existing
1076 # dataset type definition if present.
1077 try:
1078 current = registry.getDatasetType(taskDef.metadataDatasetName)
1079 except KeyError:
1080 # No previous definition so use the default.
1081 storageClass = "TaskMetadata" if _TASK_METADATA_TYPE is TaskMetadata else "PropertySet"
1082 else:
1083 storageClass = current.storageClass.name
1085 outputs.update({DatasetType(taskDef.metadataDatasetName, dimensions, storageClass)})
1086 if taskDef.logOutputDatasetName is not None:
1087 # Log output dimensions correspond to a task quantum.
1088 dimensions = registry.dimensions.extract(taskDef.connections.dimensions)
1089 outputs.update({DatasetType(taskDef.logOutputDatasetName, dimensions, "ButlerLogRecords")})
1091 outputs.freeze()
1093 inputs = makeDatasetTypesSet("inputs", is_input=True)
1094 queryConstraints = NamedValueSet(
1095 inputs[c.name]
1096 for c in cast(Iterable[Input], iterConnections(taskDef.connections, "inputs"))
1097 if not c.deferGraphConstraint
1098 )
1100 return cls(
1101 initInputs=makeDatasetTypesSet("initInputs", is_input=True),
1102 initOutputs=initOutputs,
1103 inputs=inputs,
1104 queryConstraints=queryConstraints,
1105 prerequisites=makeDatasetTypesSet("prerequisiteInputs", is_input=True),
1106 outputs=outputs,
1107 )
1110@dataclass(frozen=True)
1111class PipelineDatasetTypes:
1112 """An immutable struct that classifies the dataset types used in a
1113 `Pipeline`.
1114 """
1116 packagesDatasetName: ClassVar[str] = "packages"
1117 """Name of a dataset type used to save package versions.
1118 """
1120 initInputs: NamedValueSet[DatasetType]
1121 """Dataset types that are needed as inputs in order to construct the Tasks
1122 in this Pipeline.
1124 This does not include dataset types that are produced when constructing
1125 other Tasks in the Pipeline (these are classified as `initIntermediates`).
1126 """
1128 initOutputs: NamedValueSet[DatasetType]
1129 """Dataset types that may be written after constructing the Tasks in this
1130 Pipeline.
1132 This does not include dataset types that are also used as inputs when
1133 constructing other Tasks in the Pipeline (these are classified as
1134 `initIntermediates`).
1135 """
1137 initIntermediates: NamedValueSet[DatasetType]
1138 """Dataset types that are both used when constructing one or more Tasks
1139 in the Pipeline and produced as a side-effect of constructing another
1140 Task in the Pipeline.
1141 """
1143 inputs: NamedValueSet[DatasetType]
1144 """Dataset types that are regular inputs for the full pipeline.
1146 If an input dataset needed for a Quantum cannot be found in the input
1147 collection(s), that Quantum (and all dependent Quanta) will not be
1148 produced.
1149 """
1151 queryConstraints: NamedValueSet[DatasetType]
1152 """Regular inputs that should be used as constraints on the initial
1153 QuantumGraph generation data ID query, according to their tasks
1154 (`NamedValueSet`).
1155 """
1157 prerequisites: NamedValueSet[DatasetType]
1158 """Dataset types that are prerequisite inputs for the full Pipeline.
1160 Prerequisite inputs must exist in the input collection(s) before the
1161 pipeline is run, but do not constrain the graph - if a prerequisite is
1162 missing for a Quantum, `PrerequisiteMissingError` is raised.
1164 Prerequisite inputs are not resolved until the second stage of
1165 QuantumGraph generation.
1166 """
1168 intermediates: NamedValueSet[DatasetType]
1169 """Dataset types that are output by one Task in the Pipeline and consumed
1170 as inputs by one or more other Tasks in the Pipeline.
1171 """
1173 outputs: NamedValueSet[DatasetType]
1174 """Dataset types that are output by a Task in the Pipeline and not consumed
1175 by any other Task in the Pipeline.
1176 """
1178 byTask: Mapping[str, TaskDatasetTypes]
1179 """Per-Task dataset types, keyed by label in the `Pipeline`.
1181 This is guaranteed to be zip-iterable with the `Pipeline` itself (assuming
1182 neither has been modified since the dataset types were extracted, of
1183 course).
1184 """
1186 @classmethod
1187 def fromPipeline(
1188 cls,
1189 pipeline: Union[Pipeline, Iterable[TaskDef]],
1190 *,
1191 registry: Registry,
1192 include_configs: bool = True,
1193 include_packages: bool = True,
1194 ) -> PipelineDatasetTypes:
1195 """Extract and classify the dataset types from all tasks in a
1196 `Pipeline`.
1198 Parameters
1199 ----------
1200 pipeline: `Pipeline` or `Iterable` [ `TaskDef` ]
1201 A collection of tasks that can be run together.
1202 registry: `Registry`
1203 Registry used to construct normalized `DatasetType` objects and
1204 retrieve those that are incomplete.
1205 include_configs : `bool`, optional
1206 If `True` (default) include config dataset types as
1207 ``initOutputs``.
1208 include_packages : `bool`, optional
1209 If `True` (default) include the dataset type for software package
1210 versions in ``initOutputs``.
1212 Returns
1213 -------
1214 types: `PipelineDatasetTypes`
1215 The dataset types used by this `Pipeline`.
1217 Raises
1218 ------
1219 ValueError
1220 Raised if Tasks are inconsistent about which datasets are marked
1221 prerequisite. This indicates that the Tasks cannot be run as part
1222 of the same `Pipeline`.
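Examples
--------
A hedged sketch; ``pipeline`` and ``registry`` are assumed to be an
existing `Pipeline` and `~lsst.daf.butler.Registry`:

>>> dataset_types = PipelineDatasetTypes.fromPipeline(
...     pipeline, registry=registry
... )  # doctest: +SKIP
>>> sorted(dataset_types.byTask.keys())  # doctest: +SKIP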
1223 """
1224 allInputs = NamedValueSet[DatasetType]()
1225 allOutputs = NamedValueSet[DatasetType]()
1226 allInitInputs = NamedValueSet[DatasetType]()
1227 allInitOutputs = NamedValueSet[DatasetType]()
1228 prerequisites = NamedValueSet[DatasetType]()
1229 queryConstraints = NamedValueSet[DatasetType]()
1230 byTask = dict()
1231 if include_packages:
1232 allInitOutputs.add(
1233 DatasetType(
1234 cls.packagesDatasetName,
1235 registry.dimensions.empty,
1236 storageClass="Packages",
1237 )
1238 )
1239 # create a list of TaskDefs in case the input is a generator
1240 pipeline = list(pipeline)
1242 # collect all the output dataset types
1243 typeStorageclassMap: Dict[str, str] = {}
1244 for taskDef in pipeline:
1245 for outConnection in iterConnections(taskDef.connections, "outputs"):
1246 typeStorageclassMap[outConnection.name] = outConnection.storageClass
1248 for taskDef in pipeline:
1249 thisTask = TaskDatasetTypes.fromTaskDef(
1250 taskDef,
1251 registry=registry,
1252 include_configs=include_configs,
1253 storage_class_mapping=typeStorageclassMap,
1254 )
1255 allInitInputs.update(thisTask.initInputs)
1256 allInitOutputs.update(thisTask.initOutputs)
1257 allInputs.update(thisTask.inputs)
1258 # Inputs are query constraints if any task considers them a query
1259 # constraint.
1260 queryConstraints.update(thisTask.queryConstraints)
1261 prerequisites.update(thisTask.prerequisites)
1262 allOutputs.update(thisTask.outputs)
1263 byTask[taskDef.label] = thisTask
1264 if not prerequisites.isdisjoint(allInputs):
1265 raise ValueError(
1266 "{} marked as both prerequisites and regular inputs".format(
1267 {dt.name for dt in allInputs & prerequisites}
1268 )
1269 )
1270 if not prerequisites.isdisjoint(allOutputs):
1271 raise ValueError(
1272 "{} marked as both prerequisites and outputs".format(
1273 {dt.name for dt in allOutputs & prerequisites}
1274 )
1275 )
1276 # Make sure that components which are marked as inputs get treated as
1277 # intermediates if there is an output which produces the composite
1278 # containing the component
1279 intermediateComponents = NamedValueSet[DatasetType]()
1280 intermediateComposites = NamedValueSet[DatasetType]()
1281 outputNameMapping = {dsType.name: dsType for dsType in allOutputs}
1282 for dsType in allInputs:
1283 # get the name of a possible component
1284 name, component = dsType.nameAndComponent()
1285 # if there is a component name, that means this is a component
1286 # DatasetType, if there is an output which produces the parent of
1287 # this component, treat this input as an intermediate
1288 if component is not None:
1289 # This needs to be in this if block, because someone might have
1290 # a composite that is a pure input from existing data
1291 if name in outputNameMapping:
1292 intermediateComponents.add(dsType)
1293 intermediateComposites.add(outputNameMapping[name])
1295 def checkConsistency(a: NamedValueSet, b: NamedValueSet) -> None:
1296 common = a.names & b.names
1297 for name in common:
1298 # Any compatibility is allowed. This function does not know
1299 # if a dataset type is to be used for input or output.
1300 if not (a[name].is_compatible_with(b[name]) or b[name].is_compatible_with(a[name])):
1301 raise ValueError(f"Conflicting definitions for dataset type: {a[name]} != {b[name]}.")
1303 checkConsistency(allInitInputs, allInitOutputs)
1304 checkConsistency(allInputs, allOutputs)
1305 checkConsistency(allInputs, intermediateComposites)
1306 checkConsistency(allOutputs, intermediateComposites)
1308 def frozen(s: AbstractSet[DatasetType]) -> NamedValueSet[DatasetType]:
1309 assert isinstance(s, NamedValueSet)
1310 s.freeze()
1311 return s
1313 inputs = frozen(allInputs - allOutputs - intermediateComponents)
1315 return cls(
1316 initInputs=frozen(allInitInputs - allInitOutputs),
1317 initIntermediates=frozen(allInitInputs & allInitOutputs),
1318 initOutputs=frozen(allInitOutputs - allInitInputs),
1319 inputs=inputs,
1320 queryConstraints=frozen(queryConstraints & inputs),
1321 # If there are storage class differences in inputs and outputs
1322 # the intermediates have to choose priority. Here choose that
1323 # inputs to tasks must match the requested storage class by
1324 # applying the inputs over the top of the outputs.
1325 intermediates=frozen(allOutputs & allInputs | intermediateComponents),
1326 outputs=frozen(allOutputs - allInputs - intermediateComposites),
1327 prerequisites=frozen(prerequisites),
1328 byTask=MappingProxyType(byTask), # MappingProxyType -> frozen view of dict for immutability
1329 )
1331 @classmethod
1332 def initOutputNames(
1333 cls,
1334 pipeline: Union[Pipeline, Iterable[TaskDef]],
1335 *,
1336 include_configs: bool = True,
1337 include_packages: bool = True,
1338 ) -> Iterator[str]:
1339 """Return the names of dataset types ot task initOutputs, Configs,
1340 and package versions for a pipeline.
1342 Parameters
1343 ----------
1344 pipeline: `Pipeline` or `Iterable` [ `TaskDef` ]
1345 A `Pipeline` instance or collection of `TaskDef` instances.
1346 include_configs : `bool`, optional
1347 If `True` (default) include config dataset types.
1348 include_packages : `bool`, optional
1349 If `True` (default) include the dataset type for package versions.
1351 Yields
1352 ------
1353 datasetTypeName : `str`
1354 Name of the dataset type.
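Examples
--------
A brief sketch; ``pipeline`` is assumed to be an existing `Pipeline`:

>>> names = set(
...     PipelineDatasetTypes.initOutputNames(pipeline)
... )  # doctest: +SKIP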
1355 """
1356 if include_packages:
1357 # Package versions dataset type
1358 yield cls.packagesDatasetName
1360 if isinstance(pipeline, Pipeline):
1361 pipeline = pipeline.toExpandedPipeline()
1363 for taskDef in pipeline:
1364 # all task InitOutputs
1365 for name in taskDef.connections.initOutputs:
1366 attribute = getattr(taskDef.connections, name)
1367 yield attribute.name
1369 # config dataset name
1370 if include_configs:
1371 yield taskDef.configDatasetName