Coverage for python/lsst/pipe/base/pipeline.py: 19%
441 statements
1# This file is part of pipe_base.
2#
3# Developed for the LSST Data Management System.
4# This product includes software developed by the LSST Project
5# (http://www.lsst.org).
6# See the COPYRIGHT file at the top-level directory of this distribution
7# for details of code ownership.
8#
9# This program is free software: you can redistribute it and/or modify
10# it under the terms of the GNU General Public License as published by
11# the Free Software Foundation, either version 3 of the License, or
12# (at your option) any later version.
13#
14# This program is distributed in the hope that it will be useful,
15# but WITHOUT ANY WARRANTY; without even the implied warranty of
16# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
17# GNU General Public License for more details.
18#
19# You should have received a copy of the GNU General Public License
20# along with this program. If not, see <http://www.gnu.org/licenses/>.
21from __future__ import annotations
23"""Module defining Pipeline class and related methods.
24"""
26__all__ = ["Pipeline", "TaskDef", "TaskDatasetTypes", "PipelineDatasetTypes", "LabelSpecifier"]
28import copy
29import logging
30import os
31import re
32import urllib.parse
34# -------------------------------
35# Imports of standard modules --
36# -------------------------------
37from dataclasses import dataclass
38from types import MappingProxyType
39from typing import (
40 TYPE_CHECKING,
41 AbstractSet,
42 Callable,
43 ClassVar,
44 Dict,
45 Generator,
46 Iterable,
47 Iterator,
48 Mapping,
49 Optional,
50 Set,
51 Tuple,
52 Type,
53 Union,
54 cast,
55)
57# -----------------------------
58# Imports for other modules --
59from lsst.daf.butler import DatasetType, NamedValueSet, Registry, SkyPixDimension
60from lsst.resources import ResourcePath, ResourcePathExpression
61from lsst.utils import doImportType
62from lsst.utils.introspection import get_full_type_name
64from . import pipelineIR, pipeTools
65from ._task_metadata import TaskMetadata
66from .config import PipelineTaskConfig
67from .configOverrides import ConfigOverrides
68from .connections import iterConnections
69from .pipelineTask import PipelineTask
70from .task import _TASK_METADATA_TYPE
72if TYPE_CHECKING: # Imports needed only for type annotations; may be circular.
73 from lsst.obs.base import Instrument
74 from lsst.pex.config import Config
76# ----------------------------------
77# Local non-exported definitions --
78# ----------------------------------
80_LOG = logging.getLogger(__name__)
82# ------------------------
83# Exported definitions --
84# ------------------------
87@dataclass
88class LabelSpecifier:
89 """A structure to specify a subset of labels to load
91 This structure may contain a set of labels to be used in subsetting a
92 pipeline, or a beginning and end point. Beginning or end may be empty,
93 in which case the range will be a half open interval. Unlike python
94 iteration bounds, end bounds are *INCLUDED*. Note that range based
95 selection is not well defined for pipelines that are not linear in nature,
96 and correct behavior is not guaranteed, or may vary from run to run.
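Examples
--------
A minimal sketch of the two supported forms; the task labels used here
("isr" and "calibrate") are placeholders:

>>> only_these = LabelSpecifier(labels={"isr", "calibrate"})
>>> bounded = LabelSpecifier(begin="isr", end="calibrate")

Supplying both a label set and a begin/end bound raises `ValueError`.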
97 """
99 labels: Optional[Set[str]] = None
100 begin: Optional[str] = None
101 end: Optional[str] = None
103 def __post_init__(self) -> None:
104 if self.labels is not None and (self.begin or self.end):
105 raise ValueError(
106 "This struct can only be initialized with a labels set or a begin (and/or) end specifier"
107 )
110class TaskDef:
111 """TaskDef is a collection of information about task needed by Pipeline.
113 The information includes task name, configuration object and optional
114 task class. This class is just a collection of attributes and it exposes
115 all of them so that attributes could potentially be modified in place
116 (e.g. if configuration needs extra overrides).
118 Attributes
119 ----------
120 taskName : `str`, optional
121 The fully-qualified `PipelineTask` class name. If not provided,
122 ``taskClass`` must be.
123 config : `lsst.pipe.base.config.PipelineTaskConfig`, optional
124 Instance of the configuration class corresponding to this task class,
125 usually with all overrides applied. This config will be frozen. If
126 not provided, ``taskClass`` must be provided and
127 ``taskClass.ConfigClass()`` will be used.
128 taskClass : `type`, optional
129 `PipelineTask` class object; if provided and ``taskName`` is as well,
130 the caller guarantees that they are consistent. If not provided,
131 ``taskName`` is used to import the type.
132 label : `str`, optional
133 Task label, usually a short string unique in a pipeline. If not
134 provided, ``taskClass`` must be, and ``taskClass._DefaultName`` will
135 be used.
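Examples
--------
A hedged sketch; ``lsst.example.ExampleTask`` stands in for any importable
`PipelineTask` subclass and is not a real class name:

>>> taskDef = TaskDef(taskName="lsst.example.ExampleTask", label="example")
>>> taskDef.configDatasetName
'example_config'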
136 """
138 def __init__(
139 self,
140 taskName: Optional[str] = None,
141 config: Optional[PipelineTaskConfig] = None,
142 taskClass: Optional[Type[PipelineTask]] = None,
143 label: Optional[str] = None,
144 ):
145 if taskName is None:
146 if taskClass is None:
147 raise ValueError("At least one of `taskName` and `taskClass` must be provided.")
148 taskName = get_full_type_name(taskClass)
149 elif taskClass is None:
150 taskClass = doImportType(taskName)
151 if config is None:
152 if taskClass is None:
153 raise ValueError("`taskClass` must be provided if `config` is not.")
154 config = taskClass.ConfigClass()
155 if label is None:
156 if taskClass is None:
157 raise ValueError("`taskClass` must be provided if `label` is not.")
158 label = taskClass._DefaultName
159 self.taskName = taskName
160 try:
161 config.validate()
162 except Exception:
163 _LOG.error("Configuration validation failed for task %s (%s)", label, taskName)
164 raise
165 config.freeze()
166 self.config = config
167 self.taskClass = taskClass
168 self.label = label
169 self.connections = config.connections.ConnectionsClass(config=config)
171 @property
172 def configDatasetName(self) -> str:
173 """Name of a dataset type for configuration of this task (`str`)"""
174 return self.label + "_config"
176 @property
177 def metadataDatasetName(self) -> Optional[str]:
178 """Name of a dataset type for metadata of this task, `None` if
179 metadata is not to be saved (`str`)
180 """
181 if self.config.saveMetadata:
182 return self.makeMetadataDatasetName(self.label)
183 else:
184 return None
186 @classmethod
187 def makeMetadataDatasetName(cls, label: str) -> str:
188 """Construct the name of the dataset type for metadata for a task.
190 Parameters
191 ----------
192 label : `str`
193 Label for the task within its pipeline.
195 Returns
196 -------
197 name : `str`
198 Name of the task's metadata dataset type.
199 """
200 return f"{label}_metadata"
202 @property
203 def logOutputDatasetName(self) -> Optional[str]:
204 """Name of a dataset type for log output from this task, `None` if
205 logs are not to be saved (`str`)
206 """
207 if cast(PipelineTaskConfig, self.config).saveLogOutput:
208 return self.label + "_log"
209 else:
210 return None
212 def __str__(self) -> str:
213 rep = "TaskDef(" + self.taskName
214 if self.label:
215 rep += ", label=" + self.label
216 rep += ")"
217 return rep
219 def __eq__(self, other: object) -> bool:
220 if not isinstance(other, TaskDef):
221 return False
222 # This does not consider equality of configs when determining equality
223 # as config equality is a difficult thing to define. Should be updated
224 # after DM-27847
225 return self.taskClass == other.taskClass and self.label == other.label
227 def __hash__(self) -> int:
228 return hash((self.taskClass, self.label))
230 @classmethod
231 def _unreduce(cls, taskName: str, config: PipelineTaskConfig, label: str) -> TaskDef:
232 """Custom callable for unpickling.
234 All arguments are forwarded directly to the constructor; this
235 trampoline is only needed because ``__reduce__`` callables can't be
236 called with keyword arguments.
237 """
238 return cls(taskName=taskName, config=config, label=label)
240 def __reduce__(self) -> Tuple[Callable[[str, PipelineTaskConfig, str], TaskDef], Tuple[str, Config, str]]:
241 return (self._unreduce, (self.taskName, self.config, self.label))
244class Pipeline:
245 """A `Pipeline` is a representation of a series of tasks to run, and the
246 configuration for those tasks.
248 Parameters
249 ----------
250 description : `str`
250 A description of what this pipeline does.
252 """
254 def __init__(self, description: str):
255 pipeline_dict = {"description": description, "tasks": {}}
256 self._pipelineIR = pipelineIR.PipelineIR(pipeline_dict)
258 @classmethod
259 def fromFile(cls, filename: str) -> Pipeline:
260 """Load a pipeline defined in a pipeline yaml file.
262 Parameters
263 ----------
264 filename: `str`
265 A path that points to a pipeline defined in yaml format. This
266 filename may also supply additional labels to be used in
267 subsetting the loaded Pipeline. These labels are separated from
268 the path by a \\#, and may be specified as a comma separated
269 list, or a range denoted as beginning..end. Beginning or end may
270 be empty, in which case the range will be a half open interval.
271 Unlike python iteration bounds, end bounds are *INCLUDED*. Note
272 that range based selection is not well defined for pipelines that
273 are not linear in nature, and correct behavior is not guaranteed,
274 or may vary from run to run.
276 Returns
277 -------
278 pipeline: `Pipeline`
279 The pipeline loaded from the specified location, with appropriate (if
280 any) subsetting applied.
282 Notes
283 -----
284 This method attempts to prune any contracts that contain labels which
285 are not in the declared subset of labels. This pruning is done using
286 string-based matching due to the nature of contracts, and may prune more
287 than it should.
288 """
289 return cls.from_uri(filename)
291 @classmethod
292 def from_uri(cls, uri: ResourcePathExpression) -> Pipeline:
293 """Load a pipeline defined in a pipeline yaml file at a location
294 specified by a URI.
296 Parameters
297 ----------
298 uri: convertible to `ResourcePath`
299 If a string is supplied this should be a URI path that points to a
300 pipeline defined in yaml format, either as a direct path to the
301 yaml file, or as a directory containing a "pipeline.yaml" file (the
302 form used by `write_to_uri` with ``expand=True``). This uri may
303 also supply additional labels to be used in subsetting the loaded
304 Pipeline. These labels are separated from the path by a \\#, and
305 may be specified as a comma separated list, or a range denoted as
306 beginning..end. Beginning or end may be empty, in which case the
307 range will be a half open interval. Unlike python iteration bounds,
308 end bounds are *INCLUDED*. Note that range based selection is not
309 well defined for pipelines that are not linear in nature, and
310 correct behavior is not guaranteed, or may vary from run to run.
311 The same specifiers can be used with a `ResourcePath` object, by
312 being the sole contents of the fragment attribute.
314 Returns
315 -------
316 pipeline: `Pipeline`
317 The pipeline loaded from the specified location, with appropriate (if
318 any) subsetting applied.
320 Notes
321 -----
322 This method attempts to prune any contracts that contain labels which
323 are not in the declared subset of labels. This pruning is done using
324 string-based matching due to the nature of contracts, and may prune more
325 than it should.
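Examples
--------
Illustrative only; the path and task labels are placeholders:

>>> p = Pipeline.from_uri("/path/to/pipeline.yaml")
>>> # Keep only the tasks labeled "isr" and "calibrate".
>>> p = Pipeline.from_uri("/path/to/pipeline.yaml#isr,calibrate")
>>> # Keep the range of tasks from "isr" through "calibrate" (inclusive).
>>> p = Pipeline.from_uri("/path/to/pipeline.yaml#isr..calibrate")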
326 """
327 # Split up the uri and any labels that were supplied
328 uri, label_specifier = cls._parse_file_specifier(uri)
329 pipeline: Pipeline = cls.fromIR(pipelineIR.PipelineIR.from_uri(uri))
331 # If there are labels supplied, only keep those
332 if label_specifier is not None:
333 pipeline = pipeline.subsetFromLabels(label_specifier)
334 return pipeline
336 def subsetFromLabels(self, labelSpecifier: LabelSpecifier) -> Pipeline:
337 """Subset a pipeline to contain only labels specified in labelSpecifier
339 Parameters
340 ----------
341 labelSpecifier : `LabelSpecifier`
342 Object containing labels that describes how to subset a pipeline.
344 Returns
345 -------
346 pipeline : `Pipeline`
347 A new pipeline object that is a subset of the old pipeline
349 Raises
350 ------
351 ValueError
352 Raised if there is an issue with specified labels
354 Notes
355 -----
356 This method attempts to prune any contracts that contain labels which
357 are not in the declared subset of labels. This pruning is done using
358 string-based matching due to the nature of contracts, and may prune more
359 than it should.
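Examples
--------
A sketch assuming ``pipeline`` is an existing `Pipeline` whose tasks include
the placeholder labels "isr" and "calibrate":

>>> subset = pipeline.subsetFromLabels(LabelSpecifier(labels={"isr"}))
>>> bounded = pipeline.subsetFromLabels(
...     LabelSpecifier(begin="isr", end="calibrate")
... )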
360 """
361 # Labels supplied as a set
362 if labelSpecifier.labels:
363 labelSet = labelSpecifier.labels
364 # Labels supplied as a range, first create a list of all the labels
365 # in the pipeline sorted according to task dependency. Then only
366 # keep labels that lie between the supplied bounds
367 else:
368 # Create a copy of the pipeline to use when assessing the label
369 # ordering. Use a dict for fast searching while preserving order.
370 # Remove contracts so they do not fail in the expansion step. This
371 # is needed because a user may only configure the tasks they intend
372 # to run, which may cause some contracts to fail if they will later
373 # be dropped
374 pipeline = copy.deepcopy(self)
375 pipeline._pipelineIR.contracts = []
376 labels = {taskdef.label: True for taskdef in pipeline.toExpandedPipeline()}
378 # Verify the bounds are in the labels
379 if labelSpecifier.begin is not None:
380 if labelSpecifier.begin not in labels:
381 raise ValueError(
382 f"Beginning of range subset, {labelSpecifier.begin}, not found in pipeline definition"
383 )
384 if labelSpecifier.end is not None:
385 if labelSpecifier.end not in labels:
386 raise ValueError(
387 f"End of range subset, {labelSpecifier.end}, not found in pipeline definition"
388 )
390 labelSet = set()
391 for label in labels:
392 if labelSpecifier.begin is not None:
393 if label != labelSpecifier.begin:
394 continue
395 else:
396 labelSpecifier.begin = None
397 labelSet.add(label)
398 if labelSpecifier.end is not None and label == labelSpecifier.end:
399 break
400 return Pipeline.fromIR(self._pipelineIR.subset_from_labels(labelSet))
402 @staticmethod
403 def _parse_file_specifier(uri: ResourcePathExpression) -> Tuple[ResourcePath, Optional[LabelSpecifier]]:
404 """Split appart a uri and any possible label subsets"""
405 if isinstance(uri, str):
406 # This is to support legacy pipelines during transition
407 uri, num_replace = re.subn("[:](?!\\/\\/)", "#", uri)
408 if num_replace:
409 raise ValueError(
410 f"The pipeline file {uri} seems to use the legacy :"
411 " to separate labels, please use # instead."
412 )
413 if uri.count("#") > 1:
414 raise ValueError("Only one set of labels is allowed when specifying a pipeline to load")
415 # Everything else can be converted directly to ResourcePath.
416 uri = ResourcePath(uri)
417 label_subset = uri.fragment or None
419 specifier: Optional[LabelSpecifier]
420 if label_subset is not None:
421 label_subset = urllib.parse.unquote(label_subset)
422 args: Dict[str, Union[Set[str], str, None]]
423 # labels supplied as a list
424 if "," in label_subset:
425 if ".." in label_subset:
426 raise ValueError(
427 "Can only specify a list of labels or a rangewhen loading a Pipline not both"
428 )
429 args = {"labels": set(label_subset.split(","))}
430 # labels supplied as a range
431 elif ".." in label_subset:
432 # Try to de-structure the labelSubset, this will fail if more
433 # than one range is specified
434 begin, end, *rest = label_subset.split("..")
435 if rest:
436 raise ValueError("Only one range can be specified when loading a pipeline")
437 args = {"begin": begin if begin else None, "end": end if end else None}
438 # Assume anything else is a single label
439 else:
440 args = {"labels": {label_subset}}
442 # MyPy doesn't like how cavalier kwarg construction is with types.
443 specifier = LabelSpecifier(**args) # type: ignore
444 else:
445 specifier = None
447 return uri, specifier
449 @classmethod
450 def fromString(cls, pipeline_string: str) -> Pipeline:
451 """Create a pipeline from string formatted as a pipeline document.
453 Parameters
454 ----------
455 pipeline_string : `str`
456 A string formatted like a pipeline document.
458 Returns
459 -------
460 pipeline: `Pipeline`
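Examples
--------
A hedged sketch of a minimal pipeline document; the task label and class name
are placeholders:

>>> yaml_str = "\n".join((
...     "description: An example pipeline",
...     "tasks:",
...     "  example:",
...     "    class: lsst.example.ExampleTask",
... ))
>>> pipeline = Pipeline.fromString(yaml_str)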
461 """
462 pipeline = cls.fromIR(pipelineIR.PipelineIR.from_string(pipeline_string))
463 return pipeline
465 @classmethod
466 def fromIR(cls, deserialized_pipeline: pipelineIR.PipelineIR) -> Pipeline:
467 """Create a pipeline from an already created `PipelineIR` object.
469 Parameters
470 ----------
471 deserialized_pipeline: `PipelineIR`
472 An already created pipeline intermediate representation object
474 Returns
475 -------
476 pipeline: `Pipeline`
477 """
478 pipeline = cls.__new__(cls)
479 pipeline._pipelineIR = deserialized_pipeline
480 return pipeline
482 @classmethod
483 def fromPipeline(cls, pipeline: Pipeline) -> Pipeline:
484 """Create a new pipeline by copying an already existing `Pipeline`.
486 Parameters
487 ----------
488 pipeline: `Pipeline`
489 An existing `Pipeline` object to copy.
491 Returns
492 -------
493 pipeline: `Pipeline`
494 """
495 return cls.fromIR(copy.deepcopy(pipeline._pipelineIR))
497 def __str__(self) -> str:
498 return str(self._pipelineIR)
500 def mergePipeline(self, pipeline: Pipeline) -> None:
501 """Merge another in-memory `Pipeline` object into this one.
503 This merges another pipeline into this object, as if it were declared
504 in the import block of the yaml definition of this pipeline. This
505 modifies this pipeline in place.
507 Parameters
508 ----------
509 pipeline : `Pipeline`
510 The `Pipeline` object that is to be merged into this object.
511 """
512 self._pipelineIR.merge_pipelines((pipeline._pipelineIR,))
514 def addLabelToSubset(self, subset: str, label: str) -> None:
515 """Add a task label from the specified subset.
517 Parameters
518 ----------
519 subset : `str`
520 The labeled subset to modify
521 label : `str`
522 The task label to add to the specified subset.
524 Raises
525 ------
526 ValueError
527 Raised if the specified subset does not exist within the pipeline.
528 Raised if the specified label does not exist within the pipeline.
529 """
530 if label not in self._pipelineIR.tasks:
531 raise ValueError(f"Label {label} does not appear within the pipeline")
532 if subset not in self._pipelineIR.labeled_subsets:
533 raise ValueError(f"Subset {subset} does not appear within the pipeline")
534 self._pipelineIR.labeled_subsets[subset].subset.add(label)
536 def removeLabelFromSubset(self, subset: str, label: str) -> None:
537 """Remove a task label from the specified subset.
539 Parameters
540 ----------
541 subset : `str`
542 The labeled subset to modify
543 label : `str`
544 The task label to remove from the specified subset.
546 Raises
547 ------
548 ValueError
549 Raised if the specified subset does not exist in the pipeline.
550 Raised if the specified label does not exist within the specified
551 subset.
552 """
553 if subset not in self._pipelineIR.labeled_subsets:
554 raise ValueError(f"Subset {subset} does not appear within the pipeline")
555 if label not in self._pipelineIR.labeled_subsets[subset].subset:
556 raise ValueError(f"Label {label} does not appear within the pipeline")
557 self._pipelineIR.labeled_subsets[subset].subset.remove(label)
559 def findSubsetsWithLabel(self, label: str) -> set[str]:
560 """Find any subsets which may contain the specified label.
562 This function returns the names of subsets which contain the specified
563 label. May return an empty set if there are no subsets, or no subsets
564 containing the specified label.
566 Parameters
567 ----------
568 label : `str`
569 The task label to use in membership check
571 Returns
572 -------
573 subsets : `set` of `str`
574 Returns a set (possibly empty) of subset names which contain the
575 specified label.
577 Raises
578 ------
579 ValueError
580 Raised if the specified label does not exist within this pipeline.
581 """
582 results = set()
583 if label not in self._pipelineIR.tasks:
584 raise ValueError(f"Label {label} does not appear within the pipeline")
585 for subset in self._pipelineIR.labeled_subsets.values():
586 if label in subset.subset:
587 results.add(subset.label)
588 return results
590 def addInstrument(self, instrument: Union[Instrument, str]) -> None:
591 """Add an instrument to the pipeline, or replace an instrument that is
592 already defined.
594 Parameters
595 ----------
596 instrument : `~lsst.obs.base.Instrument` or `str`
597 Either a derived class object of `lsst.obs.base.Instrument` or
598 a string corresponding to a fully qualified
599 `lsst.obs.base.Instrument` name.
600 """
601 if isinstance(instrument, str):
602 pass
603 else:
604 # TODO: assume that this is a subclass of Instrument, no type
605 # checking
606 instrument = get_full_type_name(instrument)
607 self._pipelineIR.instrument = instrument
609 def getInstrument(self) -> Optional[str]:
610 """Get the instrument from the pipeline.
612 Returns
613 -------
614 instrument : `str` or `None`
615 The fully qualified name of a `lsst.obs.base.Instrument` subclass,
616 or `None` if the pipeline does not have an instrument.
617 """
618 return self._pipelineIR.instrument
620 def addTask(self, task: Union[Type[PipelineTask], str], label: str) -> None:
621 """Add a new task to the pipeline, or replace a task that is already
622 associated with the supplied label.
624 Parameters
625 ----------
626 task: `PipelineTask` or `str`
627 Either a derived class object of a `PipelineTask` or a string
628 corresponding to a fully qualified `PipelineTask` name.
629 label: `str`
630 A label that is used to identify the `PipelineTask` being added
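Examples
--------
Both accepted forms, using placeholder names; ``ExampleTask`` is assumed to be
an imported `PipelineTask` subclass:

>>> pipeline = Pipeline("An example pipeline")
>>> pipeline.addTask("lsst.example.ExampleTask", "exampleByName")
>>> pipeline.addTask(ExampleTask, "exampleByClass")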
631 """
632 if isinstance(task, str):
633 taskName = task
634 elif issubclass(task, PipelineTask):
635 taskName = get_full_type_name(task)
636 else:
637 raise ValueError(
638 "task must be either a child class of PipelineTask or a string containing"
639 " a fully qualified name to one"
640 )
641 if not label:
642 # in some cases (e.g. with a command line-generated pipeline) tasks
643 # can be defined without a label, which is not acceptable; use the
644 # task's _DefaultName in that case
645 if isinstance(task, str):
646 task_class = doImportType(task)
647 label = task_class._DefaultName
648 self._pipelineIR.tasks[label] = pipelineIR.TaskIR(label, taskName)
650 def removeTask(self, label: str) -> None:
651 """Remove a task from the pipeline.
653 Parameters
654 ----------
655 label : `str`
656 The label used to identify the task that is to be removed
658 Raises
659 ------
660 KeyError
661 If no task with that label exists in the pipeline
663 """
664 self._pipelineIR.tasks.pop(label)
666 def addConfigOverride(self, label: str, key: str, value: object) -> None:
667 """Apply single config override.
669 Parameters
670 ----------
671 label : `str`
672 Label of the task.
673 key: `str`
674 Fully-qualified field name.
675 value : object
676 Value to be given to a field.
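Examples
--------
A sketch assuming ``pipeline`` contains a task labeled "example" whose config
has a field named ``someField`` (all placeholder names):

>>> pipeline.addConfigOverride("example", "someField", 42)
>>> pipeline.addConfigFile("example", "/path/to/overrides.py")
>>> pipeline.addConfigPython("example", "config.someField = 42")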
677 """
678 self._addConfigImpl(label, pipelineIR.ConfigIR(rest={key: value}))
680 def addConfigFile(self, label: str, filename: str) -> None:
681 """Add overrides from a specified file.
683 Parameters
684 ----------
685 label : `str`
686 The label used to identify the task associated with config to
687 modify
688 filename : `str`
689 Path to the override file.
690 """
691 self._addConfigImpl(label, pipelineIR.ConfigIR(file=[filename]))
693 def addConfigPython(self, label: str, pythonString: str) -> None:
694 """Add Overrides by running a snippet of python code against a config.
696 Parameters
697 ----------
698 label : `str`
699 The label used to identify the task associated with config to
700 modify.
701 pythonString: `str`
702 A string which is valid python code to be executed. This is done
703 with config as the only local accessible value.
704 """
705 self._addConfigImpl(label, pipelineIR.ConfigIR(python=pythonString))
707 def _addConfigImpl(self, label: str, newConfig: pipelineIR.ConfigIR) -> None:
708 if label == "parameters":
709 if newConfig.rest.keys() - self._pipelineIR.parameters.mapping.keys():
710 raise ValueError("Cannot override parameters that are not defined in pipeline")
711 self._pipelineIR.parameters.mapping.update(newConfig.rest)
712 if newConfig.file:
713 raise ValueError("Setting parameters section with config file is not supported")
714 if newConfig.python:
715 raise ValueError("Setting parameters section using a python block is unsupported")
716 return
717 if label not in self._pipelineIR.tasks:
718 raise LookupError(f"There are no tasks labeled '{label}' in the pipeline")
719 self._pipelineIR.tasks[label].add_or_update_config(newConfig)
721 def write_to_uri(self, uri: ResourcePathExpression) -> None:
722 """Write the pipeline to a file or directory.
724 Parameters
725 ----------
726 uri : convertible to `ResourcePath`
727 URI to write to; may have any scheme with `ResourcePath` write
728 support or no scheme for a local file/directory. Should have a
729 ``.yaml`` extension.
730 """
731 self._pipelineIR.write_to_uri(uri)
733 def toExpandedPipeline(self) -> Generator[TaskDef, None, None]:
734 """Returns a generator of TaskDefs which can be used to create quantum
735 graphs.
737 Returns
738 -------
739 generator : generator of `TaskDef`
740 The generator returned will be the sorted iterator of tasks which
741 are to be used in constructing a quantum graph.
743 Raises
744 ------
745 NotImplementedError
746 Raised if a dataId is supplied in a config block. This is in place
747 for future use.
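Examples
--------
Iterating over a `Pipeline` is equivalent to calling this method; ``pipeline``
is assumed to be an existing `Pipeline`:

>>> for taskDef in pipeline.toExpandedPipeline():
...     print(taskDef.label, taskDef.taskName)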
748 """
749 taskDefs = []
750 for label in self._pipelineIR.tasks:
751 taskDefs.append(self._buildTaskDef(label))
753 # lets evaluate the contracts
754 if self._pipelineIR.contracts is not None:
755 label_to_config = {x.label: x.config for x in taskDefs}
756 for contract in self._pipelineIR.contracts:
757 # execute this in its own line so it can raise a good error
758 # message if there were problems with the eval
759 success = eval(contract.contract, None, label_to_config)
760 if not success:
761 extra_info = f": {contract.msg}" if contract.msg is not None else ""
762 raise pipelineIR.ContractError(
763 f"Contract(s) '{contract.contract}' were not satisfied{extra_info}"
764 )
766 taskDefs = sorted(taskDefs, key=lambda x: x.label)
767 yield from pipeTools.orderPipeline(taskDefs)
769 def _buildTaskDef(self, label: str) -> TaskDef:
770 if (taskIR := self._pipelineIR.tasks.get(label)) is None:
771 raise NameError(f"Label {label} does not appear in this pipeline")
772 taskClass: Type[PipelineTask] = doImportType(taskIR.klass)
773 taskName = get_full_type_name(taskClass)
774 config = taskClass.ConfigClass()
775 overrides = ConfigOverrides()
776 if self._pipelineIR.instrument is not None:
777 overrides.addInstrumentOverride(self._pipelineIR.instrument, taskClass._DefaultName)
778 if taskIR.config is not None:
779 for configIR in (configIr.formatted(self._pipelineIR.parameters) for configIr in taskIR.config):
780 if configIR.dataId is not None:
781 raise NotImplementedError(
782 "Specializing a config on a partial data id is not yet "
783 "supported in Pipeline definition"
784 )
785 # only apply override if it applies to everything
786 if configIR.dataId is None:
787 if configIR.file:
788 for configFile in configIR.file:
789 overrides.addFileOverride(os.path.expandvars(configFile))
790 if configIR.python is not None:
791 overrides.addPythonOverride(configIR.python)
792 for key, value in configIR.rest.items():
793 overrides.addValueOverride(key, value)
794 overrides.applyTo(config)
795 return TaskDef(taskName=taskName, config=config, taskClass=taskClass, label=label)
797 def __iter__(self) -> Generator[TaskDef, None, None]:
798 return self.toExpandedPipeline()
800 def __getitem__(self, item: str) -> TaskDef:
801 return self._buildTaskDef(item)
803 def __len__(self) -> int:
804 return len(self._pipelineIR.tasks)
806 def __eq__(self, other: object) -> bool:
807 if not isinstance(other, Pipeline):
808 return False
809 elif self._pipelineIR == other._pipelineIR:
810 # Shortcut: if the IR is the same, the expanded pipeline must be
811 # the same as well. But the converse is not true.
812 return True
813 else:
814 self_expanded = {td.label: (td.taskClass,) for td in self}
815 other_expanded = {td.label: (td.taskClass,) for td in other}
816 if self_expanded != other_expanded:
817 return False
818 # After DM-27847, we should compare configuration here, or better,
819 # delegated to TaskDef.__eq__ after making that compare configurations.
820 raise NotImplementedError(
821 "Pipelines cannot be compared because config instances cannot be compared; see DM-27847."
822 )
825@dataclass(frozen=True)
826class TaskDatasetTypes:
827 """An immutable struct that extracts and classifies the dataset types used
828 by a `PipelineTask`
829 """
831 initInputs: NamedValueSet[DatasetType]
832 """Dataset types that are needed as inputs in order to construct this Task.
834 Task-level `initInputs` may be classified as either
835 `~PipelineDatasetTypes.initInputs` or
836 `~PipelineDatasetTypes.initIntermediates` at the Pipeline level.
837 """
839 initOutputs: NamedValueSet[DatasetType]
840 """Dataset types that may be written after constructing this Task.
842 Task-level `initOutputs` may be classified as either
843 `~PipelineDatasetTypes.initOutputs` or
844 `~PipelineDatasetTypes.initIntermediates` at the Pipeline level.
845 """
847 inputs: NamedValueSet[DatasetType]
848 """Dataset types that are regular inputs to this Task.
850 If an input dataset needed for a Quantum cannot be found in the input
851 collection(s) or produced by another Task in the Pipeline, that Quantum
852 (and all dependent Quanta) will not be produced.
854 Task-level `inputs` may be classified as either
855 `~PipelineDatasetTypes.inputs` or `~PipelineDatasetTypes.intermediates`
856 at the Pipeline level.
857 """
859 prerequisites: NamedValueSet[DatasetType]
860 """Dataset types that are prerequisite inputs to this Task.
862 Prerequisite inputs must exist in the input collection(s) before the
863 pipeline is run, but do not constrain the graph - if a prerequisite is
864 missing for a Quantum, `PrerequisiteMissingError` is raised.
866 Prerequisite inputs are not resolved until the second stage of
867 QuantumGraph generation.
868 """
870 outputs: NamedValueSet[DatasetType]
871 """Dataset types that are produced by this Task.
873 Task-level `outputs` may be classified as either
874 `~PipelineDatasetTypes.outputs` or `~PipelineDatasetTypes.intermediates`
875 at the Pipeline level.
876 """
878 @classmethod
879 def fromTaskDef(
880 cls,
881 taskDef: TaskDef,
882 *,
883 registry: Registry,
884 include_configs: bool = True,
885 storage_class_mapping: Optional[Mapping[str, str]] = None,
886 ) -> TaskDatasetTypes:
887 """Extract and classify the dataset types from a single `PipelineTask`.
889 Parameters
890 ----------
891 taskDef: `TaskDef`
892 An instance of a `TaskDef` class for a particular `PipelineTask`.
893 registry: `Registry`
894 Registry used to construct normalized `DatasetType` objects and
895 retrieve those that are incomplete.
896 include_configs : `bool`, optional
897 If `True` (default) include config dataset types as
898 ``initOutputs``.
899 storage_class_mapping : `Mapping` of `str` to `StorageClass`, optional
900 If a taskdef contains a component dataset type that is unknown
901 to the registry, its parent StorageClass will be looked up in this
902 mapping if it is supplied. If the mapping does not contain the
903 composite dataset type, or the mapping is not supplied, an exception
904 will be raised.
906 Returns
907 -------
908 types: `TaskDatasetTypes`
909 The dataset types used by this task.
911 Raises
912 ------
913 ValueError
914 Raised if dataset type connection definition differs from
915 registry definition.
916 LookupError
917 Raised if component parent StorageClass could not be determined
918 and storage_class_mapping does not contain the composite type, or
919 is set to None.
920 """
922 def makeDatasetTypesSet(
923 connectionType: str,
924 is_input: bool,
925 freeze: bool = True,
926 ) -> NamedValueSet[DatasetType]:
927 """Constructs a set of true `DatasetType` objects
929 Parameters
930 ----------
931 connectionType : `str`
932 Name of the connection type to produce a set for; corresponds
933 to an attribute of type `list` on the connection class instance.
934 is_input : `bool`
935 If `True`, these are input dataset types, else they are output
936 dataset types.
937 freeze : `bool`, optional
938 If `True`, call `NamedValueSet.freeze` on the object returned.
940 Returns
941 -------
942 datasetTypes : `NamedValueSet`
943 A set of all datasetTypes which correspond to the input
944 connection type specified in the connection class of this
945 `PipelineTask`
947 Raises
948 ------
949 ValueError
950 Raised if dataset type connection definition differs from
951 registry definition.
952 LookupError
953 Raised if component parent StorageClass could not be determined
954 and storage_class_mapping does not contain the composite type,
955 or is set to None.
957 Notes
958 -----
959 This function is a closure over the variables ``registry``,
960 ``taskDef``, and ``storage_class_mapping``.
961 """
962 datasetTypes = NamedValueSet[DatasetType]()
963 for c in iterConnections(taskDef.connections, connectionType):
964 dimensions = set(getattr(c, "dimensions", set()))
965 if "skypix" in dimensions:
966 try:
967 datasetType = registry.getDatasetType(c.name)
968 except LookupError as err:
969 raise LookupError(
970 f"DatasetType '{c.name}' referenced by "
971 f"{type(taskDef.connections).__name__} uses 'skypix' as a dimension "
972 "placeholder, but does not already exist in the registry. "
973 "Note that reference catalog names are now used as the dataset "
974 "type name instead of 'ref_cat'."
975 ) from err
976 rest1 = set(registry.dimensions.extract(dimensions - set(["skypix"])).names)
977 rest2 = set(
978 dim.name for dim in datasetType.dimensions if not isinstance(dim, SkyPixDimension)
979 )
980 if rest1 != rest2:
981 raise ValueError(
982 f"Non-skypix dimensions for dataset type {c.name} declared in "
983 f"connections ({rest1}) are inconsistent with those in "
984 f"registry's version of this dataset ({rest2})."
985 )
986 else:
987 # Component dataset types are not explicitly in the
988 # registry. This complicates consistency checks with
989 # registry and requires we work out the composite storage
990 # class.
991 registryDatasetType = None
992 try:
993 registryDatasetType = registry.getDatasetType(c.name)
994 except KeyError:
995 compositeName, componentName = DatasetType.splitDatasetTypeName(c.name)
996 if componentName:
997 if storage_class_mapping is None or compositeName not in storage_class_mapping:
998 raise LookupError(
999 "Component parent class cannot be determined, and "
1000 "composite name was not in storage class mapping, or no "
1001 "storage_class_mapping was supplied"
1002 )
1003 else:
1004 parentStorageClass = storage_class_mapping[compositeName]
1005 else:
1006 parentStorageClass = None
1007 datasetType = c.makeDatasetType(
1008 registry.dimensions, parentStorageClass=parentStorageClass
1009 )
1010 registryDatasetType = datasetType
1011 else:
1012 datasetType = c.makeDatasetType(
1013 registry.dimensions, parentStorageClass=registryDatasetType.parentStorageClass
1014 )
1016 if registryDatasetType and datasetType != registryDatasetType:
1017 # The dataset types differ but first check to see if
1018 # they are compatible before raising.
1019 if is_input:
1020 # This DatasetType must be compatible on get.
1021 is_compatible = datasetType.is_compatible_with(registryDatasetType)
1022 else:
1023 # Has to be able to be converted to the expected type
1024 # on put.
1025 is_compatible = registryDatasetType.is_compatible_with(datasetType)
1026 if is_compatible:
1027 # For inputs we want the pipeline to use the
1028 # pipeline definition, for outputs it should use
1029 # the registry definition.
1030 if not is_input:
1031 datasetType = registryDatasetType
1032 _LOG.debug(
1033 "Dataset types differ (task %s != registry %s) but are compatible"
1034 " for %s in %s.",
1035 datasetType,
1036 registryDatasetType,
1037 "input" if is_input else "output",
1038 taskDef.label,
1039 )
1040 else:
1041 try:
1042 # Explicitly check for storage class just to
1043 # make more specific message.
1044 _ = datasetType.storageClass
1045 except KeyError:
1046 raise ValueError(
1047 "Storage class does not exist for supplied dataset type "
1048 f"{datasetType} for {taskDef.label}."
1049 ) from None
1050 raise ValueError(
1051 f"Supplied dataset type ({datasetType}) inconsistent with "
1052 f"registry definition ({registryDatasetType}) "
1053 f"for {taskDef.label}."
1054 )
1055 datasetTypes.add(datasetType)
1056 if freeze:
1057 datasetTypes.freeze()
1058 return datasetTypes
1060 # optionally add initOutput dataset for config
1061 initOutputs = makeDatasetTypesSet("initOutputs", is_input=False, freeze=False)
1062 if include_configs:
1063 initOutputs.add(
1064 DatasetType(
1065 taskDef.configDatasetName,
1066 registry.dimensions.empty,
1067 storageClass="Config",
1068 )
1069 )
1070 initOutputs.freeze()
1072 # optionally add output dataset for metadata
1073 outputs = makeDatasetTypesSet("outputs", is_input=False, freeze=False)
1074 if taskDef.metadataDatasetName is not None:
1075 # Metadata is supposed to be of the TaskMetadata type, its
1076 # dimensions correspond to a task quantum.
1077 dimensions = registry.dimensions.extract(taskDef.connections.dimensions)
1079 # Allow the storage class definition to be read from the existing
1080 # dataset type definition if present.
1081 try:
1082 current = registry.getDatasetType(taskDef.metadataDatasetName)
1083 except KeyError:
1084 # No previous definition so use the default.
1085 storageClass = "TaskMetadata" if _TASK_METADATA_TYPE is TaskMetadata else "PropertySet"
1086 else:
1087 storageClass = current.storageClass.name
1089 outputs.update({DatasetType(taskDef.metadataDatasetName, dimensions, storageClass)})
1090 if taskDef.logOutputDatasetName is not None:
1091 # Log output dimensions correspond to a task quantum.
1092 dimensions = registry.dimensions.extract(taskDef.connections.dimensions)
1093 outputs.update({DatasetType(taskDef.logOutputDatasetName, dimensions, "ButlerLogRecords")})
1095 outputs.freeze()
1097 return cls(
1098 initInputs=makeDatasetTypesSet("initInputs", is_input=True),
1099 initOutputs=initOutputs,
1100 inputs=makeDatasetTypesSet("inputs", is_input=True),
1101 prerequisites=makeDatasetTypesSet("prerequisiteInputs", is_input=True),
1102 outputs=outputs,
1103 )
1106@dataclass(frozen=True)
1107class PipelineDatasetTypes:
1108 """An immutable struct that classifies the dataset types used in a
1109 `Pipeline`.
1110 """
1112 packagesDatasetName: ClassVar[str] = "packages"
1113 """Name of a dataset type used to save package versions.
1114 """
1116 initInputs: NamedValueSet[DatasetType]
1117 """Dataset types that are needed as inputs in order to construct the Tasks
1118 in this Pipeline.
1120 This does not include dataset types that are produced when constructing
1121 other Tasks in the Pipeline (these are classified as `initIntermediates`).
1122 """
1124 initOutputs: NamedValueSet[DatasetType]
1125 """Dataset types that may be written after constructing the Tasks in this
1126 Pipeline.
1128 This does not include dataset types that are also used as inputs when
1129 constructing other Tasks in the Pipeline (these are classified as
1130 `initIntermediates`).
1131 """
1133 initIntermediates: NamedValueSet[DatasetType]
1134 """Dataset types that are both used when constructing one or more Tasks
1135 in the Pipeline and produced as a side-effect of constructing another
1136 Task in the Pipeline.
1137 """
1139 inputs: NamedValueSet[DatasetType]
1140 """Dataset types that are regular inputs for the full pipeline.
1142 If an input dataset needed for a Quantum cannot be found in the input
1143 collection(s), that Quantum (and all dependent Quanta) will not be
1144 produced.
1145 """
1147 prerequisites: NamedValueSet[DatasetType]
1148 """Dataset types that are prerequisite inputs for the full Pipeline.
1150 Prerequisite inputs must exist in the input collection(s) before the
1151 pipeline is run, but do not constrain the graph - if a prerequisite is
1152 missing for a Quantum, `PrerequisiteMissingError` is raised.
1154 Prerequisite inputs are not resolved until the second stage of
1155 QuantumGraph generation.
1156 """
1158 intermediates: NamedValueSet[DatasetType]
1159 """Dataset types that are output by one Task in the Pipeline and consumed
1160 as inputs by one or more other Tasks in the Pipeline.
1161 """
1163 outputs: NamedValueSet[DatasetType]
1164 """Dataset types that are output by a Task in the Pipeline and not consumed
1165 by any other Task in the Pipeline.
1166 """
1168 byTask: Mapping[str, TaskDatasetTypes]
1169 """Per-Task dataset types, keyed by label in the `Pipeline`.
1171 This is guaranteed to be zip-iterable with the `Pipeline` itself (assuming
1172 neither has been modified since the dataset types were extracted, of
1173 course).
1174 """
1176 @classmethod
1177 def fromPipeline(
1178 cls,
1179 pipeline: Union[Pipeline, Iterable[TaskDef]],
1180 *,
1181 registry: Registry,
1182 include_configs: bool = True,
1183 include_packages: bool = True,
1184 ) -> PipelineDatasetTypes:
1185 """Extract and classify the dataset types from all tasks in a
1186 `Pipeline`.
1188 Parameters
1189 ----------
1190 pipeline: `Pipeline` or `Iterable` [ `TaskDef` ]
1191 A collection of tasks that can be run together.
1192 registry: `Registry`
1193 Registry used to construct normalized `DatasetType` objects and
1194 retrieve those that are incomplete.
1195 include_configs : `bool`, optional
1196 If `True` (default) include config dataset types as
1197 ``initOutputs``.
1198 include_packages : `bool`, optional
1199 If `True` (default) include the dataset type for software package
1200 versions in ``initOutputs``.
1202 Returns
1203 -------
1204 types: `PipelineDatasetTypes`
1205 The dataset types used by this `Pipeline`.
1207 Raises
1208 ------
1209 ValueError
1210 Raised if Tasks are inconsistent about which datasets are marked
1211 prerequisite. This indicates that the Tasks cannot be run as part
1212 of the same `Pipeline`.
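Examples
--------
A sketch assuming ``pipeline`` is a `Pipeline` and ``butler`` is an
initialized `lsst.daf.butler.Butler` (both placeholders):

>>> dataset_types = PipelineDatasetTypes.fromPipeline(
...     pipeline, registry=butler.registry
... )
>>> output_names = set(dataset_types.outputs.names)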
1213 """
1214 allInputs = NamedValueSet[DatasetType]()
1215 allOutputs = NamedValueSet[DatasetType]()
1216 allInitInputs = NamedValueSet[DatasetType]()
1217 allInitOutputs = NamedValueSet[DatasetType]()
1218 prerequisites = NamedValueSet[DatasetType]()
1219 byTask = dict()
1220 if include_packages:
1221 allInitOutputs.add(
1222 DatasetType(
1223 cls.packagesDatasetName,
1224 registry.dimensions.empty,
1225 storageClass="Packages",
1226 )
1227 )
1228 # create a list of TaskDefs in case the input is a generator
1229 pipeline = list(pipeline)
1231 # collect all the output dataset types
1232 typeStorageclassMap: Dict[str, str] = {}
1233 for taskDef in pipeline:
1234 for outConnection in iterConnections(taskDef.connections, "outputs"):
1235 typeStorageclassMap[outConnection.name] = outConnection.storageClass
1237 for taskDef in pipeline:
1238 thisTask = TaskDatasetTypes.fromTaskDef(
1239 taskDef,
1240 registry=registry,
1241 include_configs=include_configs,
1242 storage_class_mapping=typeStorageclassMap,
1243 )
1244 allInitInputs.update(thisTask.initInputs)
1245 allInitOutputs.update(thisTask.initOutputs)
1246 allInputs.update(thisTask.inputs)
1247 prerequisites.update(thisTask.prerequisites)
1248 allOutputs.update(thisTask.outputs)
1249 byTask[taskDef.label] = thisTask
1250 if not prerequisites.isdisjoint(allInputs):
1251 raise ValueError(
1252 "{} marked as both prerequisites and regular inputs".format(
1253 {dt.name for dt in allInputs & prerequisites}
1254 )
1255 )
1256 if not prerequisites.isdisjoint(allOutputs):
1257 raise ValueError(
1258 "{} marked as both prerequisites and outputs".format(
1259 {dt.name for dt in allOutputs & prerequisites}
1260 )
1261 )
1262 # Make sure that components which are marked as inputs get treated as
1263 # intermediates if there is an output which produces the composite
1264 # containing the component
1265 intermediateComponents = NamedValueSet[DatasetType]()
1266 intermediateComposites = NamedValueSet[DatasetType]()
1267 outputNameMapping = {dsType.name: dsType for dsType in allOutputs}
1268 for dsType in allInputs:
1269 # get the name of a possible component
1270 name, component = dsType.nameAndComponent()
1271 # if there is a component name, that means this is a component
1272 # DatasetType; if there is an output which produces the parent of
1273 # this component, treat this input as an intermediate
1274 if component is not None:
1275 # This needs to be in this if block, because someone might have
1276 # a composite that is a pure input from existing data
1277 if name in outputNameMapping:
1278 intermediateComponents.add(dsType)
1279 intermediateComposites.add(outputNameMapping[name])
1281 def checkConsistency(a: NamedValueSet, b: NamedValueSet) -> None:
1282 common = a.names & b.names
1283 for name in common:
1284 # Any compatibility is allowed. This function does not know
1285 # if a dataset type is to be used for input or output.
1286 if not (a[name].is_compatible_with(b[name]) or b[name].is_compatible_with(a[name])):
1287 raise ValueError(f"Conflicting definitions for dataset type: {a[name]} != {b[name]}.")
1289 checkConsistency(allInitInputs, allInitOutputs)
1290 checkConsistency(allInputs, allOutputs)
1291 checkConsistency(allInputs, intermediateComposites)
1292 checkConsistency(allOutputs, intermediateComposites)
1294 def frozen(s: AbstractSet[DatasetType]) -> NamedValueSet[DatasetType]:
1295 assert isinstance(s, NamedValueSet)
1296 s.freeze()
1297 return s
1299 return cls(
1300 initInputs=frozen(allInitInputs - allInitOutputs),
1301 initIntermediates=frozen(allInitInputs & allInitOutputs),
1302 initOutputs=frozen(allInitOutputs - allInitInputs),
1303 inputs=frozen(allInputs - allOutputs - intermediateComponents),
1304 # If there are storage class differences in inputs and outputs
1305 # the intermediates have to choose a priority. Here we choose that
1306 # inputs to tasks must match the requested storage class, by
1307 # applying the inputs over the top of the outputs.
1308 intermediates=frozen(allOutputs & allInputs | intermediateComponents),
1309 outputs=frozen(allOutputs - allInputs - intermediateComposites),
1310 prerequisites=frozen(prerequisites),
1311 byTask=MappingProxyType(byTask), # MappingProxyType -> frozen view of dict for immutability
1312 )
1314 @classmethod
1315 def initOutputNames(
1316 cls,
1317 pipeline: Union[Pipeline, Iterable[TaskDef]],
1318 *,
1319 include_configs: bool = True,
1320 include_packages: bool = True,
1321 ) -> Iterator[str]:
1322 """Return the names of dataset types ot task initOutputs, Configs,
1323 and package versions for a pipeline.
1325 Parameters
1326 ----------
1327 pipeline: `Pipeline` or `Iterable` [ `TaskDef` ]
1328 A `Pipeline` instance or collection of `TaskDef` instances.
1329 include_configs : `bool`, optional
1330 If `True` (default) include config dataset types.
1331 include_packages : `bool`, optional
1332 If `True` (default) include the dataset type for package versions.
1334 Yields
1335 ------
1336 datasetTypeName : `str`
1337 Name of the dataset type.
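Examples
--------
A sketch assuming ``pipeline`` is an existing `Pipeline` (placeholder):

>>> names = set(PipelineDatasetTypes.initOutputNames(pipeline))
>>> "packages" in names
True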
1338 """
1339 if include_packages:
1340 # Package versions dataset type
1341 yield cls.packagesDatasetName
1343 if isinstance(pipeline, Pipeline):
1344 pipeline = pipeline.toExpandedPipeline()
1346 for taskDef in pipeline:
1347 # all task InitOutputs
1348 for name in taskDef.connections.initOutputs:
1349 attribute = getattr(taskDef.connections, name)
1350 yield attribute.name
1352 # config dataset name
1353 if include_configs:
1354 yield taskDef.configDatasetName