Coverage for python/lsst/pipe/base/pipeline.py: 18%
1# This file is part of pipe_base.
2#
3# Developed for the LSST Data Management System.
4# This product includes software developed by the LSST Project
5# (http://www.lsst.org).
6# See the COPYRIGHT file at the top-level directory of this distribution
7# for details of code ownership.
8#
9# This program is free software: you can redistribute it and/or modify
10# it under the terms of the GNU General Public License as published by
11# the Free Software Foundation, either version 3 of the License, or
12# (at your option) any later version.
13#
14# This program is distributed in the hope that it will be useful,
15# but WITHOUT ANY WARRANTY; without even the implied warranty of
16# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
17# GNU General Public License for more details.
18#
19# You should have received a copy of the GNU General Public License
20# along with this program. If not, see <http://www.gnu.org/licenses/>.
21from __future__ import annotations
23"""Module defining Pipeline class and related methods.
24"""
26__all__ = ["Pipeline", "TaskDef", "TaskDatasetTypes", "PipelineDatasetTypes", "LabelSpecifier"]
28import copy
29import logging
30import os
31import re
32import urllib.parse
33import warnings
35# -------------------------------
36# Imports of standard modules --
37# -------------------------------
38from dataclasses import dataclass
39from types import MappingProxyType
40from typing import (
41 TYPE_CHECKING,
42 AbstractSet,
43 ClassVar,
44 Dict,
45 Generator,
46 Iterable,
47 Iterator,
48 Mapping,
49 Optional,
50 Set,
51 Tuple,
52 Type,
53 Union,
54)
56# -----------------------------
57# Imports for other modules --
58from lsst.daf.butler import DatasetType, NamedValueSet, Registry, SkyPixDimension
59from lsst.resources import ResourcePath, ResourcePathExpression
60from lsst.utils import doImportType
61from lsst.utils.introspection import get_full_type_name
63from . import pipelineIR, pipeTools
64from ._task_metadata import TaskMetadata
65from .configOverrides import ConfigOverrides
66from .connections import iterConnections
67from .pipelineTask import PipelineTask
68from .task import _TASK_METADATA_TYPE
70if TYPE_CHECKING:  # Imports needed only for type annotations; may be circular.
71 from lsst.obs.base import Instrument
72 from lsst.pex.config import Config
74# ----------------------------------
75# Local non-exported definitions --
76# ----------------------------------
78_LOG = logging.getLogger(__name__)
80# ------------------------
81# Exported definitions --
82# ------------------------
85@dataclass
86class LabelSpecifier:
87 """A structure to specify a subset of labels to load
89 This structure may contain a set of labels to be used in subsetting a
90 pipeline, or a beginning and end point. Beginning or end may be empty,
91 in which case the range will be a half open interval. Unlike python
92 iteration bounds, end bounds are *INCLUDED*. Note that range based
93 selection is not well defined for pipelines that are not linear in nature,
94 and correct behavior is not guaranteed, or may vary from run to run.
95 """
97 labels: Optional[Set[str]] = None
98 begin: Optional[str] = None
99 end: Optional[str] = None
101 def __post_init__(self) -> None:
102 if self.labels is not None and (self.begin or self.end):
103 raise ValueError(
104 "This struct can only be initialized with a labels set or a begin (and/or) end specifier"
105 )
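# Editor-added illustrative sketch (not part of the original module): the two
# mutually exclusive ways a LabelSpecifier may be populated.  The task labels
# used here ("isr", "calibrate") are hypothetical.
def _example_label_specifier() -> LabelSpecifier:
    # An explicit set of labels to keep.
    explicit = LabelSpecifier(labels={"isr", "calibrate"})
    # An inclusive range of labels; either bound may be omitted to leave that
    # end of the interval open.  Supplying both ``labels`` and a bound raises
    # ValueError in __post_init__.
    ranged = LabelSpecifier(begin="isr", end="calibrate")
    return explicit if explicit.labels else ranged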
108class TaskDef:
109 """TaskDef is a collection of information about task needed by Pipeline.
111 The information includes task name, configuration object and optional
112 task class. This class is just a collection of attributes and it exposes
113 all of them so that attributes could potentially be modified in place
114 (e.g. if configuration needs extra overrides).
116 Attributes
117 ----------
118 taskName : `str`, optional
119 `PipelineTask` class name, currently it is not specified whether this
120 is a fully-qualified name or partial name (e.g. ``module.TaskClass``).
121 Framework should be prepared to handle all cases. If not provided,
122 ``taskClass`` must be, and ``taskClass.__name__`` is used.
123 config : `lsst.pex.config.Config`, optional
124 Instance of the configuration class corresponding to this task class,
125 usually with all overrides applied. This config will be frozen. If
126 not provided, ``taskClass`` must be provided and
127 ``taskClass.ConfigClass()`` will be used.
128 taskClass : `type`, optional
129 `PipelineTask` class object, can be ``None``. If ``None`` then
130 the framework will have to locate and load the class.
131 label : `str`, optional
132 Task label, usually a short string unique in a pipeline. If not
133 provided, ``taskClass`` must be, and ``taskClass._DefaultName`` will
134 be used.
135 """
137 def __init__(
138 self,
139 taskName: Optional[str] = None,
140 config: Optional[Config] = None,
141 taskClass: Optional[Type[PipelineTask]] = None,
142 label: Optional[str] = None,
143 ):
144 if taskName is None:
145 if taskClass is None:
146 raise ValueError("At least one of `taskName` and `taskClass` must be provided.")
147 taskName = taskClass.__name__
148 if config is None:
149 if taskClass is None:
150 raise ValueError("`taskClass` must be provided if `config` is not.")
151 config = taskClass.ConfigClass()
152 if label is None:
153 if taskClass is None:
154 raise ValueError("`taskClass` must be provided if `label` is not.")
155 label = taskClass._DefaultName
156 self.taskName = taskName
157 try:
158 config.validate()
159 except Exception:
160 _LOG.error("Configuration validation failed for task %s (%s)", label, taskName)
161 raise
162 config.freeze()
163 self.config = config
164 self.taskClass = taskClass
165 self.label = label
166 self.connections = config.connections.ConnectionsClass(config=config)
168 @property
169 def configDatasetName(self) -> str:
170 """Name of a dataset type for configuration of this task (`str`)"""
171 return self.label + "_config"
173 @property
174 def metadataDatasetName(self) -> Optional[str]:
175 """Name of a dataset type for metadata of this task, `None` if
176 metadata is not to be saved (`str`)
177 """
178 if self.config.saveMetadata:
179 return self.makeMetadataDatasetName(self.label)
180 else:
181 return None
183 @classmethod
184 def makeMetadataDatasetName(cls, label: str) -> str:
185 """Construct the name of the dataset type for metadata for a task.
187 Parameters
188 ----------
189 label : `str`
190 Label for the task within its pipeline.
192 Returns
193 -------
194 name : `str`
195 Name of the task's metadata dataset type.
196 """
197 return f"{label}_metadata"
199 @property
200 def logOutputDatasetName(self) -> Optional[str]:
201 """Name of a dataset type for log output from this task, `None` if
202 logs are not to be saved (`str`)
203 """
204 if self.config.saveLogOutput:
205 return self.label + "_log"
206 else:
207 return None
209 def __str__(self) -> str:
210 rep = "TaskDef(" + self.taskName
211 if self.label:
212 rep += ", label=" + self.label
213 rep += ")"
214 return rep
216 def __eq__(self, other: object) -> bool:
217 if not isinstance(other, TaskDef):
218 return False
219 # This does not consider equality of configs when determining equality
220 # as config equality is a difficult thing to define. Should be updated
221 # after DM-27847
222 return self.taskClass == other.taskClass and self.label == other.label
224 def __hash__(self) -> int:
225 return hash((self.taskClass, self.label))
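# Editor-added illustrative sketch (not part of the original module): building
# a TaskDef directly from a PipelineTask subclass.  In normal use TaskDefs are
# produced by Pipeline.toExpandedPipeline() rather than constructed by hand.
def _example_task_def(task_class: Type[PipelineTask]) -> TaskDef:
    # When only ``taskClass`` is given, ``taskName``, ``config`` and ``label``
    # default to the class name, a default-constructed ConfigClass instance
    # (validated and frozen), and the class's _DefaultName respectively.
    return TaskDef(taskClass=task_class)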
228class Pipeline:
229 """A `Pipeline` is a representation of a series of tasks to run, and the
230 configuration for those tasks.
232 Parameters
233 ----------
234 description : `str`
235 A description of what this pipeline does.
236 """
238 def __init__(self, description: str):
239 pipeline_dict = {"description": description, "tasks": {}}
240 self._pipelineIR = pipelineIR.PipelineIR(pipeline_dict)
242 @classmethod
243 def fromFile(cls, filename: str) -> Pipeline:
244 """Load a pipeline defined in a pipeline yaml file.
246 Parameters
247 ----------
248 filename: `str`
249 A path that points to a pipeline defined in yaml format. This
250 filename may also supply additional labels to be used in
251 subsetting the loaded Pipeline. These labels are separated from
252 the path by a \\#, and may be specified as a comma separated
253 list, or a range denoted as beginning..end. Beginning or end may
254 be empty, in which case the range will be a half open interval.
255 Unlike python iteration bounds, end bounds are *INCLUDED*. Note
256 that range based selection is not well defined for pipelines that
257 are not linear in nature, and correct behavior is not guaranteed,
258 or may vary from run to run.
260 Returns
261 -------
262 pipeline: `Pipeline`
263 The pipeline loaded from the specified location with appropriate
264 (if any) subsetting.
266 Notes
267 -----
268 This method attempts to prune any contracts that contain labels which
269 are not in the declared subset of labels. This pruning is done using
270 string-based matching due to the nature of contracts and may prune more
271 than it should.
272 """
273 return cls.from_uri(filename)
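    # Editor-added example (sketch): the filename may carry a label subset
    # after a '#'.  The file name and task labels below are hypothetical.
    #
    #     Pipeline.fromFile("DRP.yaml")                 # the whole pipeline
    #     Pipeline.fromFile("DRP.yaml#isr,calibrate")   # an explicit label set
    #     Pipeline.fromFile("DRP.yaml#isr..calibrate")  # an inclusive range
    #     Pipeline.fromFile("DRP.yaml#..calibrate")     # a half-open range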
275 @classmethod
276 def from_uri(cls, uri: ResourcePathExpression) -> Pipeline:
277 """Load a pipeline defined in a pipeline yaml file at a location
278 specified by a URI.
280 Parameters
281 ----------
282 uri: convertible to `ResourcePath`
283 If a string is supplied this should be a URI path that points to a
284 pipeline defined in yaml format, either as a direct path to the
285 yaml file, or as a directory containing a "pipeline.yaml" file (the
286 form used by `write_to_uri` with ``expand=True``). This uri may
287 also supply additional labels to be used in subsetting the loaded
288 Pipeline. These labels are separated from the path by a \\#, and
289 may be specified as a comma separated list, or a range denoted as
290 beginning..end. Beginning or end may be empty, in which case the
291 range will be a half open interval. Unlike python iteration bounds,
292 end bounds are *INCLUDED*. Note that range based selection is not
293 well defined for pipelines that are not linear in nature, and
294 correct behavior is not guaranteed, or may vary from run to run.
295 The same specifiers can be used with a `ResourcePath` object, by
296 being the sole contents in the fragments attribute.
298 Returns
299 -------
300 pipeline: `Pipeline`
301 The pipeline loaded from the specified location with appropriate
302 (if any) subsetting.
304 Notes
305 -----
306 This method attempts to prune any contracts that contain labels which
307 are not in the declared subset of labels. This pruning is done using
308 string-based matching due to the nature of contracts and may prune more
309 than it should.
310 """
311 # Split up the uri and any labels that were supplied
312 uri, label_specifier = cls._parse_file_specifier(uri)
313 pipeline: Pipeline = cls.fromIR(pipelineIR.PipelineIR.from_uri(uri))
315 # If there are labels supplied, only keep those
316 if label_specifier is not None:
317 pipeline = pipeline.subsetFromLabels(label_specifier)
318 return pipeline
320 def subsetFromLabels(self, labelSpecifier: LabelSpecifier) -> Pipeline:
321 """Subset a pipeline to contain only labels specified in labelSpecifier
323 Parameters
324 ----------
325 labelSpecifier : `LabelSpecifier`
326 Object containing labels that describes how to subset a pipeline.
328 Returns
329 -------
330 pipeline : `Pipeline`
331 A new pipeline object that is a subset of the old pipeline
333 Raises
334 ------
335 ValueError
336 Raised if there is an issue with specified labels
338 Notes
339 -----
340 This method attempts to prune any contracts that contain labels which
341 are not in the declared subset of labels. This pruning is done using
342 string-based matching due to the nature of contracts and may prune more
343 than it should.
344 """
345 # Labels supplied as a set
346 if labelSpecifier.labels:
347 labelSet = labelSpecifier.labels
348 # Labels supplied as a range, first create a list of all the labels
349 # in the pipeline sorted according to task dependency. Then only
350 # keep labels that lie between the supplied bounds
351 else:
352 # Create a copy of the pipeline to use when assessing the label
353 # ordering. Use a dict for fast searching while preserving order.
354 # Remove contracts so they do not fail in the expansion step. This
355 # is needed because a user may only configure the tasks they intend
356 # to run, which may cause some contracts to fail if they will later
357 # be dropped
358 pipeline = copy.deepcopy(self)
359 pipeline._pipelineIR.contracts = []
360 labels = {taskdef.label: True for taskdef in pipeline.toExpandedPipeline()}
362 # Verify the bounds are in the labels
363 if labelSpecifier.begin is not None:
364 if labelSpecifier.begin not in labels:
365 raise ValueError(
366 f"Beginning of range subset, {labelSpecifier.begin}, not found in "
367 "pipeline definition"
368 )
369 if labelSpecifier.end is not None:
370 if labelSpecifier.end not in labels:
371 raise ValueError(
372 f"End of range subset, {labelSpecifier.end}, not found in pipeline definition"
373 )
375 labelSet = set()
376 for label in labels:
377 if labelSpecifier.begin is not None:
378 if label != labelSpecifier.begin:
379 continue
380 else:
381 labelSpecifier.begin = None
382 labelSet.add(label)
383 if labelSpecifier.end is not None and label == labelSpecifier.end:
384 break
385 return Pipeline.fromIR(self._pipelineIR.subset_from_labels(labelSet))
387 @staticmethod
388 def _parse_file_specifier(uri: ResourcePathExpression) -> Tuple[ResourcePath, Optional[LabelSpecifier]]:
389 """Split appart a uri and any possible label subsets"""
390 if isinstance(uri, str):
391 # This is to support legacy pipelines during transition
392 uri, num_replace = re.subn("[:](?!\\/\\/)", "#", uri)
393 if num_replace:
394 warnings.warn(
395 f"The pipeline file {uri} seems to use the legacy : to separate "
396 "labels, this is deprecated and will be removed after June 2021, please use "
397 "# instead.",
398 category=FutureWarning,
399 )
400 if uri.count("#") > 1:
401 raise ValueError("Only one set of labels is allowed when specifying a pipeline to load")
402 # Everything else can be converted directly to ResourcePath.
403 uri = ResourcePath(uri)
404 label_subset = uri.fragment or None
406 specifier: Optional[LabelSpecifier]
407 if label_subset is not None:
408 label_subset = urllib.parse.unquote(label_subset)
409 args: Dict[str, Union[Set[str], str, None]]
410 # labels supplied as a list
411 if "," in label_subset:
412 if ".." in label_subset:
413 raise ValueError(
414 "Can only specify a list of labels or a rangewhen loading a Pipline not both"
415 )
416 args = {"labels": set(label_subset.split(","))}
417 # labels supplied as a range
418 elif ".." in label_subset:
419 # Try to de-structure the labelSubset, this will fail if more
420 # than one range is specified
421 begin, end, *rest = label_subset.split("..")
422 if rest:
423 raise ValueError("Only one range can be specified when loading a pipeline")
424 args = {"begin": begin if begin else None, "end": end if end else None}
425 # Assume anything else is a single label
426 else:
427 args = {"labels": {label_subset}}
429 # MyPy doesn't like how cavalier kwarg construction is with types.
430 specifier = LabelSpecifier(**args) # type: ignore
431 else:
432 specifier = None
434 return uri, specifier
436 @classmethod
437 def fromString(cls, pipeline_string: str) -> Pipeline:
438 """Create a pipeline from string formatted as a pipeline document.
440 Parameters
441 ----------
442 pipeline_string : `str`
443 A string that is formatted like a pipeline document.
445 Returns
446 -------
447 pipeline: `Pipeline`
448 """
449 pipeline = cls.fromIR(pipelineIR.PipelineIR.from_string(pipeline_string))
450 return pipeline
452 @classmethod
453 def fromIR(cls, deserialized_pipeline: pipelineIR.PipelineIR) -> Pipeline:
454 """Create a pipeline from an already created `PipelineIR` object.
456 Parameters
457 ----------
458 deserialized_pipeline: `PipelineIR`
459 An already created pipeline intermediate representation object
461 Returns
462 -------
463 pipeline: `Pipeline`
464 """
465 pipeline = cls.__new__(cls)
466 pipeline._pipelineIR = deserialized_pipeline
467 return pipeline
469 @classmethod
470 def fromPipeline(cls, pipeline: Pipeline) -> Pipeline:
471 """Create a new pipeline by copying an already existing `Pipeline`.
473 Parameters
474 ----------
475 pipeline: `Pipeline`
476 An existing `Pipeline` object to copy.
478 Returns
479 -------
480 pipeline: `Pipeline`
481 """
482 return cls.fromIR(copy.deepcopy(pipeline._pipelineIR))
484 def __str__(self) -> str:
485 # tasks need to be sorted on each call because someone might have added
486 # or removed a task, and caching does not seem worth the small
487 # overhead
488 labels = [td.label for td in self._toExpandedPipelineImpl(checkContracts=False)]
489 self._pipelineIR.reorder_tasks(labels)
490 return str(self._pipelineIR)
492 def addInstrument(self, instrument: Union[Instrument, str]) -> None:
493 """Add an instrument to the pipeline, or replace an instrument that is
494 already defined.
496 Parameters
497 ----------
498 instrument : `~lsst.obs.base.Instrument` or `str`
499 Either a subclass of `lsst.obs.base.Instrument` or a string
500 corresponding to a fully qualified `lsst.obs.base.Instrument`
501 name.
502 """
503 if isinstance(instrument, str):
504 pass
505 else:
506 # TODO: assume that this is a subclass of Instrument, no type
507 # checking
508 instrument = get_full_type_name(instrument)
509 self._pipelineIR.instrument = instrument
511 def getInstrument(self) -> Optional[str]:
512 """Get the instrument from the pipeline.
514 Returns
515 -------
516 instrument : `str` or `None`
517 The fully qualified name of a `lsst.obs.base.Instrument` subclass,
518 or `None` if the pipeline does not have an instrument.
519 """
520 return self._pipelineIR.instrument
522 def addTask(self, task: Union[Type[PipelineTask], str], label: str) -> None:
523 """Add a new task to the pipeline, or replace a task that is already
524 associated with the supplied label.
526 Parameters
527 ----------
528 task: `PipelineTask` or `str`
529 Either a derived class object of a `PipelineTask` or a string
530 corresponding to a fully qualified `PipelineTask` name.
531 label: `str`
532 A label that is used to identify the `PipelineTask` being added
533 """
534 if isinstance(task, str):
535 taskName = task
536 elif issubclass(task, PipelineTask):
537 taskName = get_full_type_name(task)
538 else:
539 raise ValueError(
540 "task must be either a child class of PipelineTask or a string containing"
541 " a fully qualified name to one"
542 )
543 if not label:
544 # in some cases (with a command line-generated pipeline) tasks can
545 # be defined without a label, which is not acceptable; use the task's
546 # _DefaultName in that case
547 if isinstance(task, str):
548 task_class = doImportType(task)
549 label = task_class._DefaultName
550 self._pipelineIR.tasks[label] = pipelineIR.TaskIR(label, taskName)
552 def removeTask(self, label: str) -> None:
553 """Remove a task from the pipeline.
555 Parameters
556 ----------
557 label : `str`
558 The label used to identify the task that is to be removed
560 Raises
561 ------
562 KeyError
563 If no task with that label exists in the pipeline
565 """
566 self._pipelineIR.tasks.pop(label)
568 def addConfigOverride(self, label: str, key: str, value: object) -> None:
569 """Apply single config override.
571 Parameters
572 ----------
573 label : `str`
574 Label of the task.
575 key: `str`
576 Fully-qualified field name.
577 value : object
578 Value to be given to a field.
579 """
580 self._addConfigImpl(label, pipelineIR.ConfigIR(rest={key: value}))
582 def addConfigFile(self, label: str, filename: str) -> None:
583 """Add overrides from a specified file.
585 Parameters
586 ----------
587 label : `str`
588 The label used to identify the task associated with config to
589 modify
590 filename : `str`
591 Path to the override file.
592 """
593 self._addConfigImpl(label, pipelineIR.ConfigIR(file=[filename]))
595 def addConfigPython(self, label: str, pythonString: str) -> None:
596 """Add Overrides by running a snippet of python code against a config.
598 Parameters
599 ----------
600 label : `str`
601 The label used to identify the task associated with config to
602 modify.
603 pythonString: `str`
604 A string which is valid python code to be executed. This is done
605 with config as the only local accessible value.
606 """
607 self._addConfigImpl(label, pipelineIR.ConfigIR(python=pythonString))
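    # Editor-added example (sketch): the three override mechanisms side by
    # side.  The label "isr" and the config field names are hypothetical.
    #
    #     pipeline.addConfigOverride("isr", "doWrite", False)
    #     pipeline.addConfigFile("isr", "/path/to/overrides.py")
    #     pipeline.addConfigPython("isr", "config.doWrite = False")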
609 def _addConfigImpl(self, label: str, newConfig: pipelineIR.ConfigIR) -> None:
610 if label == "parameters":
611 if newConfig.rest.keys() - self._pipelineIR.parameters.mapping.keys():
612 raise ValueError("Cannot override parameters that are not defined in pipeline")
613 self._pipelineIR.parameters.mapping.update(newConfig.rest)
614 if newConfig.file:
615 raise ValueError("Setting parameters section with config file is not supported")
616 if newConfig.python:
617 raise ValueError("Setting parameters section using python block is unsupported")
618 return
619 if label not in self._pipelineIR.tasks:
620 raise LookupError(f"There are no tasks labeled '{label}' in the pipeline")
621 self._pipelineIR.tasks[label].add_or_update_config(newConfig)
623 def toFile(self, filename: str) -> None:
624 self._pipelineIR.to_file(filename)
626 def write_to_uri(self, uri: ResourcePathExpression) -> None:
627 """Write the pipeline to a file or directory.
629 Parameters
630 ----------
631 uri : convertible to `ResourcePath`
632 URI to write to; may have any scheme with `ResourcePath` write
633 support or no scheme for a local file/directory. Should have a
634 ``.yaml`` extension.
635 """
636 labels = [td.label for td in self._toExpandedPipelineImpl(checkContracts=False)]
637 self._pipelineIR.reorder_tasks(labels)
638 self._pipelineIR.write_to_uri(uri)
640 def toExpandedPipeline(self) -> Generator[TaskDef, None, None]:
641 """Returns a generator of TaskDefs which can be used to create quantum
642 graphs.
644 Returns
645 -------
646 generator : generator of `TaskDef`
647 The generator returned will be the sorted iterator of tasks which
648 are to be used in constructing a quantum graph.
650 Raises
651 ------
652 NotImplementedError
653 If a dataId is supplied in a config block. This is in place for
654 future use
655 """
656 yield from self._toExpandedPipelineImpl()
658 def _toExpandedPipelineImpl(self, checkContracts: bool = True) -> Iterable[TaskDef]:
659 taskDefs = []
660 for label in self._pipelineIR.tasks:
661 taskDefs.append(self._buildTaskDef(label))
663 # let's evaluate the contracts
664 if self._pipelineIR.contracts is not None:
665 label_to_config = {x.label: x.config for x in taskDefs}
666 for contract in self._pipelineIR.contracts:
667 # execute this in its own line so it can raise a good error
668 # message if there were problems with the eval
669 success = eval(contract.contract, None, label_to_config)
670 if not success:
671 extra_info = f": {contract.msg}" if contract.msg is not None else ""
672 raise pipelineIR.ContractError(
673 f"Contract(s) '{contract.contract}' were not satisfied{extra_info}"
674 )
676 taskDefs = sorted(taskDefs, key=lambda x: x.label)
677 yield from pipeTools.orderPipeline(taskDefs)
679 def _buildTaskDef(self, label: str) -> TaskDef:
680 if (taskIR := self._pipelineIR.tasks.get(label)) is None:
681 raise NameError(f"Label {label} does not appear in this pipeline")
682 taskClass: Type[PipelineTask] = doImportType(taskIR.klass)
683 taskName = taskClass.__qualname__
684 config = taskClass.ConfigClass()
685 overrides = ConfigOverrides()
686 if self._pipelineIR.instrument is not None:
687 overrides.addInstrumentOverride(self._pipelineIR.instrument, taskClass._DefaultName)
688 if taskIR.config is not None:
689 for configIR in (configIr.formatted(self._pipelineIR.parameters) for configIr in taskIR.config):
690 if configIR.dataId is not None:
691 raise NotImplementedError(
692 "Specializing a config on a partial data id is not yet "
693 "supported in Pipeline definition"
694 )
695 # only apply override if it applies to everything
696 if configIR.dataId is None:
697 if configIR.file:
698 for configFile in configIR.file:
699 overrides.addFileOverride(os.path.expandvars(configFile))
700 if configIR.python is not None:
701 overrides.addPythonOverride(configIR.python)
702 for key, value in configIR.rest.items():
703 overrides.addValueOverride(key, value)
704 overrides.applyTo(config)
705 return TaskDef(taskName=taskName, config=config, taskClass=taskClass, label=label)
707 def __iter__(self) -> Generator[TaskDef, None, None]:
708 return self.toExpandedPipeline()
710 def __getitem__(self, item: str) -> TaskDef:
711 return self._buildTaskDef(item)
713 def __len__(self) -> int:
714 return len(self._pipelineIR.tasks)
716 def __eq__(self, other: object) -> bool:
717 if not isinstance(other, Pipeline):
718 return False
719 return self._pipelineIR == other._pipelineIR
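# Editor-added illustrative sketch (not part of the original module): building
# a small pipeline programmatically.  The task class path, label, and config
# field used here are hypothetical.
def _example_build_pipeline() -> Pipeline:
    pipeline = Pipeline("A minimal example pipeline")
    pipeline.addTask("lsst.example.tasks.ExampleTask", "example")
    pipeline.addConfigOverride("example", "someField", 42)
    # Iterating over the pipeline (or calling toExpandedPipeline()) would
    # import the task class, apply the overrides and yield sorted TaskDefs.
    return pipeline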
722@dataclass(frozen=True)
723class TaskDatasetTypes:
724 """An immutable struct that extracts and classifies the dataset types used
725 by a `PipelineTask`
726 """
728 initInputs: NamedValueSet[DatasetType]
729 """Dataset types that are needed as inputs in order to construct this Task.
731 Task-level `initInputs` may be classified as either
732 `~PipelineDatasetTypes.initInputs` or
733 `~PipelineDatasetTypes.initIntermediates` at the Pipeline level.
734 """
736 initOutputs: NamedValueSet[DatasetType]
737 """Dataset types that may be written after constructing this Task.
739 Task-level `initOutputs` may be classified as either
740 `~PipelineDatasetTypes.initOutputs` or
741 `~PipelineDatasetTypes.initIntermediates` at the Pipeline level.
742 """
744 inputs: NamedValueSet[DatasetType]
745 """Dataset types that are regular inputs to this Task.
747 If an input dataset needed for a Quantum cannot be found in the input
748 collection(s) or produced by another Task in the Pipeline, that Quantum
749 (and all dependent Quanta) will not be produced.
751 Task-level `inputs` may be classified as either
752 `~PipelineDatasetTypes.inputs` or `~PipelineDatasetTypes.intermediates`
753 at the Pipeline level.
754 """
756 prerequisites: NamedValueSet[DatasetType]
757 """Dataset types that are prerequisite inputs to this Task.
759 Prerequisite inputs must exist in the input collection(s) before the
760 pipeline is run, but do not constrain the graph - if a prerequisite is
761 missing for a Quantum, `PrerequisiteMissingError` is raised.
763 Prerequisite inputs are not resolved until the second stage of
764 QuantumGraph generation.
765 """
767 outputs: NamedValueSet[DatasetType]
768 """Dataset types that are produced by this Task.
770 Task-level `outputs` may be classified as either
771 `~PipelineDatasetTypes.outputs` or `~PipelineDatasetTypes.intermediates`
772 at the Pipeline level.
773 """
775 @classmethod
776 def fromTaskDef(
777 cls,
778 taskDef: TaskDef,
779 *,
780 registry: Registry,
781 include_configs: bool = True,
782 storage_class_mapping: Optional[Mapping[str, str]] = None,
783 ) -> TaskDatasetTypes:
784 """Extract and classify the dataset types from a single `PipelineTask`.
786 Parameters
787 ----------
788 taskDef: `TaskDef`
789 An instance of a `TaskDef` class for a particular `PipelineTask`.
790 registry: `Registry`
791 Registry used to construct normalized `DatasetType` objects and
792 retrieve those that are incomplete.
793 include_configs : `bool`, optional
794 If `True` (default) include config dataset types as
795 ``initOutputs``.
796 storage_class_mapping : `Mapping` of `str` to `StorageClass`, optional
797 If a taskdef contains a component dataset type that is unknown
798 to the registry, its parent StorageClass will be looked up in this
799 mapping if it is supplied. If the mapping does not contain the
800 composite dataset type, or the mapping is not supplied, an exception
801 will be raised.
803 Returns
804 -------
805 types: `TaskDatasetTypes`
806 The dataset types used by this task.
808 Raises
809 ------
810 ValueError
811 Raised if dataset type connection definition differs from
812 registry definition.
813 LookupError
814 Raised if component parent StorageClass could not be determined
815 and storage_class_mapping does not contain the composite type, or
816 is set to None.
817 """
819 def makeDatasetTypesSet(
820 connectionType: str,
821 is_input: bool,
822 freeze: bool = True,
823 ) -> NamedValueSet[DatasetType]:
824 """Constructs a set of true `DatasetType` objects
826 Parameters
827 ----------
828 connectionType : `str`
829 Name of the connection type to produce a set for, corresponds
830 to an attribute of type `list` on the connection class instance.
831 is_input : `bool`
832 If `True`, these are input dataset types; otherwise they are
833 output dataset types.
834 freeze : `bool`, optional
835 If `True`, call `NamedValueSet.freeze` on the object returned.
837 Returns
838 -------
839 datasetTypes : `NamedValueSet`
840 A set of all datasetTypes which correspond to the input
841 connection type specified in the connection class of this
842 `PipelineTask`
844 Raises
845 ------
846 ValueError
847 Raised if dataset type connection definition differs from
848 registry definition.
849 LookupError
850 Raised if component parent StorageClass could not be determined
851 and storage_class_mapping does not contain the composite type,
852 or is set to None.
854 Notes
855 -----
856 This function is a closure over the variables ``registry``,
857 ``taskDef``, and ``storage_class_mapping``.
858 """
859 datasetTypes = NamedValueSet[DatasetType]()
860 for c in iterConnections(taskDef.connections, connectionType):
861 dimensions = set(getattr(c, "dimensions", set()))
862 if "skypix" in dimensions:
863 try:
864 datasetType = registry.getDatasetType(c.name)
865 except LookupError as err:
866 raise LookupError(
867 f"DatasetType '{c.name}' referenced by "
868 f"{type(taskDef.connections).__name__} uses 'skypix' as a dimension "
869 f"placeholder, but does not already exist in the registry. "
870 f"Note that reference catalog names are now used as the dataset "
871 f"type name instead of 'ref_cat'."
872 ) from err
873 rest1 = set(registry.dimensions.extract(dimensions - set(["skypix"])).names)
874 rest2 = set(
875 dim.name for dim in datasetType.dimensions if not isinstance(dim, SkyPixDimension)
876 )
877 if rest1 != rest2:
878 raise ValueError(
879 f"Non-skypix dimensions for dataset type {c.name} declared in "
880 f"connections ({rest1}) are inconsistent with those in "
881 f"registry's version of this dataset ({rest2})."
882 )
883 else:
884 # Component dataset types are not explicitly in the
885 # registry. This complicates consistency checks with
886 # registry and requires we work out the composite storage
887 # class.
888 registryDatasetType = None
889 try:
890 registryDatasetType = registry.getDatasetType(c.name)
891 except KeyError:
892 compositeName, componentName = DatasetType.splitDatasetTypeName(c.name)
893 if componentName:
894 if storage_class_mapping is None or compositeName not in storage_class_mapping:
895 raise LookupError(
896 "Component parent class cannot be determined, and "
897 "composite name was not in storage class mapping, or no "
898 "storage_class_mapping was supplied"
899 )
900 else:
901 parentStorageClass = storage_class_mapping[compositeName]
902 else:
903 parentStorageClass = None
904 datasetType = c.makeDatasetType(
905 registry.dimensions, parentStorageClass=parentStorageClass
906 )
907 registryDatasetType = datasetType
908 else:
909 datasetType = c.makeDatasetType(
910 registry.dimensions, parentStorageClass=registryDatasetType.parentStorageClass
911 )
913 if registryDatasetType and datasetType != registryDatasetType:
914 # The dataset types differ but first check to see if
915 # they are compatible before raising.
916 if is_input:
917 # This DatasetType must be compatible on get.
918 is_compatible = datasetType.is_compatible_with(registryDatasetType)
919 else:
920 # Has to be able to be converted to the expected type
921 # on put.
922 is_compatible = registryDatasetType.is_compatible_with(datasetType)
923 if is_compatible:
924 # For inputs we want the pipeline to use the
925 # pipeline definition, for outputs it should use
926 # the registry definition.
927 if not is_input:
928 datasetType = registryDatasetType
929 _LOG.debug(
930 "Dataset types differ (task %s != registry %s) but are compatible"
931 " for %s in %s.",
932 datasetType,
933 registryDatasetType,
934 "input" if is_input else "output",
935 taskDef.label,
936 )
937 else:
938 try:
939 # Explicitly check for storage class just to
940 # make more specific message.
941 _ = datasetType.storageClass
942 except KeyError:
943 raise ValueError(
944 "Storage class does not exist for supplied dataset type "
945 f"{datasetType} for {taskDef.label}."
946 ) from None
947 raise ValueError(
948 f"Supplied dataset type ({datasetType}) inconsistent with "
949 f"registry definition ({registryDatasetType}) "
950 f"for {taskDef.label}."
951 )
952 datasetTypes.add(datasetType)
953 if freeze:
954 datasetTypes.freeze()
955 return datasetTypes
957 # optionally add initOutput dataset for config
958 initOutputs = makeDatasetTypesSet("initOutputs", is_input=False, freeze=False)
959 if include_configs:
960 initOutputs.add(
961 DatasetType(
962 taskDef.configDatasetName,
963 registry.dimensions.empty,
964 storageClass="Config",
965 )
966 )
967 initOutputs.freeze()
969 # optionally add output dataset for metadata
970 outputs = makeDatasetTypesSet("outputs", is_input=False, freeze=False)
971 if taskDef.metadataDatasetName is not None:
972 # Metadata is supposed to be of the TaskMetadata type, its
973 # dimensions correspond to a task quantum.
974 dimensions = registry.dimensions.extract(taskDef.connections.dimensions)
976 # Allow the storage class definition to be read from the existing
977 # dataset type definition if present.
978 try:
979 current = registry.getDatasetType(taskDef.metadataDatasetName)
980 except KeyError:
981 # No previous definition so use the default.
982 storageClass = "TaskMetadata" if _TASK_METADATA_TYPE is TaskMetadata else "PropertySet"
983 else:
984 storageClass = current.storageClass.name
986 outputs.update({DatasetType(taskDef.metadataDatasetName, dimensions, storageClass)})
987 if taskDef.logOutputDatasetName is not None:
988 # Log output dimensions correspond to a task quantum.
989 dimensions = registry.dimensions.extract(taskDef.connections.dimensions)
990 outputs.update({DatasetType(taskDef.logOutputDatasetName, dimensions, "ButlerLogRecords")})
992 outputs.freeze()
994 return cls(
995 initInputs=makeDatasetTypesSet("initInputs", is_input=True),
996 initOutputs=initOutputs,
997 inputs=makeDatasetTypesSet("inputs", is_input=True),
998 prerequisites=makeDatasetTypesSet("prerequisiteInputs", is_input=True),
999 outputs=outputs,
1000 )
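# Editor-added illustrative sketch (not part of the original module):
# classifying the dataset types of a single task against a butler registry.
def _example_task_dataset_types(task_def: TaskDef, registry: Registry) -> TaskDatasetTypes:
    types = TaskDatasetTypes.fromTaskDef(task_def, registry=registry)
    # Each attribute (inputs, outputs, prerequisites, ...) is a frozen
    # NamedValueSet of DatasetType objects.
    _LOG.debug("Inputs for %s: %s", task_def.label, {dt.name for dt in types.inputs})
    return types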
1003@dataclass(frozen=True)
1004class PipelineDatasetTypes:
1005 """An immutable struct that classifies the dataset types used in a
1006 `Pipeline`.
1007 """
1009 packagesDatasetName: ClassVar[str] = "packages"
1010 """Name of a dataset type used to save package versions.
1011 """
1013 initInputs: NamedValueSet[DatasetType]
1014 """Dataset types that are needed as inputs in order to construct the Tasks
1015 in this Pipeline.
1017 This does not include dataset types that are produced when constructing
1018 other Tasks in the Pipeline (these are classified as `initIntermediates`).
1019 """
1021 initOutputs: NamedValueSet[DatasetType]
1022 """Dataset types that may be written after constructing the Tasks in this
1023 Pipeline.
1025 This does not include dataset types that are also used as inputs when
1026 constructing other Tasks in the Pipeline (these are classified as
1027 `initIntermediates`).
1028 """
1030 initIntermediates: NamedValueSet[DatasetType]
1031 """Dataset types that are both used when constructing one or more Tasks
1032 in the Pipeline and produced as a side-effect of constructing another
1033 Task in the Pipeline.
1034 """
1036 inputs: NamedValueSet[DatasetType]
1037 """Dataset types that are regular inputs for the full pipeline.
1039 If an input dataset needed for a Quantum cannot be found in the input
1040 collection(s), that Quantum (and all dependent Quanta) will not be
1041 produced.
1042 """
1044 prerequisites: NamedValueSet[DatasetType]
1045 """Dataset types that are prerequisite inputs for the full Pipeline.
1047 Prerequisite inputs must exist in the input collection(s) before the
1048 pipeline is run, but do not constrain the graph - if a prerequisite is
1049 missing for a Quantum, `PrerequisiteMissingError` is raised.
1051 Prerequisite inputs are not resolved until the second stage of
1052 QuantumGraph generation.
1053 """
1055 intermediates: NamedValueSet[DatasetType]
1056 """Dataset types that are output by one Task in the Pipeline and consumed
1057 as inputs by one or more other Tasks in the Pipeline.
1058 """
1060 outputs: NamedValueSet[DatasetType]
1061 """Dataset types that are output by a Task in the Pipeline and not consumed
1062 by any other Task in the Pipeline.
1063 """
1065 byTask: Mapping[str, TaskDatasetTypes]
1066 """Per-Task dataset types, keyed by label in the `Pipeline`.
1068 This is guaranteed to be zip-iterable with the `Pipeline` itself (assuming
1069 neither has been modified since the dataset types were extracted, of
1070 course).
1071 """
1073 @classmethod
1074 def fromPipeline(
1075 cls,
1076 pipeline: Union[Pipeline, Iterable[TaskDef]],
1077 *,
1078 registry: Registry,
1079 include_configs: bool = True,
1080 include_packages: bool = True,
1081 ) -> PipelineDatasetTypes:
1082 """Extract and classify the dataset types from all tasks in a
1083 `Pipeline`.
1085 Parameters
1086 ----------
1087 pipeline: `Pipeline` or `Iterable` [ `TaskDef` ]
1088 A collection of tasks that can be run together.
1089 registry: `Registry`
1090 Registry used to construct normalized `DatasetType` objects and
1091 retrieve those that are incomplete.
1092 include_configs : `bool`, optional
1093 If `True` (default) include config dataset types as
1094 ``initOutputs``.
1095 include_packages : `bool`, optional
1096 If `True` (default) include the dataset type for software package
1097 versions in ``initOutputs``.
1099 Returns
1100 -------
1101 types: `PipelineDatasetTypes`
1102 The dataset types used by this `Pipeline`.
1104 Raises
1105 ------
1106 ValueError
1107 Raised if Tasks are inconsistent about which datasets are marked
1108 prerequisite. This indicates that the Tasks cannot be run as part
1109 of the same `Pipeline`.
1110 """
1111 allInputs = NamedValueSet[DatasetType]()
1112 allOutputs = NamedValueSet[DatasetType]()
1113 allInitInputs = NamedValueSet[DatasetType]()
1114 allInitOutputs = NamedValueSet[DatasetType]()
1115 prerequisites = NamedValueSet[DatasetType]()
1116 byTask = dict()
1117 if include_packages:
1118 allInitOutputs.add(
1119 DatasetType(
1120 cls.packagesDatasetName,
1121 registry.dimensions.empty,
1122 storageClass="Packages",
1123 )
1124 )
1125 # create a list of TaskDefs in case the input is a generator
1126 pipeline = list(pipeline)
1128 # collect all the output dataset types
1129 typeStorageclassMap: Dict[str, str] = {}
1130 for taskDef in pipeline:
1131 for outConnection in iterConnections(taskDef.connections, "outputs"):
1132 typeStorageclassMap[outConnection.name] = outConnection.storageClass
1134 for taskDef in pipeline:
1135 thisTask = TaskDatasetTypes.fromTaskDef(
1136 taskDef,
1137 registry=registry,
1138 include_configs=include_configs,
1139 storage_class_mapping=typeStorageclassMap,
1140 )
1141 allInitInputs.update(thisTask.initInputs)
1142 allInitOutputs.update(thisTask.initOutputs)
1143 allInputs.update(thisTask.inputs)
1144 prerequisites.update(thisTask.prerequisites)
1145 allOutputs.update(thisTask.outputs)
1146 byTask[taskDef.label] = thisTask
1147 if not prerequisites.isdisjoint(allInputs):
1148 raise ValueError(
1149 "{} marked as both prerequisites and regular inputs".format(
1150 {dt.name for dt in allInputs & prerequisites}
1151 )
1152 )
1153 if not prerequisites.isdisjoint(allOutputs):
1154 raise ValueError(
1155 "{} marked as both prerequisites and outputs".format(
1156 {dt.name for dt in allOutputs & prerequisites}
1157 )
1158 )
1159 # Make sure that components which are marked as inputs get treated as
1160 # intermediates if there is an output which produces the composite
1161 # containing the component
1162 intermediateComponents = NamedValueSet[DatasetType]()
1163 intermediateComposites = NamedValueSet[DatasetType]()
1164 outputNameMapping = {dsType.name: dsType for dsType in allOutputs}
1165 for dsType in allInputs:
1166 # get the name of a possible component
1167 name, component = dsType.nameAndComponent()
1168 # if there is a component name, that means this is a component
1169 # DatasetType; if there is an output which produces the parent of
1170 # this component, treat this input as an intermediate
1171 if component is not None:
1172 # This needs to be in this if block, because someone might have
1173 # a composite that is a pure input from existing data
1174 if name in outputNameMapping:
1175 intermediateComponents.add(dsType)
1176 intermediateComposites.add(outputNameMapping[name])
1178 def checkConsistency(a: NamedValueSet, b: NamedValueSet) -> None:
1179 common = a.names & b.names
1180 for name in common:
1181 # Any compatibility is allowed. This function does not know
1182 # if a dataset type is to be used for input or output.
1183 if not (a[name].is_compatible_with(b[name]) or b[name].is_compatible_with(a[name])):
1184 raise ValueError(f"Conflicting definitions for dataset type: {a[name]} != {b[name]}.")
1186 checkConsistency(allInitInputs, allInitOutputs)
1187 checkConsistency(allInputs, allOutputs)
1188 checkConsistency(allInputs, intermediateComposites)
1189 checkConsistency(allOutputs, intermediateComposites)
1191 def frozen(s: AbstractSet[DatasetType]) -> NamedValueSet[DatasetType]:
1192 assert isinstance(s, NamedValueSet)
1193 s.freeze()
1194 return s
1196 return cls(
1197 initInputs=frozen(allInitInputs - allInitOutputs),
1198 initIntermediates=frozen(allInitInputs & allInitOutputs),
1199 initOutputs=frozen(allInitOutputs - allInitInputs),
1200 inputs=frozen(allInputs - allOutputs - intermediateComponents),
1201 # If there are storage class differences in inputs and outputs
1202 # the intermediates have to choose priority. Here choose that
1203 # inputs to tasks must match the requested storage class by
1204 # applying the inputs over the top of the outputs.
1205 intermediates=frozen(allOutputs & allInputs | intermediateComponents),
1206 outputs=frozen(allOutputs - allInputs - intermediateComposites),
1207 prerequisites=frozen(prerequisites),
1208 byTask=MappingProxyType(byTask), # MappingProxyType -> frozen view of dict for immutability
1209 )
1211 @classmethod
1212 def initOutputNames(
1213 cls,
1214 pipeline: Union[Pipeline, Iterable[TaskDef]],
1215 *,
1216 include_configs: bool = True,
1217 include_packages: bool = True,
1218 ) -> Iterator[str]:
1219 """Return the names of dataset types ot task initOutputs, Configs,
1220 and package versions for a pipeline.
1222 Parameters
1223 ----------
1224 pipeline: `Pipeline` or `Iterable` [ `TaskDef` ]
1225 A `Pipeline` instance or collection of `TaskDef` instances.
1226 include_configs : `bool`, optional
1227 If `True` (default) include config dataset types.
1228 include_packages : `bool`, optional
1229 If `True` (default) include the dataset type for package versions.
1231 Yields
1232 ------
1233 datasetTypeName : `str`
1234 Name of the dataset type.
1235 """
1236 if include_packages:
1237 # Package versions dataset type
1238 yield cls.packagesDatasetName
1240 if isinstance(pipeline, Pipeline):
1241 pipeline = pipeline.toExpandedPipeline()
1243 for taskDef in pipeline:
1245 # all task InitOutputs
1246 for name in taskDef.connections.initOutputs:
1247 attribute = getattr(taskDef.connections, name)
1248 yield attribute.name
1250 # config dataset name
1251 if include_configs:
1252 yield taskDef.configDatasetName
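# Editor-added illustrative sketch (not part of the original module):
# classifying dataset types for a whole pipeline and listing the names of its
# init-output dataset types.
def _example_pipeline_dataset_types(pipeline: Pipeline, registry: Registry) -> PipelineDatasetTypes:
    dataset_types = PipelineDatasetTypes.fromPipeline(pipeline, registry=registry)
    _LOG.debug("Overall inputs: %s", {dt.name for dt in dataset_types.inputs})
    # initOutputNames() only walks the task connections, so no registry is
    # needed; it includes config and package-version dataset type names.
    _LOG.debug("Init outputs: %s", list(PipelineDatasetTypes.initOutputNames(pipeline)))
    return dataset_types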