Coverage for python/lsst/pipe/base/pipeline.py: 19%
1# This file is part of pipe_base.
2#
3# Developed for the LSST Data Management System.
4# This product includes software developed by the LSST Project
5# (http://www.lsst.org).
6# See the COPYRIGHT file at the top-level directory of this distribution
7# for details of code ownership.
8#
9# This program is free software: you can redistribute it and/or modify
10# it under the terms of the GNU General Public License as published by
11# the Free Software Foundation, either version 3 of the License, or
12# (at your option) any later version.
13#
14# This program is distributed in the hope that it will be useful,
15# but WITHOUT ANY WARRANTY; without even the implied warranty of
16# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
17# GNU General Public License for more details.
18#
19# You should have received a copy of the GNU General Public License
20# along with this program. If not, see <http://www.gnu.org/licenses/>.
21from __future__ import annotations
23"""Module defining Pipeline class and related methods.
24"""
26__all__ = ["Pipeline", "TaskDef", "TaskDatasetTypes", "PipelineDatasetTypes", "LabelSpecifier"]
28# -------------------------------
29# Imports of standard modules --
30# -------------------------------
31from dataclasses import dataclass
32import logging
33from types import MappingProxyType
34from typing import (ClassVar, Dict, Iterable, Iterator, Mapping, Set, Union,
35 Generator, TYPE_CHECKING, Optional, Tuple)
37import copy
38import re
39import os
40import urllib.parse
41import warnings
43# -----------------------------
44# Imports for other modules --
45from lsst.daf.butler import DatasetType, NamedValueSet, Registry, SkyPixDimension, ButlerURI
46from lsst.utils import doImport
47from .configOverrides import ConfigOverrides
48from .connections import iterConnections
49from .pipelineTask import PipelineTask
50from .task import _TASK_METADATA_TYPE
51from ._task_metadata import TaskMetadata
53from . import pipelineIR
54from . import pipeTools
56if TYPE_CHECKING: # Imports needed only for type annotations; may be circular.
57 from lsst.obs.base import Instrument
59# ----------------------------------
60# Local non-exported definitions --
61# ----------------------------------
63_LOG = logging.getLogger(__name__)
65# ------------------------
66# Exported definitions --
67# ------------------------
70@dataclass
71class LabelSpecifier:
72 """A structure to specify a subset of labels to load
74 This structure may contain a set of labels to be used in subsetting a
75 pipeline, or a beginning and end point. Beginning or end may be empty,
76 in which case the range will be a half open interval. Unlike python
77 iteration bounds, end bounds are *INCLUDED*. Note that range based
78 selection is not well defined for pipelines that are not linear in nature,
79 and correct behavior is not guaranteed, or may vary from run to run.
80 """
81 labels: Optional[Set[str]] = None
82 begin: Optional[str] = None
83 end: Optional[str] = None
85 def __post_init__(self):
86 if self.labels is not None and (self.begin or self.end):
87 raise ValueError("This struct can only be initialized with a labels set or "
88 "a begin (and/or) end specifier")
91class TaskDef:
92 """TaskDef is a collection of information about task needed by Pipeline.
94 The information includes task name, configuration object and optional
95 task class. This class is just a collection of attributes and it exposes
96 all of them so that attributes could potentially be modified in place
97 (e.g. if configuration needs extra overrides).
99 Attributes
100 ----------
101 taskName : `str`, optional
102 `PipelineTask` class name, currently it is not specified whether this
103 is a fully-qualified name or partial name (e.g. ``module.TaskClass``).
104 Framework should be prepared to handle all cases. If not provided,
105 ``taskClass`` must be, and ``taskClass.__name__`` is used.
106 config : `lsst.pex.config.Config`, optional
107 Instance of the configuration class corresponding to this task class,
108 usually with all overrides applied. This config will be frozen. If
109 not provided, ``taskClass`` must be provided and
110 ``taskClass.ConfigClass()`` will be used.
111 taskClass : `type`, optional
112 `PipelineTask` class object, can be ``None``. If ``None`` then
113 framework will have to locate and load class.
114 label : `str`, optional
115 Task label, usually a short string unique in a pipeline. If not
116 provided, ``taskClass`` must be, and ``taskClass._DefaultName`` will
117 be used.
118 """
119 def __init__(self, taskName=None, config=None, taskClass=None, label=None):
120 if taskName is None:
121 if taskClass is None:
122 raise ValueError("At least one of `taskName` and `taskClass` must be provided.")
123 taskName = taskClass.__name__
124 if config is None:
125 if taskClass is None:
126 raise ValueError("`taskClass` must be provided if `config` is not.")
127 config = taskClass.ConfigClass()
128 if label is None:
129 if taskClass is None:
130 raise ValueError("`taskClass` must be provided if `label` is not.")
131 label = taskClass._DefaultName
132 self.taskName = taskName
133 try:
134 config.validate()
135 except Exception:
136 _LOG.error("Configuration validation failed for task %s (%s)", label, taskName)
137 raise
138 config.freeze()
139 self.config = config
140 self.taskClass = taskClass
141 self.label = label
142 self.connections = config.connections.ConnectionsClass(config=config)
144 @property
145 def configDatasetName(self) -> str:
146 """Name of a dataset type for configuration of this task (`str`)
147 """
148 return self.label + "_config"
150 @property
151 def metadataDatasetName(self) -> Optional[str]:
152 """Name of a dataset type for metadata of this task, `None` if
153 metadata is not to be saved (`str`)
154 """
155 if self.config.saveMetadata:
156 return self.label + "_metadata"
157 else:
158 return None
160 @property
161 def logOutputDatasetName(self) -> Optional[str]:
162 """Name of a dataset type for log output from this task, `None` if
163 logs are not to be saved (`str`)
164 """
165 if self.config.saveLogOutput:
166 return self.label + "_log"
167 else:
168 return None
170 def __str__(self):
171 rep = "TaskDef(" + self.taskName
172 if self.label:
173 rep += ", label=" + self.label
174 rep += ")"
175 return rep
177 def __eq__(self, other: object) -> bool:
178 if not isinstance(other, TaskDef):
179 return False
180 # This does not consider equality of configs when determining equality
181 # as config equality is a difficult thing to define. Should be updated
182 # after DM-27847
183 return self.taskClass == other.taskClass and self.label == other.label
185 def __hash__(self):
186 return hash((self.taskClass, self.label))
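# A minimal sketch (an assumption, not from the original source): given a
# concrete PipelineTask subclass, the remaining TaskDef attributes are derived
# from it. "MyTask" is a hypothetical PipelineTask whose default ConfigClass
# validates.
#
#     taskDef = TaskDef(taskClass=MyTask)
#     taskDef.taskName    # "MyTask" (taskClass.__name__)
#     taskDef.label       # MyTask._DefaultName
#     taskDef.config      # frozen MyTask.ConfigClass() instance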
189class Pipeline:
190 """A `Pipeline` is a representation of a series of tasks to run, and the
191 configuration for those tasks.
193 Parameters
194 ----------
195 description : `str`
196 A description of what this pipeline does.
197 """
198 def __init__(self, description: str):
199 pipeline_dict = {"description": description, "tasks": {}}
200 self._pipelineIR = pipelineIR.PipelineIR(pipeline_dict)
202 @classmethod
203 def fromFile(cls, filename: str) -> Pipeline:
204 """Load a pipeline defined in a pipeline yaml file.
206 Parameters
207 ----------
208 filename: `str`
209 A path that points to a pipeline defined in yaml format. This
210 filename may also supply additional labels to be used in
211 subsetting the loaded Pipeline. These labels are separated from
212 the path by a \\#, and may be specified as a comma separated
213 list, or a range denoted as beginning..end. Beginning or end may
214 be empty, in which case the range will be a half open interval.
215 Unlike python iteration bounds, end bounds are *INCLUDED*. Note
216 that range based selection is not well defined for pipelines that
217 are not linear in nature, and correct behavior is not guaranteed,
218 or may vary from run to run.
220 Returns
221 -------
222 pipeline: `Pipeline`
223 The pipeline loaded from the specified location with appropriate (if
224 any) subsetting.
226 Notes
227 -----
228 This method attempts to prune any contracts that contain labels which
229 are not in the declared subset of labels. This pruning is done using a
230 string based matching due to the nature of contracts and may prune more
231 than it should.
232 """
233 return cls.from_uri(filename)
235 @classmethod
236 def from_uri(cls, uri: Union[str, ButlerURI]) -> Pipeline:
237 """Load a pipeline defined in a pipeline yaml file at a location
238 specified by a URI.
240 Parameters
241 ----------
242 uri: `str` or `ButlerURI`
243 If a string is supplied this should be a URI path that points to a
244 pipeline defined in yaml format. This uri may also supply
245 additional labels to be used in subsetting the loaded Pipeline.
246 These labels are separated from the path by a \\#, and may be
247 specified as a comma separated list, or a range denoted as
248 beginning..end. Beginning or end may be empty, in which case the
249 range will be a half open interval. Unlike python iteration
250 bounds, end bounds are *INCLUDED*. Note that range based selection
251 is not well defined for pipelines that are not linear in nature,
252 and correct behavior is not guaranteed, or may vary from run to
253 run. The same specifiers can be used with a ButlerURI object by
254 placing them as the sole contents of the fragment attribute.
256 Returns
257 -------
258 pipeline: `Pipeline`
259 The pipeline loaded from the specified location with appropriate (if
260 any) subsetting.
262 Notes
263 -----
264 This method attempts to prune any contracts that contain labels which
265 are not in the declared subset of labels. This pruning is done using a
266 string based matching due to the nature of contracts and may prune more
267 than it should.
268 """
269 # Split up the uri and any labels that were supplied
270 uri, label_specifier = cls._parse_file_specifier(uri)
271 pipeline: Pipeline = cls.fromIR(pipelineIR.PipelineIR.from_uri(uri))
273 # If there are labels supplied, only keep those
274 if label_specifier is not None:
275 pipeline = pipeline.subsetFromLabels(label_specifier)
276 return pipeline
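# A minimal usage sketch (hypothetical file name and labels): the fragment after
# "#" subsets the loaded pipeline, either as a comma-separated list of labels or
# as an inclusive "begin..end" range.
#
#     full = Pipeline.from_uri("DRP.yaml")
#     some = Pipeline.from_uri("DRP.yaml#isr,calibrate")    # just these labels
#     part = Pipeline.from_uri("DRP.yaml#isr..calibrate")   # inclusive range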
278 def subsetFromLabels(self, labelSpecifier: LabelSpecifier) -> Pipeline:
279 """Subset a pipeline to contain only labels specified in labelSpecifier
281 Parameters
282 ----------
283 labelSpecifier : `LabelSpecifier`
284 Object containing labels that describes how to subset a pipeline.
286 Returns
287 -------
288 pipeline : `Pipeline`
289 A new pipeline object that is a subset of the old pipeline
291 Raises
292 ------
293 ValueError
294 Raised if there is an issue with specified labels
296 Notes
297 -----
298 This method attempts to prune any contracts that contain labels which
299 are not in the declared subset of labels. This pruning is done using a
300 string based matching due to the nature of contracts and may prune more
301 than it should.
302 """
303 # Labels supplied as a set
304 if labelSpecifier.labels:
305 labelSet = labelSpecifier.labels
306 # Labels supplied as a range: first create a list of all the labels
307 # in the pipeline sorted according to task dependency. Then only
308 # keep labels that lie between the supplied bounds
309 else:
310 # Create a copy of the pipeline to use when assessing the label
311 # ordering. Use a dict for fast searching while preserving order.
312 # Remove contracts so they do not fail in the expansion step. This
313 # is needed because a user may only configure the tasks they intend
314 # to run, which may cause some contracts to fail if they will later
315 # be dropped
316 pipeline = copy.deepcopy(self)
317 pipeline._pipelineIR.contracts = []
318 labels = {taskdef.label: True for taskdef in pipeline.toExpandedPipeline()}
320 # Verify the bounds are in the labels
321 if labelSpecifier.begin is not None:
322 if labelSpecifier.begin not in labels:
323 raise ValueError(f"Beginning of range subset, {labelSpecifier.begin}, not found in "
324 "pipeline definition")
325 if labelSpecifier.end is not None:
326 if labelSpecifier.end not in labels:
327 raise ValueError(f"End of range subset, {labelSpecifier.end}, not found in pipeline "
328 "definition")
330 labelSet = set()
331 for label in labels:
332 if labelSpecifier.begin is not None:
333 if label != labelSpecifier.begin:
334 continue
335 else:
336 labelSpecifier.begin = None
337 labelSet.add(label)
338 if labelSpecifier.end is not None and label == labelSpecifier.end:
339 break
340 return Pipeline.fromIR(self._pipelineIR.subset_from_labels(labelSet))
342 @staticmethod
343 def _parse_file_specifier(uri: Union[str, ButlerURI]
344 ) -> Tuple[ButlerURI, Optional[LabelSpecifier]]:
345 """Split appart a uri and any possible label subsets
346 """
347 if isinstance(uri, str):
348 # This is to support legacy pipelines during transition
349 uri, num_replace = re.subn("[:](?!\\/\\/)", "#", uri)
350 if num_replace:
351 warnings.warn(f"The pipeline file {uri} seems to use the legacy : to separate "
352 "labels, this is deprecated and will be removed after June 2021, please use "
353 "# instead.",
354 category=FutureWarning)
355 if uri.count("#") > 1:
356 raise ValueError("Only one set of labels is allowed when specifying a pipeline to load")
357 uri = ButlerURI(uri)
358 label_subset = uri.fragment or None
360 specifier: Optional[LabelSpecifier]
361 if label_subset is not None:
362 label_subset = urllib.parse.unquote(label_subset)
363 args: Dict[str, Union[Set[str], str, None]]
364 # labels supplied as a list
365 if ',' in label_subset:
366 if '..' in label_subset:
367 raise ValueError("Can only specify a list of labels or a range"
368 "when loading a Pipline not both")
369 args = {"labels": set(label_subset.split(","))}
370 # labels supplied as a range
371 elif '..' in label_subset:
372 # Try to de-structure the labelSubset; this will fail if more
373 # than one range is specified
374 begin, end, *rest = label_subset.split("..")
375 if rest:
376 raise ValueError("Only one range can be specified when loading a pipeline")
377 args = {"begin": begin if begin else None, "end": end if end else None}
378 # Assume anything else is a single label
379 else:
380 args = {"labels": {label_subset}}
382 specifier = LabelSpecifier(**args)
383 else:
384 specifier = None
386 return uri, specifier
388 @classmethod
389 def fromString(cls, pipeline_string: str) -> Pipeline:
390 """Create a pipeline from string formatted as a pipeline document.
392 Parameters
393 ----------
394 pipeline_string : `str`
395 A string that is formatted like a pipeline document.
397 Returns
398 -------
399 pipeline: `Pipeline`
400 """
401 pipeline = cls.fromIR(pipelineIR.PipelineIR.from_string(pipeline_string))
402 return pipeline
404 @classmethod
405 def fromIR(cls, deserialized_pipeline: pipelineIR.PipelineIR) -> Pipeline:
406 """Create a pipeline from an already created `PipelineIR` object.
408 Parameters
409 ----------
410 deserialized_pipeline: `PipelineIR`
411 An already created pipeline intermediate representation object
413 Returns
414 -------
415 pipeline: `Pipeline`
416 """
417 pipeline = cls.__new__(cls)
418 pipeline._pipelineIR = deserialized_pipeline
419 return pipeline
421 @classmethod
422 def fromPipeline(cls, pipeline: Pipeline) -> Pipeline:
423 """Create a new pipeline by copying an already existing `Pipeline`.
425 Parameters
426 ----------
427 pipeline: `Pipeline`
428 The existing `Pipeline` object to copy.
430 Returns
431 -------
432 pipeline: `Pipeline`
433 """
434 return cls.fromIR(copy.deepcopy(pipeline._pipelineIR))
436 def __str__(self) -> str:
437 # tasks need to be re-sorted on each call because someone might have
438 # added or removed a task, and caching changes does not seem worth the
439 # small overhead
440 labels = [td.label for td in self._toExpandedPipelineImpl(checkContracts=False)]
441 self._pipelineIR.reorder_tasks(labels)
442 return str(self._pipelineIR)
444 def addInstrument(self, instrument: Union[Instrument, str]) -> None:
445 """Add an instrument to the pipeline, or replace an instrument that is
446 already defined.
448 Parameters
449 ----------
450 instrument : `~lsst.obs.base.Instrument` or `str`
451 Either an `~lsst.obs.base.Instrument` subclass object or
452 a string corresponding to a fully qualified
453 `~lsst.obs.base.Instrument` name.
454 """
455 if isinstance(instrument, str):
456 pass
457 else:
458 # TODO: assume that this is a subclass of Instrument, no type
459 # checking
460 instrument = f"{instrument.__module__}.{instrument.__qualname__}"
461 self._pipelineIR.instrument = instrument
463 def getInstrument(self) -> Instrument:
464 """Get the instrument from the pipeline.
466 Returns
467 -------
468 instrument : `~lsst.obs.base.Instrument`, `str`, or None
469 An `~lsst.obs.base.Instrument` subclass object, a string
470 corresponding to a fully qualified `~lsst.obs.base.Instrument`
471 name, or `None` if the pipeline does not have an instrument.
472 """
473 return self._pipelineIR.instrument
475 def addTask(self, task: Union[PipelineTask, str], label: str) -> None:
476 """Add a new task to the pipeline, or replace a task that is already
477 associated with the supplied label.
479 Parameters
480 ----------
481 task: `PipelineTask` or `str`
482 Either a derived class object of a `PipelineTask` or a string
483 corresponding to a fully qualified `PipelineTask` name.
484 label: `str`
485 A label that is used to identify the `PipelineTask` being added
486 """
487 if isinstance(task, str):
488 taskName = task
489 elif issubclass(task, PipelineTask):
490 taskName = f"{task.__module__}.{task.__qualname__}"
491 else:
492 raise ValueError("task must be either a child class of PipelineTask or a string containing"
493 " a fully qualified name to one")
494 if not label:
495 # in some cases (with a command line-generated pipeline) tasks can
496 # be defined without a label, which is not acceptable; use the task's
497 # _DefaultName in that case
498 if isinstance(task, str):
499 task = doImport(task)
500 label = task._DefaultName
501 self._pipelineIR.tasks[label] = pipelineIR.TaskIR(label, taskName)
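# A minimal sketch of building a pipeline programmatically (hypothetical task
# class, label, and module path):
#
#     pipeline = Pipeline("demo pipeline")
#     pipeline.addTask("lsst.example.tasks.MyTask", "myTask")  # by fully qualified name
#     pipeline.addTask(MyTask, "myTask")                       # or by PipelineTask subclass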
503 def removeTask(self, label: str) -> None:
504 """Remove a task from the pipeline.
506 Parameters
507 ----------
508 label : `str`
509 The label used to identify the task that is to be removed
511 Raises
512 ------
513 KeyError
514 If no task with that label exists in the pipeline
516 """
517 self._pipelineIR.tasks.pop(label)
519 def addConfigOverride(self, label: str, key: str, value: object) -> None:
520 """Apply single config override.
522 Parameters
523 ----------
524 label : `str`
525 Label of the task.
526 key: `str`
527 Fully-qualified field name.
528 value : object
529 Value to be given to a field.
530 """
531 self._addConfigImpl(label, pipelineIR.ConfigIR(rest={key: value}))
533 def addConfigFile(self, label: str, filename: str) -> None:
534 """Add overrides from a specified file.
536 Parameters
537 ----------
538 label : `str`
539 The label used to identify the task associated with config to
540 modify
541 filename : `str`
542 Path to the override file.
543 """
544 self._addConfigImpl(label, pipelineIR.ConfigIR(file=[filename]))
546 def addConfigPython(self, label: str, pythonString: str) -> None:
547 """Add Overrides by running a snippet of python code against a config.
549 Parameters
550 ----------
551 label : `str`
552 The label used to identify the task associated with config to
553 modify.
554 pythonString: `str`
555 A string which is valid python code to be executed. This is done
556 with config as the only local accessible value.
557 """
558 self._addConfigImpl(label, pipelineIR.ConfigIR(python=pythonString))
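# A minimal sketch of the three override flavors (hypothetical label, field
# name, and file path):
#
#     pipeline.addConfigOverride("myTask", "threshold", 5.0)
#     pipeline.addConfigFile("myTask", "config/myTaskOverrides.py")
#     pipeline.addConfigPython("myTask", "config.threshold = 5.0")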
560 def _addConfigImpl(self, label: str, newConfig: pipelineIR.ConfigIR) -> None:
561 if label == "parameters":
562 if newConfig.rest.keys() - self._pipelineIR.parameters.mapping.keys():
563 raise ValueError("Cannot override parameters that are not defined in pipeline")
564 self._pipelineIR.parameters.mapping.update(newConfig.rest)
565 if newConfig.file:
566 raise ValueError("Setting parameters section with config file is not supported")
567 if newConfig.python:
568 raise ValueError("Setting parameters section using python block in unsupported")
569 return
570 if label not in self._pipelineIR.tasks:
571 raise LookupError(f"There are no tasks labeled '{label}' in the pipeline")
572 self._pipelineIR.tasks[label].add_or_update_config(newConfig)
574 def toFile(self, filename: str) -> None:
575 self._pipelineIR.to_file(filename)
577 def write_to_uri(self, uri: Union[str, ButlerURI]) -> None:
578 # tasks need to be re-sorted on each call because someone might have
579 # added or removed a task, and caching changes does not seem worth the
580 # small overhead
581 labels = [td.label for td in self._toExpandedPipelineImpl(checkContracts=False)]
582 self._pipelineIR.reorder_tasks(labels)
583 self._pipelineIR.write_to_uri(uri)
585 def toExpandedPipeline(self) -> Generator[TaskDef, None, None]:
586 """Returns a generator of TaskDefs which can be used to create quantum
587 graphs.
589 Returns
590 -------
591 generator : generator of `TaskDef`
592 The generator returned will be the sorted iterator of tasks which
593 are to be used in constructing a quantum graph.
595 Raises
596 ------
597 NotImplementedError
598 If a dataId is supplied in a config block. This is in place for
599 future use
600 """
601 yield from self._toExpandedPipelineImpl()
603 def _toExpandedPipelineImpl(self, checkContracts=True) -> Iterable[TaskDef]:
604 taskDefs = []
605 for label in self._pipelineIR.tasks:
606 taskDefs.append(self._buildTaskDef(label))
608 # let's evaluate the contracts
609 if self._pipelineIR.contracts is not None:
610 label_to_config = {x.label: x.config for x in taskDefs}
611 for contract in self._pipelineIR.contracts:
612 # execute this in its own line so it can raise a good error
613 # message if there were problems with the eval
614 success = eval(contract.contract, None, label_to_config)
615 if not success:
616 extra_info = f": {contract.msg}" if contract.msg is not None else ""
617 raise pipelineIR.ContractError(f"Contract(s) '{contract.contract}' were not "
618 f"satisfied{extra_info}")
620 taskDefs = sorted(taskDefs, key=lambda x: x.label)
621 yield from pipeTools.orderPipeline(taskDefs)
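# A note on the contract evaluation above (illustrative; the config field names
# are hypothetical): each contract is a python expression over task labels, and
# is evaluated with every label bound to that task's config, e.g.
#
#     eval("isr.doBias == calibrate.doApplyBias", None, label_to_config)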
623 def _buildTaskDef(self, label: str) -> TaskDef:
624 if (taskIR := self._pipelineIR.tasks.get(label)) is None:
625 raise NameError(f"Label {label} does not appear in this pipeline")
626 taskClass = doImport(taskIR.klass)
627 taskName = taskClass.__qualname__
628 config = taskClass.ConfigClass()
629 overrides = ConfigOverrides()
630 if self._pipelineIR.instrument is not None:
631 overrides.addInstrumentOverride(self._pipelineIR.instrument, taskClass._DefaultName)
632 if taskIR.config is not None:
633 for configIR in (configIr.formatted(self._pipelineIR.parameters)
634 for configIr in taskIR.config):
635 if configIR.dataId is not None:
636 raise NotImplementedError("Specializing a config on a partial data id is not yet "
637 "supported in Pipeline definition")
638 # only apply override if it applies to everything
639 if configIR.dataId is None:
640 if configIR.file:
641 for configFile in configIR.file:
642 overrides.addFileOverride(os.path.expandvars(configFile))
643 if configIR.python is not None:
644 overrides.addPythonOverride(configIR.python)
645 for key, value in configIR.rest.items():
646 overrides.addValueOverride(key, value)
647 overrides.applyTo(config)
648 return TaskDef(taskName=taskName, config=config, taskClass=taskClass, label=label)
650 def __iter__(self) -> Generator[TaskDef, None, None]:
651 return self.toExpandedPipeline()
653 def __getitem__(self, item: str) -> TaskDef:
654 return self._buildTaskDef(item)
656 def __len__(self):
657 return len(self._pipelineIR.tasks)
659 def __eq__(self, other: object):
660 if not isinstance(other, Pipeline):
661 return False
662 return self._pipelineIR == other._pipelineIR
665@dataclass(frozen=True)
666class TaskDatasetTypes:
667 """An immutable struct that extracts and classifies the dataset types used
668 by a `PipelineTask`
669 """
671 initInputs: NamedValueSet[DatasetType]
672 """Dataset types that are needed as inputs in order to construct this Task.
674 Task-level `initInputs` may be classified as either
675 `~PipelineDatasetTypes.initInputs` or
676 `~PipelineDatasetTypes.initIntermediates` at the Pipeline level.
677 """
679 initOutputs: NamedValueSet[DatasetType]
680 """Dataset types that may be written after constructing this Task.
682 Task-level `initOutputs` may be classified as either
683 `~PipelineDatasetTypes.initOutputs` or
684 `~PipelineDatasetTypes.initIntermediates` at the Pipeline level.
685 """
687 inputs: NamedValueSet[DatasetType]
688 """Dataset types that are regular inputs to this Task.
690 If an input dataset needed for a Quantum cannot be found in the input
691 collection(s) or produced by another Task in the Pipeline, that Quantum
692 (and all dependent Quanta) will not be produced.
694 Task-level `inputs` may be classified as either
695 `~PipelineDatasetTypes.inputs` or `~PipelineDatasetTypes.intermediates`
696 at the Pipeline level.
697 """
699 prerequisites: NamedValueSet[DatasetType]
700 """Dataset types that are prerequisite inputs to this Task.
702 Prerequisite inputs must exist in the input collection(s) before the
703 pipeline is run, but do not constrain the graph - if a prerequisite is
704 missing for a Quantum, `PrerequisiteMissingError` is raised.
706 Prerequisite inputs are not resolved until the second stage of
707 QuantumGraph generation.
708 """
710 outputs: NamedValueSet[DatasetType]
711 """Dataset types that are produced by this Task.
713 Task-level `outputs` may be classified as either
714 `~PipelineDatasetTypes.outputs` or `~PipelineDatasetTypes.intermediates`
715 at the Pipeline level.
716 """
718 @classmethod
719 def fromTaskDef(
720 cls,
721 taskDef: TaskDef,
722 *,
723 registry: Registry,
724 include_configs: bool = True,
725 storage_class_mapping: Optional[Mapping[str, str]] = None
726 ) -> TaskDatasetTypes:
727 """Extract and classify the dataset types from a single `PipelineTask`.
729 Parameters
730 ----------
731 taskDef: `TaskDef`
732 An instance of a `TaskDef` class for a particular `PipelineTask`.
733 registry: `Registry`
734 Registry used to construct normalized `DatasetType` objects and
735 retrieve those that are incomplete.
736 include_configs : `bool`, optional
737 If `True` (default) include config dataset types as
738 ``initOutputs``.
739 storage_class_mapping : `Mapping` of `str` to `StorageClass`, optional
740 If a taskdef contains a component dataset type that is unknown
741 to the registry, its parent StorageClass will be looked up in this
742 mapping if it is supplied. If the mapping does not contain the
743 composite dataset type, or the mapping is not supplied, an exception
744 will be raised.
746 Returns
747 -------
748 types: `TaskDatasetTypes`
749 The dataset types used by this task.
751 Raises
752 ------
753 ValueError
754 Raised if dataset type connection definition differs from
755 registry definition.
756 LookupError
757 Raised if component parent StorageClass could not be determined
758 and storage_class_mapping does not contain the composite type, or
759 is set to None.
760 """
761 def makeDatasetTypesSet(connectionType: str, freeze: bool = True) -> NamedValueSet[DatasetType]:
762 """Constructs a set of true `DatasetType` objects
764 Parameters
765 ----------
766 connectionType : `str`
767 Name of the connection type to produce a set for, corresponds
768 to an attribute of type `list` on the connection class instance
769 freeze : `bool`, optional
770 If `True`, call `NamedValueSet.freeze` on the object returned.
772 Returns
773 -------
774 datasetTypes : `NamedValueSet`
775 A set of all datasetTypes which correspond to the input
776 connection type specified in the connection class of this
777 `PipelineTask`
779 Raises
780 ------
781 ValueError
782 Raised if dataset type connection definition differs from
783 registry definition.
784 LookupError
785 Raised if component parent StorageClass could not be determined
786 and storage_class_mapping does not contain the composite type,
787 or is set to None.
789 Notes
790 -----
791 This function is a closure over the variables ``registry``,
792 ``taskDef``, and ``storage_class_mapping``.
793 """
794 datasetTypes = NamedValueSet()
795 for c in iterConnections(taskDef.connections, connectionType):
796 dimensions = set(getattr(c, 'dimensions', set()))
797 if "skypix" in dimensions:
798 try:
799 datasetType = registry.getDatasetType(c.name)
800 except LookupError as err:
801 raise LookupError(
802 f"DatasetType '{c.name}' referenced by "
803 f"{type(taskDef.connections).__name__} uses 'skypix' as a dimension "
804 f"placeholder, but does not already exist in the registry. "
805 f"Note that reference catalog names are now used as the dataset "
806 f"type name instead of 'ref_cat'."
807 ) from err
808 rest1 = set(registry.dimensions.extract(dimensions - set(["skypix"])).names)
809 rest2 = set(dim.name for dim in datasetType.dimensions
810 if not isinstance(dim, SkyPixDimension))
811 if rest1 != rest2:
812 raise ValueError(f"Non-skypix dimensions for dataset type {c.name} declared in "
813 f"connections ({rest1}) are inconsistent with those in "
814 f"registry's version of this dataset ({rest2}).")
815 else:
816 # Component dataset types are not explicitly in the
817 # registry. This complicates consistency checks with
818 # registry and requires we work out the composite storage
819 # class.
820 registryDatasetType = None
821 try:
822 registryDatasetType = registry.getDatasetType(c.name)
823 except KeyError:
824 compositeName, componentName = DatasetType.splitDatasetTypeName(c.name)
825 if componentName:
826 if storage_class_mapping is None or compositeName not in storage_class_mapping:
827 raise LookupError("Component parent class cannot be determined, and "
828 "composite name was not in storage class mapping, or no "
829 "storage_class_mapping was supplied")
830 else:
831 parentStorageClass = storage_class_mapping[compositeName]
832 else:
833 parentStorageClass = None
834 datasetType = c.makeDatasetType(
835 registry.dimensions,
836 parentStorageClass=parentStorageClass
837 )
838 registryDatasetType = datasetType
839 else:
840 datasetType = c.makeDatasetType(
841 registry.dimensions,
842 parentStorageClass=registryDatasetType.parentStorageClass
843 )
845 if registryDatasetType and datasetType != registryDatasetType:
846 try:
847 # Explicitly check for storage class just to make
848 # more specific message.
849 _ = datasetType.storageClass
850 except KeyError:
851 raise ValueError("Storage class does not exist for supplied dataset type "
852 f"{datasetType} for {taskDef.label}.") from None
853 raise ValueError(f"Supplied dataset type ({datasetType}) inconsistent with "
854 f"registry definition ({registryDatasetType}) "
855 f"for {taskDef.label}.")
856 datasetTypes.add(datasetType)
857 if freeze:
858 datasetTypes.freeze()
859 return datasetTypes
861 # optionally add initOutput dataset for config
862 initOutputs = makeDatasetTypesSet("initOutputs", freeze=False)
863 if include_configs:
864 initOutputs.add(
865 DatasetType(
866 taskDef.configDatasetName,
867 registry.dimensions.empty,
868 storageClass="Config",
869 )
870 )
871 initOutputs.freeze()
873 # optionally add output dataset for metadata
874 outputs = makeDatasetTypesSet("outputs", freeze=False)
875 if taskDef.metadataDatasetName is not None:
876 # Metadata is supposed to be of the PropertySet type; its
877 # dimensions correspond to a task quantum
878 dimensions = registry.dimensions.extract(taskDef.connections.dimensions)
879 if _TASK_METADATA_TYPE is TaskMetadata:
880 storageClass = "TaskMetadata"
881 else:
882 storageClass = "PropertySet"
883 outputs |= {DatasetType(taskDef.metadataDatasetName, dimensions, storageClass)}
884 if taskDef.logOutputDatasetName is not None:
885 # Log output dimensions correspond to a task quantum.
886 dimensions = registry.dimensions.extract(taskDef.connections.dimensions)
887 outputs |= {DatasetType(taskDef.logOutputDatasetName, dimensions, "ButlerLogRecords")}
889 outputs.freeze()
891 return cls(
892 initInputs=makeDatasetTypesSet("initInputs"),
893 initOutputs=initOutputs,
894 inputs=makeDatasetTypesSet("inputs"),
895 prerequisites=makeDatasetTypesSet("prerequisiteInputs"),
896 outputs=outputs,
897 )
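# A minimal usage sketch (assuming access to a Butler repository; the repository
# path is hypothetical):
#
#     from lsst.daf.butler import Butler
#     butler = Butler("/repo/main")
#     types = TaskDatasetTypes.fromTaskDef(taskDef, registry=butler.registry)
#     sorted(types.inputs.names), sorted(types.outputs.names)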
900@dataclass(frozen=True)
901class PipelineDatasetTypes:
902 """An immutable struct that classifies the dataset types used in a
903 `Pipeline`.
904 """
906 packagesDatasetName: ClassVar[str] = "packages"
907 """Name of a dataset type used to save package versions.
908 """
910 initInputs: NamedValueSet[DatasetType]
911 """Dataset types that are needed as inputs in order to construct the Tasks
912 in this Pipeline.
914 This does not include dataset types that are produced when constructing
915 other Tasks in the Pipeline (these are classified as `initIntermediates`).
916 """
918 initOutputs: NamedValueSet[DatasetType]
919 """Dataset types that may be written after constructing the Tasks in this
920 Pipeline.
922 This does not include dataset types that are also used as inputs when
923 constructing other Tasks in the Pipeline (these are classified as
924 `initIntermediates`).
925 """
927 initIntermediates: NamedValueSet[DatasetType]
928 """Dataset types that are both used when constructing one or more Tasks
929 in the Pipeline and produced as a side-effect of constructing another
930 Task in the Pipeline.
931 """
933 inputs: NamedValueSet[DatasetType]
934 """Dataset types that are regular inputs for the full pipeline.
936 If an input dataset needed for a Quantum cannot be found in the input
937 collection(s), that Quantum (and all dependent Quanta) will not be
938 produced.
939 """
941 prerequisites: NamedValueSet[DatasetType]
942 """Dataset types that are prerequisite inputs for the full Pipeline.
944 Prerequisite inputs must exist in the input collection(s) before the
945 pipeline is run, but do not constrain the graph - if a prerequisite is
946 missing for a Quantum, `PrerequisiteMissingError` is raised.
948 Prerequisite inputs are not resolved until the second stage of
949 QuantumGraph generation.
950 """
952 intermediates: NamedValueSet[DatasetType]
953 """Dataset types that are output by one Task in the Pipeline and consumed
954 as inputs by one or more other Tasks in the Pipeline.
955 """
957 outputs: NamedValueSet[DatasetType]
958 """Dataset types that are output by a Task in the Pipeline and not consumed
959 by any other Task in the Pipeline.
960 """
962 byTask: Mapping[str, TaskDatasetTypes]
963 """Per-Task dataset types, keyed by label in the `Pipeline`.
965 This is guaranteed to be zip-iterable with the `Pipeline` itself (assuming
966 neither has been modified since the dataset types were extracted, of
967 course).
968 """
970 @classmethod
971 def fromPipeline(
972 cls,
973 pipeline: Union[Pipeline, Iterable[TaskDef]],
974 *,
975 registry: Registry,
976 include_configs: bool = True,
977 include_packages: bool = True,
978 ) -> PipelineDatasetTypes:
979 """Extract and classify the dataset types from all tasks in a
980 `Pipeline`.
982 Parameters
983 ----------
984 pipeline: `Pipeline` or `Iterable` [ `TaskDef` ]
985 A collection of tasks that can be run together.
986 registry: `Registry`
987 Registry used to construct normalized `DatasetType` objects and
988 retrieve those that are incomplete.
989 include_configs : `bool`, optional
990 If `True` (default) include config dataset types as
991 ``initOutputs``.
992 include_packages : `bool`, optional
993 If `True` (default) include the dataset type for software package
994 versions in ``initOutputs``.
996 Returns
997 -------
998 types: `PipelineDatasetTypes`
999 The dataset types used by this `Pipeline`.
1001 Raises
1002 ------
1003 ValueError
1004 Raised if Tasks are inconsistent about which datasets are marked
1005 prerequisite. This indicates that the Tasks cannot be run as part
1006 of the same `Pipeline`.
1007 """
1008 allInputs = NamedValueSet()
1009 allOutputs = NamedValueSet()
1010 allInitInputs = NamedValueSet()
1011 allInitOutputs = NamedValueSet()
1012 prerequisites = NamedValueSet()
1013 byTask = dict()
1014 if include_packages:
1015 allInitOutputs.add(
1016 DatasetType(
1017 cls.packagesDatasetName,
1018 registry.dimensions.empty,
1019 storageClass="Packages",
1020 )
1021 )
1022 # create a list of TaskDefs in case the input is a generator
1023 pipeline = list(pipeline)
1025 # collect all the output dataset types
1026 typeStorageclassMap: Dict[str, str] = {}
1027 for taskDef in pipeline:
1028 for outConnection in iterConnections(taskDef.connections, 'outputs'):
1029 typeStorageclassMap[outConnection.name] = outConnection.storageClass
1031 for taskDef in pipeline:
1032 thisTask = TaskDatasetTypes.fromTaskDef(
1033 taskDef,
1034 registry=registry,
1035 include_configs=include_configs,
1036 storage_class_mapping=typeStorageclassMap
1037 )
1038 allInitInputs |= thisTask.initInputs
1039 allInitOutputs |= thisTask.initOutputs
1040 allInputs |= thisTask.inputs
1041 prerequisites |= thisTask.prerequisites
1042 allOutputs |= thisTask.outputs
1043 byTask[taskDef.label] = thisTask
1044 if not prerequisites.isdisjoint(allInputs):
1045 raise ValueError("{} marked as both prerequisites and regular inputs".format(
1046 {dt.name for dt in allInputs & prerequisites}
1047 ))
1048 if not prerequisites.isdisjoint(allOutputs):
1049 raise ValueError("{} marked as both prerequisites and outputs".format(
1050 {dt.name for dt in allOutputs & prerequisites}
1051 ))
1052 # Make sure that components which are marked as inputs get treated as
1053 # intermediates if there is an output which produces the composite
1054 # containing the component
1055 intermediateComponents = NamedValueSet()
1056 intermediateComposites = NamedValueSet()
1057 outputNameMapping = {dsType.name: dsType for dsType in allOutputs}
1058 for dsType in allInputs:
1059 # get the name of a possible component
1060 name, component = dsType.nameAndComponent()
1061 # if there is a component name, that means this is a component
1062 # DatasetType; if there is an output which produces the parent of
1063 # this component, treat this input as an intermediate
1064 if component is not None:
1065 # This needs to be in this if block, because someone might have
1066 # a composite that is a pure input from existing data
1067 if name in outputNameMapping:
1068 intermediateComponents.add(dsType)
1069 intermediateComposites.add(outputNameMapping[name])
1071 def checkConsistency(a: NamedValueSet, b: NamedValueSet):
1072 common = a.names & b.names
1073 for name in common:
1074 if a[name] != b[name]:
1075 raise ValueError(f"Conflicting definitions for dataset type: {a[name]} != {b[name]}.")
1077 checkConsistency(allInitInputs, allInitOutputs)
1078 checkConsistency(allInputs, allOutputs)
1079 checkConsistency(allInputs, intermediateComposites)
1080 checkConsistency(allOutputs, intermediateComposites)
1082 def frozen(s: NamedValueSet) -> NamedValueSet:
1083 s.freeze()
1084 return s
1086 return cls(
1087 initInputs=frozen(allInitInputs - allInitOutputs),
1088 initIntermediates=frozen(allInitInputs & allInitOutputs),
1089 initOutputs=frozen(allInitOutputs - allInitInputs),
1090 inputs=frozen(allInputs - allOutputs - intermediateComponents),
1091 intermediates=frozen(allInputs & allOutputs | intermediateComponents),
1092 outputs=frozen(allOutputs - allInputs - intermediateComposites),
1093 prerequisites=frozen(prerequisites),
1094 byTask=MappingProxyType(byTask), # MappingProxyType -> frozen view of dict for immutability
1095 )
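# A minimal usage sketch (hypothetical task label; ``butler`` as in the sketch
# after TaskDatasetTypes.fromTaskDef above):
#
#     dataset_types = PipelineDatasetTypes.fromPipeline(pipeline, registry=butler.registry)
#     dataset_types.inputs         # overall inputs of the pipeline
#     dataset_types.intermediates  # produced and consumed internally
#     dataset_types.byTask["myTask"].outputs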
1097 @classmethod
1098 def initOutputNames(cls, pipeline: Union[Pipeline, Iterable[TaskDef]], *,
1099 include_configs: bool = True, include_packages: bool = True) -> Iterator[str]:
1100 """Return the names of dataset types ot task initOutputs, Configs,
1101 and package versions for a pipeline.
1103 Parameters
1104 ----------
1105 pipeline: `Pipeline` or `Iterable` [ `TaskDef` ]
1106 A `Pipeline` instance or collection of `TaskDef` instances.
1107 include_configs : `bool`, optional
1108 If `True` (default) include config dataset types.
1109 include_packages : `bool`, optional
1110 If `True` (default) include the dataset type for package versions.
1112 Yields
1113 ------
1114 datasetTypeName : `str`
1115 Name of the dataset type.
1116 """
1117 if include_packages:
1118 # Package versions dataset type
1119 yield cls.packagesDatasetName
1121 if isinstance(pipeline, Pipeline):
1122 pipeline = pipeline.toExpandedPipeline()
1124 for taskDef in pipeline:
1126 # all task InitOutputs
1127 for name in taskDef.connections.initOutputs:
1128 attribute = getattr(taskDef.connections, name)
1129 yield attribute.name
1131 # config dataset name
1132 if include_configs:
1133 yield taskDef.configDatasetName
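# A minimal usage sketch: collect every init-output dataset type name for a
# pipeline, including per-task config datasets and the package-versions dataset.
#
#     init_output_names = list(PipelineDatasetTypes.initOutputNames(pipeline))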