Coverage for python/lsst/pipe/base/pipeline.py: 19%
1# This file is part of pipe_base.
2#
3# Developed for the LSST Data Management System.
4# This product includes software developed by the LSST Project
5# (http://www.lsst.org).
6# See the COPYRIGHT file at the top-level directory of this distribution
7# for details of code ownership.
8#
9# This program is free software: you can redistribute it and/or modify
10# it under the terms of the GNU General Public License as published by
11# the Free Software Foundation, either version 3 of the License, or
12# (at your option) any later version.
13#
14# This program is distributed in the hope that it will be useful,
15# but WITHOUT ANY WARRANTY; without even the implied warranty of
16# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
17# GNU General Public License for more details.
18#
19# You should have received a copy of the GNU General Public License
20# along with this program. If not, see <http://www.gnu.org/licenses/>.
21from __future__ import annotations
23"""Module defining Pipeline class and related methods.
24"""
26__all__ = ["Pipeline", "TaskDef", "TaskDatasetTypes", "PipelineDatasetTypes", "LabelSpecifier"]
28# -------------------------------
29# Imports of standard modules --
30# -------------------------------
31from dataclasses import dataclass
32import logging
33from types import MappingProxyType
34from typing import (ClassVar, Dict, Iterable, Iterator, Mapping, Set, Union,
35 Generator, TYPE_CHECKING, Optional, Tuple)
37import copy
38import re
39import os
40import urllib.parse
41import warnings
43# -----------------------------
44# Imports for other modules --
45from lsst.daf.butler import DatasetType, NamedValueSet, Registry, SkyPixDimension, ButlerURI
46from lsst.utils import doImport
47from .configOverrides import ConfigOverrides
48from .connections import iterConnections
49from .pipelineTask import PipelineTask
50from .task import _TASK_METADATA_TYPE
51from ._task_metadata import TaskMetadata
53from . import pipelineIR
54from . import pipeTools
56if TYPE_CHECKING: # Imports needed only for type annotations; may be circular.
57 from lsst.obs.base import Instrument
59# ----------------------------------
60# Local non-exported definitions --
61# ----------------------------------
63_LOG = logging.getLogger(__name__)
65# ------------------------
66# Exported definitions --
67# ------------------------
70@dataclass
71class LabelSpecifier:
72 """A structure to specify a subset of labels to load
74 This structure may contain a set of labels to be used in subsetting a
75 pipeline, or a beginning and end point. Beginning or end may be empty,
76 in which case the range will be a half open interval. Unlike python
77 iteration bounds, end bounds are *INCLUDED*. Note that range based
78 selection is not well defined for pipelines that are not linear in nature,
79 and correct behavior is not guaranteed, or may vary from run to run.
80 """
81 labels: Optional[Set[str]] = None
82 begin: Optional[str] = None
83 end: Optional[str] = None
85 def __post_init__(self):
86 if self.labels is not None and (self.begin or self.end):
87 raise ValueError("This struct can only be initialized with a labels set or "
88 "a begin (and/or) end specifier")
91class TaskDef:
92 """TaskDef is a collection of information about task needed by Pipeline.
94 The information includes task name, configuration object and optional
95 task class. This class is just a collection of attributes and it exposes
96 all of them so that attributes could potentially be modified in place
97 (e.g. if configuration needs extra overrides).
99 Attributes
100 ----------
101 taskName : `str`, optional
102 `PipelineTask` class name, currently it is not specified whether this
103 is a fully-qualified name or partial name (e.g. ``module.TaskClass``).
104 Framework should be prepared to handle all cases. If not provided,
105 ``taskClass`` must be, and ``taskClass.__name__`` is used.
106 config : `lsst.pex.config.Config`, optional
107 Instance of the configuration class corresponding to this task class,
108 usually with all overrides applied. This config will be frozen. If
109 not provided, ``taskClass`` must be provided and
110 ``taskClass.ConfigClass()`` will be used.
111 taskClass : `type`, optional
112 `PipelineTask` class object, can be ``None``. If ``None`` then
113 framework will have to locate and load class.
114 label : `str`, optional
115 Task label, usually a short string unique in a pipeline. If not
116 provided, ``taskClass`` must be, and ``taskClass._DefaultName`` will
117 be used.
118 """
119 def __init__(self, taskName=None, config=None, taskClass=None, label=None):
120 if taskName is None:
121 if taskClass is None:
122 raise ValueError("At least one of `taskName` and `taskClass` must be provided.")
123 taskName = taskClass.__name__
124 if config is None:
125 if taskClass is None:
126 raise ValueError("`taskClass` must be provided if `config` is not.")
127 config = taskClass.ConfigClass()
128 if label is None:
129 if taskClass is None:
130 raise ValueError("`taskClass` must be provided if `label` is not.")
131 label = taskClass._DefaultName
132 self.taskName = taskName
133 config.freeze()
134 self.config = config
135 self.taskClass = taskClass
136 self.label = label
137 self.connections = config.connections.ConnectionsClass(config=config)
139 @property
140 def configDatasetName(self) -> str:
141 """Name of a dataset type for configuration of this task (`str`)
142 """
143 return self.label + "_config"
145 @property
146 def metadataDatasetName(self) -> Optional[str]:
147 """Name of a dataset type for metadata of this task, `None` if
148 metadata is not to be saved (`str`)
149 """
150 if self.config.saveMetadata:
151 return self.label + "_metadata"
152 else:
153 return None
155 @property
156 def logOutputDatasetName(self) -> Optional[str]:
157 """Name of a dataset type for log output from this task, `None` if
158 logs are not to be saved (`str`)
159 """
160 if self.config.saveLogOutput:
161 return self.label + "_log"
162 else:
163 return None
165 def __str__(self):
166 rep = "TaskDef(" + self.taskName
167 if self.label:
168 rep += ", label=" + self.label
169 rep += ")"
170 return rep
172 def __eq__(self, other: object) -> bool:
173 if not isinstance(other, TaskDef):
174 return False
175 # This does not consider equality of configs when determining equality
176 # as config equality is a difficult thing to define. Should be updated
177 # after DM-27847
178 return self.taskClass == other.taskClass and self.label == other.label
180 def __hash__(self):
181 return hash((self.taskClass, self.label))
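# Minimal construction sketch (ExampleTask is a hypothetical PipelineTask
# subclass, not part of this module); taskName, config and label all fall
# back to attributes of the task class, and the config is frozen:
#
#     taskDef = TaskDef(taskClass=ExampleTask)
#     assert taskDef.label == ExampleTask._DefaultName
#     assert taskDef.configDatasetName == taskDef.label + "_config"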
184class Pipeline:
185 """A `Pipeline` is a representation of a series of tasks to run, and the
186 configuration for those tasks.
188 Parameters
189 ----------
190 description : `str`
191 A description of what this pipeline does.
192 """
193 def __init__(self, description: str):
194 pipeline_dict = {"description": description, "tasks": {}}
195 self._pipelineIR = pipelineIR.PipelineIR(pipeline_dict)
197 @classmethod
198 def fromFile(cls, filename: str) -> Pipeline:
199 """Load a pipeline defined in a pipeline yaml file.
201 Parameters
202 ----------
203 filename: `str`
204 A path that points to a pipeline defined in yaml format. This
205 filename may also supply additional labels to be used in
206 subsetting the loaded Pipeline. These labels are separated from
207 the path by a \\#, and may be specified as a comma separated
208 list, or a range denoted as beginning..end. Beginning or end may
209 be empty, in which case the range will be a half open interval.
210 Unlike python iteration bounds, end bounds are *INCLUDED*. Note
211 that range based selection is not well defined for pipelines that
212 are not linear in nature, and correct behavior is not guaranteed,
213 or may vary from run to run.
215 Returns
216 -------
217 pipeline: `Pipeline`
218 The pipeline loaded from specified location with appropriate (if
219 any) subsetting
221 Notes
222 -----
223 This method attempts to prune any contracts that contain labels which
224 are not in the declared subset of labels. This pruning is done using
225 string-based matching due to the nature of contracts and may prune more
226 than it should.
227 """
228 return cls.from_uri(filename)
230 @classmethod
231 def from_uri(cls, uri: Union[str, ButlerURI]) -> Pipeline:
232 """Load a pipeline defined in a pipeline yaml file at a location
233 specified by a URI.
235 Parameters
236 ----------
237 uri: `str` or `ButlerURI`
238 If a string is supplied this should be a URI path that points to a
239 pipeline defined in yaml format. This uri may also supply
240 additional labels to be used in subsetting the loaded Pipeline.
241 These labels are separated from the path by a \\#, and may be
242 specified as a comma separated list, or a range denoted as
243 beginning..end. Beginning or end may be empty, in which case the
244 range will be a half open interval. Unlike python iteration
245 bounds, end bounds are *INCLUDED*. Note that range based selection
246 is not well defined for pipelines that are not linear in nature,
247 and correct behavior is not guaranteed, or may vary from run to
248 run. The same specifiers can be used with a ButlerURI object, by
249 being the sole contents of the fragment attribute.
251 Returns
252 -------
253 pipeline: `Pipeline`
254 The pipeline loaded from specified location with appropriate (if
255 any) subsetting
257 Notes
258 -----
259 This method attempts to prune any contracts that contain labels which
260 are not in the declared subset of labels. This pruning is done using
261 string-based matching due to the nature of contracts and may prune more
262 than it should.
263 """
264 # Split up the uri and any labels that were supplied
265 uri, label_specifier = cls._parse_file_specifier(uri)
266 pipeline: Pipeline = cls.fromIR(pipelineIR.PipelineIR.from_uri(uri))
268 # If there are labels supplied, only keep those
269 if label_specifier is not None:
270 pipeline = pipeline.subsetFromLabels(label_specifier)
271 return pipeline
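    # Illustrative sketch of the URI fragment syntax described above (file
    # paths and labels are hypothetical):
    #
    #     Pipeline.from_uri("pipelines/DRP.yaml")                  # whole pipeline
    #     Pipeline.from_uri("pipelines/DRP.yaml#isr,calibrate")    # label list
    #     Pipeline.from_uri("pipelines/DRP.yaml#isr..calibrate")   # inclusive range
    #     Pipeline.from_uri("pipelines/DRP.yaml#..calibrate")      # open-ended range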
273 def subsetFromLabels(self, labelSpecifier: LabelSpecifier) -> Pipeline:
274 """Subset a pipeline to contain only labels specified in labelSpecifier
276 Parameters
277 ----------
278 labelSpecifier : `LabelSpecifier`
279 Object containing labels that describe how to subset a pipeline.
281 Returns
282 -------
283 pipeline : `Pipeline`
284 A new pipeline object that is a subset of the old pipeline
286 Raises
287 ------
288 ValueError
289 Raised if there is an issue with specified labels
291 Notes
292 -----
293 This method attempts to prune any contracts that contain labels which
294 are not in the declared subset of labels. This pruning is done using
295 string-based matching due to the nature of contracts and may prune more
296 than it should.
297 """
298 # Labels supplied as a set
299 if labelSpecifier.labels:
300 labelSet = labelSpecifier.labels
301 # Labels supplied as a range, first create a list of all the labels
302 # in the pipeline sorted according to task dependency. Then only
303 # keep labels that lie between the supplied bounds
304 else:
305 # Create a copy of the pipeline to use when assessing the label
306 # ordering. Use a dict for fast searching while preserving order.
307 # Remove contracts so they do not fail in the expansion step. This
308 # is needed because a user may only configure the tasks they intend
309 # to run, which may cause some contracts to fail if they will later
310 # be dropped
311 pipeline = copy.deepcopy(self)
312 pipeline._pipelineIR.contracts = []
313 labels = {taskdef.label: True for taskdef in pipeline.toExpandedPipeline()}
315 # Verify the bounds are in the labels
316 if labelSpecifier.begin is not None:
317 if labelSpecifier.begin not in labels:
318 raise ValueError(f"Beginning of range subset, {labelSpecifier.begin}, not found in "
319 "pipeline definition")
320 if labelSpecifier.end is not None:
321 if labelSpecifier.end not in labels:
322 raise ValueError(f"End of range subset, {labelSpecifier.end}, not found in pipeline "
323 "definition")
325 labelSet = set()
326 for label in labels:
327 if labelSpecifier.begin is not None:
328 if label != labelSpecifier.begin:
329 continue
330 else:
331 labelSpecifier.begin = None
332 labelSet.add(label)
333 if labelSpecifier.end is not None and label == labelSpecifier.end:
334 break
335 return Pipeline.fromIR(self._pipelineIR.subset_from_labels(labelSet))
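    # Illustrative sketch (labels are hypothetical): keep only the tasks from
    # the start of the pipeline up to and including "calibrate", remembering
    # that matching contracts may also be pruned:
    #
    #     subset = pipeline.subsetFromLabels(LabelSpecifier(end="calibrate"))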
337 @staticmethod
338 def _parse_file_specifier(uri: Union[str, ButlerURI]
339 ) -> Tuple[ButlerURI, Optional[LabelSpecifier]]:
340 """Split appart a uri and any possible label subsets
341 """
342 if isinstance(uri, str):
343 # This is to support legacy pipelines during transition
344 uri, num_replace = re.subn("[:](?!\\/\\/)", "#", uri)
345 if num_replace:
346 warnings.warn(f"The pipeline file {uri} seems to use the legacy : to separate "
347 "labels; this is deprecated and will be removed after June 2021. Please use "
348 "# instead.",
349 category=FutureWarning)
350 if uri.count("#") > 1:
351 raise ValueError("Only one set of labels is allowed when specifying a pipeline to load")
352 uri = ButlerURI(uri)
353 label_subset = uri.fragment or None
355 specifier: Optional[LabelSpecifier]
356 if label_subset is not None:
357 label_subset = urllib.parse.unquote(label_subset)
358 args: Dict[str, Union[Set[str], str, None]]
359 # labels supplied as a list
360 if ',' in label_subset:
361 if '..' in label_subset:
362 raise ValueError("Can only specify a list of labels or a range"
363 "when loading a Pipline not both")
364 args = {"labels": set(label_subset.split(","))}
365 # labels supplied as a range
366 elif '..' in label_subset:
367 # Try to de-structure the labelSubset, this will fail if more
368 # than one range is specified
369 begin, end, *rest = label_subset.split("..")
370 if rest:
371 raise ValueError("Only one range can be specified when loading a pipeline")
372 args = {"begin": begin if begin else None, "end": end if end else None}
373 # Assume anything else is a single label
374 else:
375 args = {"labels": {label_subset}}
377 specifier = LabelSpecifier(**args)
378 else:
379 specifier = None
381 return uri, specifier
383 @classmethod
384 def fromString(cls, pipeline_string: str) -> Pipeline:
385 """Create a pipeline from string formatted as a pipeline document.
387 Parameters
388 ----------
389 pipeline_string : `str`
390 A string formatted like a pipeline document.
392 Returns
393 -------
394 pipeline: `Pipeline`
395 """
396 pipeline = cls.fromIR(pipelineIR.PipelineIR.from_string(pipeline_string))
397 return pipeline
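    # Illustrative sketch, assuming the usual pipeline YAML layout; the task
    # class path is only an example and not defined in this module:
    #
    #     pipeline = Pipeline.fromString(
    #         "description: example\n"
    #         "tasks:\n"
    #         "  isr:\n"
    #         "    class: lsst.ip.isr.IsrTask\n"
    #     )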
399 @classmethod
400 def fromIR(cls, deserialized_pipeline: pipelineIR.PipelineIR) -> Pipeline:
401 """Create a pipeline from an already created `PipelineIR` object.
403 Parameters
404 ----------
405 deserialized_pipeline: `PipelineIR`
406 An already created pipeline intermediate representation object
408 Returns
409 -------
410 pipeline: `Pipeline`
411 """
412 pipeline = cls.__new__(cls)
413 pipeline._pipelineIR = deserialized_pipeline
414 return pipeline
416 @classmethod
417 def fromPipeline(cls, pipeline: Pipeline) -> Pipeline:
418 """Create a new pipeline by copying an already existing `Pipeline`.
420 Parameters
421 ----------
422 pipeline: `Pipeline`
423 An already created `Pipeline` object to copy.
425 Returns
426 -------
427 pipeline: `Pipeline`
428 """
429 return cls.fromIR(copy.deepcopy(pipeline._pipelineIR))
431 def __str__(self) -> str:
432 # tasks need to be re-sorted on each call because someone might have
433 # added or removed a task, and caching the result does not seem worth
434 # the small overhead
435 labels = [td.label for td in self._toExpandedPipelineImpl(checkContracts=False)]
436 self._pipelineIR.reorder_tasks(labels)
437 return str(self._pipelineIR)
439 def addInstrument(self, instrument: Union[Instrument, str]) -> None:
440 """Add an instrument to the pipeline, or replace an instrument that is
441 already defined.
443 Parameters
444 ----------
445 instrument : `~lsst.obs.base.Instrument` or `str`
446 Either a derived class object of `lsst.obs.base.Instrument` or
447 a string corresponding to a fully qualified
448 `lsst.obs.base.Instrument` name.
449 """
450 if isinstance(instrument, str):
451 pass
452 else:
453 # TODO: assume that this is a subclass of Instrument, no type
454 # checking
455 instrument = f"{instrument.__module__}.{instrument.__qualname__}"
456 self._pipelineIR.instrument = instrument
458 def getInstrument(self) -> Instrument:
459 """Get the instrument from the pipeline.
461 Returns
462 -------
463 instrument : `~lsst.obs.base.Instrument`, `str`, or `None`
464 A derived class object of `lsst.obs.base.Instrument`, a string
465 corresponding to a fully qualified `lsst.obs.base.Instrument`
466 name, or `None` if the pipeline does not have an instrument.
467 """
468 return self._pipelineIR.instrument
470 def addTask(self, task: Union[PipelineTask, str], label: str) -> None:
471 """Add a new task to the pipeline, or replace a task that is already
472 associated with the supplied label.
474 Parameters
475 ----------
476 task: `PipelineTask` or `str`
477 Either a derived class object of a `PipelineTask` or a string
478 corresponding to a fully qualified `PipelineTask` name.
479 label: `str`
480 A label that is used to identify the `PipelineTask` being added
481 """
482 if isinstance(task, str):
483 taskName = task
484 elif issubclass(task, PipelineTask):
485 taskName = f"{task.__module__}.{task.__qualname__}"
486 else:
487 raise ValueError("task must be either a child class of PipelineTask or a string containing"
488 " a fully qualified name to one")
489 if not label:
490 # in some cases (with a command line-generated pipeline) tasks can
491 # be defined without a label, which is not acceptable; use the task's
492 # _DefaultName in that case
493 if isinstance(task, str):
494 task = doImport(task)
495 label = task._DefaultName
496 self._pipelineIR.tasks[label] = pipelineIR.TaskIR(label, taskName)
498 def removeTask(self, label: str) -> None:
499 """Remove a task from the pipeline.
501 Parameters
502 ----------
503 label : `str`
504 The label used to identify the task that is to be removed
506 Raises
507 ------
508 KeyError
509 If no task with that label exists in the pipeline
511 """
512 self._pipelineIR.tasks.pop(label)
514 def addConfigOverride(self, label: str, key: str, value: object) -> None:
515 """Apply single config override.
517 Parameters
518 ----------
519 label : `str`
520 Label of the task.
521 key: `str`
522 Fully-qualified field name.
523 value : object
524 Value to be given to a field.
525 """
526 self._addConfigImpl(label, pipelineIR.ConfigIR(rest={key: value}))
528 def addConfigFile(self, label: str, filename: str) -> None:
529 """Add overrides from a specified file.
531 Parameters
532 ----------
533 label : `str`
534 The label used to identify the task associated with config to
535 modify
536 filename : `str`
537 Path to the override file.
538 """
539 self._addConfigImpl(label, pipelineIR.ConfigIR(file=[filename]))
541 def addConfigPython(self, label: str, pythonString: str) -> None:
542 """Add Overrides by running a snippet of python code against a config.
544 Parameters
545 ----------
546 label : `str`
547 The label used to identify the task associated with the config to
548 modify.
549 pythonString: `str`
550 A string of valid python code to be executed. This is done
551 with ``config`` as the only locally accessible value.
552 """
553 self._addConfigImpl(label, pipelineIR.ConfigIR(python=pythonString))
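    # Illustrative sketch of programmatic pipeline construction using the
    # methods above (task class, label, field name and file path are all
    # hypothetical):
    #
    #     pipeline = Pipeline("An example pipeline")
    #     pipeline.addTask("mypackage.tasks.ExampleTask", "example")
    #     pipeline.addConfigOverride("example", "someField", 42)
    #     pipeline.addConfigFile("example", "example_overrides.py")
    #     pipeline.addConfigPython("example", "config.someField = 42")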
555 def _addConfigImpl(self, label: str, newConfig: pipelineIR.ConfigIR) -> None:
556 if label == "parameters":
557 if newConfig.rest.keys() - self._pipelineIR.parameters.mapping.keys():
558 raise ValueError("Cannot override parameters that are not defined in pipeline")
559 self._pipelineIR.parameters.mapping.update(newConfig.rest)
560 if newConfig.file:
561 raise ValueError("Setting parameters section with config file is not supported")
562 if newConfig.python:
563 raise ValueError("Setting parameters section using python block in unsupported")
564 return
565 if label not in self._pipelineIR.tasks:
566 raise LookupError(f"There are no tasks labeled '{label}' in the pipeline")
567 self._pipelineIR.tasks[label].add_or_update_config(newConfig)
569 def toFile(self, filename: str) -> None:
570 self._pipelineIR.to_file(filename)
572 def write_to_uri(self, uri: Union[str, ButlerURI]) -> None:
573 # tasks need to be re-sorted on each call because someone might have
574 # added or removed a task, and caching the result does not seem worth
575 # the small overhead
576 labels = [td.label for td in self._toExpandedPipelineImpl(checkContracts=False)]
577 self._pipelineIR.reorder_tasks(labels)
578 self._pipelineIR.write_to_uri(uri)
580 def toExpandedPipeline(self) -> Generator[TaskDef, None, None]:
581 """Returns a generator of TaskDefs which can be used to create quantum
582 graphs.
584 Returns
585 -------
586 generator : generator of `TaskDef`
587 The generator returned will be the sorted iterator of tasks which
588 are to be used in constructing a quantum graph.
590 Raises
591 ------
592 NotImplementedError
593 If a dataId is supplied in a config block. This is in place for
594 future use
595 """
596 yield from self._toExpandedPipelineImpl()
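    # Illustrative sketch: iterating a Pipeline is equivalent to calling
    # toExpandedPipeline(), yielding TaskDefs in dependency-sorted order:
    #
    #     for taskDef in pipeline:
    #         print(taskDef.label, taskDef.taskName)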
598 def _toExpandedPipelineImpl(self, checkContracts=True) -> Iterable[TaskDef]:
599 taskDefs = []
600 for label in self._pipelineIR.tasks:
601 taskDefs.append(self._buildTaskDef(label))
603 # let's evaluate the contracts
604 if self._pipelineIR.contracts is not None:
605 label_to_config = {x.label: x.config for x in taskDefs}
606 for contract in self._pipelineIR.contracts:
607 # execute this in its own line so it can raise a good error
608 # message if there were problems with the eval
609 success = eval(contract.contract, None, label_to_config)
610 if not success:
611 extra_info = f": {contract.msg}" if contract.msg is not None else ""
612 raise pipelineIR.ContractError(f"Contract(s) '{contract.contract}' were not "
613 f"satisfied{extra_info}")
615 taskDefs = sorted(taskDefs, key=lambda x: x.label)
616 yield from pipeTools.orderPipeline(taskDefs)
618 def _buildTaskDef(self, label: str) -> TaskDef:
619 if (taskIR := self._pipelineIR.tasks.get(label)) is None:
620 raise NameError(f"Label {label} does not appear in this pipeline")
621 taskClass = doImport(taskIR.klass)
622 taskName = taskClass.__qualname__
623 config = taskClass.ConfigClass()
624 overrides = ConfigOverrides()
625 if self._pipelineIR.instrument is not None:
626 overrides.addInstrumentOverride(self._pipelineIR.instrument, taskClass._DefaultName)
627 if taskIR.config is not None:
628 for configIR in (configIr.formatted(self._pipelineIR.parameters)
629 for configIr in taskIR.config):
630 if configIR.dataId is not None:
631 raise NotImplementedError("Specializing a config on a partial data id is not yet "
632 "supported in Pipeline definition")
633 # only apply override if it applies to everything
634 if configIR.dataId is None:
635 if configIR.file:
636 for configFile in configIR.file:
637 overrides.addFileOverride(os.path.expandvars(configFile))
638 if configIR.python is not None:
639 overrides.addPythonOverride(configIR.python)
640 for key, value in configIR.rest.items():
641 overrides.addValueOverride(key, value)
642 overrides.applyTo(config)
643 # This may need to be revisited
644 try:
645 config.validate()
646 except Exception:
647 _LOG.error("Configuration validation failed for task %s (%s)", label, taskName)
648 raise
649 return TaskDef(taskName=taskName, config=config, taskClass=taskClass, label=label)
651 def __iter__(self) -> Generator[TaskDef, None, None]:
652 return self.toExpandedPipeline()
654 def __getitem__(self, item: str) -> TaskDef:
655 return self._buildTaskDef(item)
657 def __len__(self):
658 return len(self._pipelineIR.tasks)
660 def __eq__(self, other: object):
661 if not isinstance(other, Pipeline):
662 return False
663 return self._pipelineIR == other._pipelineIR
666@dataclass(frozen=True)
667class TaskDatasetTypes:
668 """An immutable struct that extracts and classifies the dataset types used
669 by a `PipelineTask`
670 """
672 initInputs: NamedValueSet[DatasetType]
673 """Dataset types that are needed as inputs in order to construct this Task.
675 Task-level `initInputs` may be classified as either
676 `~PipelineDatasetTypes.initInputs` or
677 `~PipelineDatasetTypes.initIntermediates` at the Pipeline level.
678 """
680 initOutputs: NamedValueSet[DatasetType]
681 """Dataset types that may be written after constructing this Task.
683 Task-level `initOutputs` may be classified as either
684 `~PipelineDatasetTypes.initOutputs` or
685 `~PipelineDatasetTypes.initIntermediates` at the Pipeline level.
686 """
688 inputs: NamedValueSet[DatasetType]
689 """Dataset types that are regular inputs to this Task.
691 If an input dataset needed for a Quantum cannot be found in the input
692 collection(s) or produced by another Task in the Pipeline, that Quantum
693 (and all dependent Quanta) will not be produced.
695 Task-level `inputs` may be classified as either
696 `~PipelineDatasetTypes.inputs` or `~PipelineDatasetTypes.intermediates`
697 at the Pipeline level.
698 """
700 prerequisites: NamedValueSet[DatasetType]
701 """Dataset types that are prerequisite inputs to this Task.
703 Prerequisite inputs must exist in the input collection(s) before the
704 pipeline is run, but do not constrain the graph - if a prerequisite is
705 missing for a Quantum, `PrerequisiteMissingError` is raised.
707 Prerequisite inputs are not resolved until the second stage of
708 QuantumGraph generation.
709 """
711 outputs: NamedValueSet[DatasetType]
712 """Dataset types that are produced by this Task.
714 Task-level `outputs` may be classified as either
715 `~PipelineDatasetTypes.outputs` or `~PipelineDatasetTypes.intermediates`
716 at the Pipeline level.
717 """
719 @classmethod
720 def fromTaskDef(
721 cls,
722 taskDef: TaskDef,
723 *,
724 registry: Registry,
725 include_configs: bool = True,
726 storage_class_mapping: Optional[Mapping[str, str]] = None
727 ) -> TaskDatasetTypes:
728 """Extract and classify the dataset types from a single `PipelineTask`.
730 Parameters
731 ----------
732 taskDef: `TaskDef`
733 An instance of a `TaskDef` class for a particular `PipelineTask`.
734 registry: `Registry`
735 Registry used to construct normalized `DatasetType` objects and
736 retrieve those that are incomplete.
737 include_configs : `bool`, optional
738 If `True` (default) include config dataset types as
739 ``initOutputs``.
740 storage_class_mapping : `Mapping` of `str` to `StorageClass`, optional
741 If a taskdef contains a component dataset type that is unknown
742 to the registry, its parent StorageClass will be looked up in this
743 mapping if it is supplied. If the mapping does not contain the
744 composite dataset type, or the mapping is not supplied, an exception
745 will be raised.
747 Returns
748 -------
749 types: `TaskDatasetTypes`
750 The dataset types used by this task.
752 Raises
753 ------
754 ValueError
755 Raised if dataset type connection definition differs from
756 registry definition.
757 LookupError
758 Raised if component parent StorageClass could not be determined
759 and storage_class_mapping does not contain the composite type, or
760 is set to None.
761 """
762 def makeDatasetTypesSet(connectionType: str, freeze: bool = True) -> NamedValueSet[DatasetType]:
763 """Constructs a set of true `DatasetType` objects
765 Parameters
766 ----------
767 connectionType : `str`
768 Name of the connection type to produce a set for, corresponds
769 to an attribute of type `list` on the connection class instance
770 freeze : `bool`, optional
771 If `True`, call `NamedValueSet.freeze` on the object returned.
773 Returns
774 -------
775 datasetTypes : `NamedValueSet`
776 A set of all datasetTypes which correspond to the input
777 connection type specified in the connection class of this
778 `PipelineTask`
780 Raises
781 ------
782 ValueError
783 Raised if dataset type connection definition differs from
784 registry definition.
785 LookupError
786 Raised if component parent StorageClass could not be determined
787 and storage_class_mapping does not contain the composite type,
788 or is set to None.
790 Notes
791 -----
792 This function is a closure over the variables ``registry``,
793 ``taskDef``, and ``storage_class_mapping``.
794 """
795 datasetTypes = NamedValueSet()
796 for c in iterConnections(taskDef.connections, connectionType):
797 dimensions = set(getattr(c, 'dimensions', set()))
798 if "skypix" in dimensions:
799 try:
800 datasetType = registry.getDatasetType(c.name)
801 except LookupError as err:
802 raise LookupError(
803 f"DatasetType '{c.name}' referenced by "
804 f"{type(taskDef.connections).__name__} uses 'skypix' as a dimension "
805 f"placeholder, but does not already exist in the registry. "
806 f"Note that reference catalog names are now used as the dataset "
807 f"type name instead of 'ref_cat'."
808 ) from err
809 rest1 = set(registry.dimensions.extract(dimensions - set(["skypix"])).names)
810 rest2 = set(dim.name for dim in datasetType.dimensions
811 if not isinstance(dim, SkyPixDimension))
812 if rest1 != rest2:
813 raise ValueError(f"Non-skypix dimensions for dataset type {c.name} declared in "
814 f"connections ({rest1}) are inconsistent with those in "
815 f"registry's version of this dataset ({rest2}).")
816 else:
817 # Component dataset types are not explicitly in the
818 # registry. This complicates consistency checks with
819 # registry and requires we work out the composite storage
820 # class.
821 registryDatasetType = None
822 try:
823 registryDatasetType = registry.getDatasetType(c.name)
824 except KeyError:
825 compositeName, componentName = DatasetType.splitDatasetTypeName(c.name)
826 if componentName:
827 if storage_class_mapping is None or compositeName not in storage_class_mapping:
828 raise LookupError("Component parent class cannot be determined, and "
829 "composite name was not in storage class mapping, or no "
830 "storage_class_mapping was supplied")
831 else:
832 parentStorageClass = storage_class_mapping[compositeName]
833 else:
834 parentStorageClass = None
835 datasetType = c.makeDatasetType(
836 registry.dimensions,
837 parentStorageClass=parentStorageClass
838 )
839 registryDatasetType = datasetType
840 else:
841 datasetType = c.makeDatasetType(
842 registry.dimensions,
843 parentStorageClass=registryDatasetType.parentStorageClass
844 )
846 if registryDatasetType and datasetType != registryDatasetType:
847 try:
848 # Explicitly check for storage class just to make
849 # more specific message.
850 _ = datasetType.storageClass
851 except KeyError:
852 raise ValueError("Storage class does not exist for supplied dataset type "
853 f"{datasetType} for {taskDef.label}.") from None
854 raise ValueError(f"Supplied dataset type ({datasetType}) inconsistent with "
855 f"registry definition ({registryDatasetType}) "
856 f"for {taskDef.label}.")
857 datasetTypes.add(datasetType)
858 if freeze:
859 datasetTypes.freeze()
860 return datasetTypes
862 # optionally add initOutput dataset for config
863 initOutputs = makeDatasetTypesSet("initOutputs", freeze=False)
864 if include_configs:
865 initOutputs.add(
866 DatasetType(
867 taskDef.configDatasetName,
868 registry.dimensions.empty,
869 storageClass="Config",
870 )
871 )
872 initOutputs.freeze()
874 # optionally add output dataset for metadata
875 outputs = makeDatasetTypesSet("outputs", freeze=False)
876 if taskDef.metadataDatasetName is not None:
877 # Metadata is supposed to be of the PropertySet type; its
878 # dimensions correspond to a task quantum
879 dimensions = registry.dimensions.extract(taskDef.connections.dimensions)
880 if _TASK_METADATA_TYPE is TaskMetadata:
881 storageClass = "TaskMetadata"
882 else:
883 storageClass = "PropertySet"
884 outputs |= {DatasetType(taskDef.metadataDatasetName, dimensions, storageClass)}
885 if taskDef.logOutputDatasetName is not None:
886 # Log output dimensions correspond to a task quantum.
887 dimensions = registry.dimensions.extract(taskDef.connections.dimensions)
888 outputs |= {DatasetType(taskDef.logOutputDatasetName, dimensions, "ButlerLogRecords")}
890 outputs.freeze()
892 return cls(
893 initInputs=makeDatasetTypesSet("initInputs"),
894 initOutputs=initOutputs,
895 inputs=makeDatasetTypesSet("inputs"),
896 prerequisites=makeDatasetTypesSet("prerequisiteInputs"),
897 outputs=outputs,
898 )
901@dataclass(frozen=True)
902class PipelineDatasetTypes:
903 """An immutable struct that classifies the dataset types used in a
904 `Pipeline`.
905 """
907 packagesDatasetName: ClassVar[str] = "packages"
908 """Name of a dataset type used to save package versions.
909 """
911 initInputs: NamedValueSet[DatasetType]
912 """Dataset types that are needed as inputs in order to construct the Tasks
913 in this Pipeline.
915 This does not include dataset types that are produced when constructing
916 other Tasks in the Pipeline (these are classified as `initIntermediates`).
917 """
919 initOutputs: NamedValueSet[DatasetType]
920 """Dataset types that may be written after constructing the Tasks in this
921 Pipeline.
923 This does not include dataset types that are also used as inputs when
924 constructing other Tasks in the Pipeline (these are classified as
925 `initIntermediates`).
926 """
928 initIntermediates: NamedValueSet[DatasetType]
929 """Dataset types that are both used when constructing one or more Tasks
930 in the Pipeline and produced as a side-effect of constructing another
931 Task in the Pipeline.
932 """
934 inputs: NamedValueSet[DatasetType]
935 """Dataset types that are regular inputs for the full pipeline.
937 If an input dataset needed for a Quantum cannot be found in the input
938 collection(s), that Quantum (and all dependent Quanta) will not be
939 produced.
940 """
942 prerequisites: NamedValueSet[DatasetType]
943 """Dataset types that are prerequisite inputs for the full Pipeline.
945 Prerequisite inputs must exist in the input collection(s) before the
946 pipeline is run, but do not constrain the graph - if a prerequisite is
947 missing for a Quantum, `PrerequisiteMissingError` is raised.
949 Prerequisite inputs are not resolved until the second stage of
950 QuantumGraph generation.
951 """
953 intermediates: NamedValueSet[DatasetType]
954 """Dataset types that are output by one Task in the Pipeline and consumed
955 as inputs by one or more other Tasks in the Pipeline.
956 """
958 outputs: NamedValueSet[DatasetType]
959 """Dataset types that are output by a Task in the Pipeline and not consumed
960 by any other Task in the Pipeline.
961 """
963 byTask: Mapping[str, TaskDatasetTypes]
964 """Per-Task dataset types, keyed by label in the `Pipeline`.
966 This is guaranteed to be zip-iterable with the `Pipeline` itself (assuming
967 neither has been modified since the dataset types were extracted, of
968 course).
969 """
971 @classmethod
972 def fromPipeline(
973 cls,
974 pipeline: Union[Pipeline, Iterable[TaskDef]],
975 *,
976 registry: Registry,
977 include_configs: bool = True,
978 include_packages: bool = True,
979 ) -> PipelineDatasetTypes:
980 """Extract and classify the dataset types from all tasks in a
981 `Pipeline`.
983 Parameters
984 ----------
985 pipeline: `Pipeline` or `Iterable` [ `TaskDef` ]
986 A collection of tasks that can be run together.
987 registry: `Registry`
988 Registry used to construct normalized `DatasetType` objects and
989 retrieve those that are incomplete.
990 include_configs : `bool`, optional
991 If `True` (default) include config dataset types as
992 ``initOutputs``.
993 include_packages : `bool`, optional
994 If `True` (default) include the dataset type for software package
995 versions in ``initOutputs``.
997 Returns
998 -------
999 types: `PipelineDatasetTypes`
1000 The dataset types used by this `Pipeline`.
1002 Raises
1003 ------
1004 ValueError
1005 Raised if Tasks are inconsistent about which datasets are marked
1006 prerequisite. This indicates that the Tasks cannot be run as part
1007 of the same `Pipeline`.
1008 """
1009 allInputs = NamedValueSet()
1010 allOutputs = NamedValueSet()
1011 allInitInputs = NamedValueSet()
1012 allInitOutputs = NamedValueSet()
1013 prerequisites = NamedValueSet()
1014 byTask = dict()
1015 if include_packages:
1016 allInitOutputs.add(
1017 DatasetType(
1018 cls.packagesDatasetName,
1019 registry.dimensions.empty,
1020 storageClass="Packages",
1021 )
1022 )
1023 # create a list of TaskDefs in case the input is a generator
1024 pipeline = list(pipeline)
1026 # collect all the output dataset types
1027 typeStorageclassMap: Dict[str, str] = {}
1028 for taskDef in pipeline:
1029 for outConnection in iterConnections(taskDef.connections, 'outputs'):
1030 typeStorageclassMap[outConnection.name] = outConnection.storageClass
1032 for taskDef in pipeline:
1033 thisTask = TaskDatasetTypes.fromTaskDef(
1034 taskDef,
1035 registry=registry,
1036 include_configs=include_configs,
1037 storage_class_mapping=typeStorageclassMap
1038 )
1039 allInitInputs |= thisTask.initInputs
1040 allInitOutputs |= thisTask.initOutputs
1041 allInputs |= thisTask.inputs
1042 prerequisites |= thisTask.prerequisites
1043 allOutputs |= thisTask.outputs
1044 byTask[taskDef.label] = thisTask
1045 if not prerequisites.isdisjoint(allInputs):
1046 raise ValueError("{} marked as both prerequisites and regular inputs".format(
1047 {dt.name for dt in allInputs & prerequisites}
1048 ))
1049 if not prerequisites.isdisjoint(allOutputs):
1050 raise ValueError("{} marked as both prerequisites and outputs".format(
1051 {dt.name for dt in allOutputs & prerequisites}
1052 ))
1053 # Make sure that components which are marked as inputs get treated as
1054 # intermediates if there is an output which produces the composite
1055 # containing the component
1056 intermediateComponents = NamedValueSet()
1057 intermediateComposites = NamedValueSet()
1058 outputNameMapping = {dsType.name: dsType for dsType in allOutputs}
1059 for dsType in allInputs:
1060 # get the name of a possible component
1061 name, component = dsType.nameAndComponent()
1062 # if there is a component name, that means this is a component
1063 # DatasetType, if there is an output which produces the parent of
1064 # this component, treat this input as an intermediate
1065 if component is not None:
1066 # This needs to be in this if block, because someone might have
1067 # a composite that is a pure input from existing data
1068 if name in outputNameMapping:
1069 intermediateComponents.add(dsType)
1070 intermediateComposites.add(outputNameMapping[name])
1072 def checkConsistency(a: NamedValueSet, b: NamedValueSet):
1073 common = a.names & b.names
1074 for name in common:
1075 if a[name] != b[name]:
1076 raise ValueError(f"Conflicting definitions for dataset type: {a[name]} != {b[name]}.")
1078 checkConsistency(allInitInputs, allInitOutputs)
1079 checkConsistency(allInputs, allOutputs)
1080 checkConsistency(allInputs, intermediateComposites)
1081 checkConsistency(allOutputs, intermediateComposites)
1083 def frozen(s: NamedValueSet) -> NamedValueSet:
1084 s.freeze()
1085 return s
1087 return cls(
1088 initInputs=frozen(allInitInputs - allInitOutputs),
1089 initIntermediates=frozen(allInitInputs & allInitOutputs),
1090 initOutputs=frozen(allInitOutputs - allInitInputs),
1091 inputs=frozen(allInputs - allOutputs - intermediateComponents),
1092 intermediates=frozen(allInputs & allOutputs | intermediateComponents),
1093 outputs=frozen(allOutputs - allInputs - intermediateComposites),
1094 prerequisites=frozen(prerequisites),
1095 byTask=MappingProxyType(byTask), # MappingProxyType -> frozen view of dict for immutability
1096 )
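    # Illustrative sketch (``butler`` is a hypothetical, pre-existing
    # lsst.daf.butler.Butler); classify every dataset type a pipeline uses:
    #
    #     dataset_types = PipelineDatasetTypes.fromPipeline(
    #         pipeline, registry=butler.registry
    #     )
    #     overall_inputs = {dt.name for dt in dataset_types.inputs}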
1098 @classmethod
1099 def initOutputNames(cls, pipeline: Union[Pipeline, Iterable[TaskDef]], *,
1100 include_configs: bool = True, include_packages: bool = True) -> Iterator[str]:
1101 """Return the names of dataset types ot task initOutputs, Configs,
1102 and package versions for a pipeline.
1104 Parameters
1105 ----------
1106 pipeline: `Pipeline` or `Iterable` [ `TaskDef` ]
1107 A `Pipeline` instance or collection of `TaskDef` instances.
1108 include_configs : `bool`, optional
1109 If `True` (default) include config dataset types.
1110 include_packages : `bool`, optional
1111 If `True` (default) include the dataset type for package versions.
1113 Yields
1114 ------
1115 datasetTypeName : `str`
1116 Name of the dataset type.
1117 """
1118 if include_packages:
1119 # Package versions dataset type
1120 yield cls.packagesDatasetName
1122 if isinstance(pipeline, Pipeline):
1123 pipeline = pipeline.toExpandedPipeline()
1125 for taskDef in pipeline:
1127 # all task InitOutputs
1128 for name in taskDef.connections.initOutputs:
1129 attribute = getattr(taskDef.connections, name)
1130 yield attribute.name
1132 # config dataset name
1133 if include_configs:
1134 yield taskDef.configDatasetName
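    # Illustrative sketch: collect every init-output dataset type name for a
    # pipeline, which includes the "packages" dataset, each task's
    # "<label>_config" dataset, and any task-declared initOutputs:
    #
    #     names = set(PipelineDatasetTypes.initOutputNames(pipeline))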