Coverage for python/lsst/pipe/base/pipeline.py: 18%
1# This file is part of pipe_base.
2#
3# Developed for the LSST Data Management System.
4# This product includes software developed by the LSST Project
5# (http://www.lsst.org).
6# See the COPYRIGHT file at the top-level directory of this distribution
7# for details of code ownership.
8#
9# This program is free software: you can redistribute it and/or modify
10# it under the terms of the GNU General Public License as published by
11# the Free Software Foundation, either version 3 of the License, or
12# (at your option) any later version.
13#
14# This program is distributed in the hope that it will be useful,
15# but WITHOUT ANY WARRANTY; without even the implied warranty of
16# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
17# GNU General Public License for more details.
18#
19# You should have received a copy of the GNU General Public License
20# along with this program. If not, see <http://www.gnu.org/licenses/>.
21from __future__ import annotations
23"""Module defining Pipeline class and related methods.
24"""
26__all__ = ["Pipeline", "TaskDef", "TaskDatasetTypes", "PipelineDatasetTypes", "LabelSpecifier"]
28import copy
29import logging
30import os
31import re
32import urllib.parse
33import warnings
35# -------------------------------
36# Imports of standard modules --
37# -------------------------------
38from dataclasses import dataclass
39from types import MappingProxyType
40from typing import (
41 TYPE_CHECKING,
42 ClassVar,
43 Dict,
44 Generator,
45 Iterable,
46 Iterator,
47 Mapping,
48 Optional,
49 Set,
50 Tuple,
51 Union,
52)
54# -----------------------------
55# Imports for other modules --
56from lsst.daf.butler import ButlerURI, DatasetType, NamedValueSet, Registry, SkyPixDimension
57from lsst.utils import doImport
59from . import pipelineIR, pipeTools
60from ._task_metadata import TaskMetadata
61from .configOverrides import ConfigOverrides
62from .connections import iterConnections
63from .pipelineTask import PipelineTask
64from .task import _TASK_METADATA_TYPE
66if TYPE_CHECKING:  # Imports needed only for type annotations; may be circular.
67 from lsst.obs.base import Instrument
69# ----------------------------------
70# Local non-exported definitions --
71# ----------------------------------
73_LOG = logging.getLogger(__name__)
75# ------------------------
76# Exported definitions --
77# ------------------------
80@dataclass
81class LabelSpecifier:
82 """A structure to specify a subset of labels to load
84 This structure may contain a set of labels to be used in subsetting a
85 pipeline, or a beginning and end point. Beginning or end may be empty,
86 in which case the range will be a half open interval. Unlike python
87 iteration bounds, end bounds are *INCLUDED*. Note that range based
88 selection is not well defined for pipelines that are not linear in nature,
89 and correct behavior is not guaranteed, or may vary from run to run.
90 """
92 labels: Optional[Set[str]] = None
93 begin: Optional[str] = None
94 end: Optional[str] = None
96 def __post_init__(self):
97 if self.labels is not None and (self.begin or self.end):
98 raise ValueError(
99 "This struct can only be initialized with a labels set or a begin (and/or) end specifier"
100 )
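# The sketch below is added for illustration only and is not part of this
# module's API; it shows the two mutually exclusive ways to build a
# LabelSpecifier. The task labels "isr" and "calibrate" are hypothetical.
def _example_label_specifier() -> None:
    by_set = LabelSpecifier(labels={"isr", "calibrate"})
    by_range = LabelSpecifier(begin="isr", end="calibrate")  # end is inclusive
    assert by_set.labels == {"isr", "calibrate"}
    assert by_range.begin == "isr" and by_range.end == "calibrate"
    try:
        # Mixing an explicit label set with a range bound raises ValueError.
        LabelSpecifier(labels={"isr"}, end="calibrate")
    except ValueError:
        pass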
103class TaskDef:
104 """TaskDef is a collection of information about task needed by Pipeline.
106 The information includes task name, configuration object and optional
107 task class. This class is just a collection of attributes and it exposes
108 all of them so that attributes could potentially be modified in place
109 (e.g. if configuration needs extra overrides).
111 Attributes
112 ----------
113 taskName : `str`, optional
114 `PipelineTask` class name, currently it is not specified whether this
115 is a fully-qualified name or partial name (e.g. ``module.TaskClass``).
116 Framework should be prepared to handle all cases. If not provided,
117 ``taskClass`` must be, and ``taskClass.__name__`` is used.
118 config : `lsst.pex.config.Config`, optional
119 Instance of the configuration class corresponding to this task class,
120 usually with all overrides applied. This config will be frozen. If
121 not provided, ``taskClass`` must be provided and
122 ``taskClass.ConfigClass()`` will be used.
123 taskClass : `type`, optional
124 `PipelineTask` class object, can be ``None``. If ``None`` then
125 framework will have to locate and load class.
126 label : `str`, optional
127 Task label, usually a short string unique in a pipeline. If not
128 provided, ``taskClass`` must be, and ``taskClass._DefaultName`` will
129 be used.
130 """
132 def __init__(self, taskName=None, config=None, taskClass=None, label=None):
133 if taskName is None:
134 if taskClass is None:
135 raise ValueError("At least one of `taskName` and `taskClass` must be provided.")
136 taskName = taskClass.__name__
137 if config is None:
138 if taskClass is None:
139 raise ValueError("`taskClass` must be provided if `config` is not.")
140 config = taskClass.ConfigClass()
141 if label is None:
142 if taskClass is None:
143 raise ValueError("`taskClass` must be provided if `label` is not.")
144 label = taskClass._DefaultName
145 self.taskName = taskName
146 try:
147 config.validate()
148 except Exception:
149 _LOG.error("Configuration validation failed for task %s (%s)", label, taskName)
150 raise
151 config.freeze()
152 self.config = config
153 self.taskClass = taskClass
154 self.label = label
155 self.connections = config.connections.ConnectionsClass(config=config)
157 @property
158 def configDatasetName(self) -> str:
159 """Name of a dataset type for configuration of this task (`str`)"""
160 return self.label + "_config"
162 @property
163 def metadataDatasetName(self) -> Optional[str]:
164 """Name of a dataset type for metadata of this task, `None` if
165 metadata is not to be saved (`str`)
166 """
167 if self.config.saveMetadata:
168 return self.label + "_metadata"
169 else:
170 return None
172 @property
173 def logOutputDatasetName(self) -> Optional[str]:
174 """Name of a dataset type for log output from this task, `None` if
175 logs are not to be saved (`str`)
176 """
177 if self.config.saveLogOutput:
178 return self.label + "_log"
179 else:
180 return None
182 def __str__(self):
183 rep = "TaskDef(" + self.taskName
184 if self.label:
185 rep += ", label=" + self.label
186 rep += ")"
187 return rep
189 def __eq__(self, other: object) -> bool:
190 if not isinstance(other, TaskDef):
191 return False
192 # This does not consider equality of configs when determining equality
193 # as config equality is a difficult thing to define. Should be updated
194 # after DM-27847
195 return self.taskClass == other.taskClass and self.label == other.label
197 def __hash__(self):
198 return hash((self.taskClass, self.label))
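# Illustrative note (the label "isr" is hypothetical): for a task labeled
# "isr", the per-task dataset type names derived from the properties above
# are "isr_config", and, when enabled by the task configuration,
# "isr_metadata" and "isr_log".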
201class Pipeline:
202 """A `Pipeline` is a representation of a series of tasks to run, and the
203 configuration for those tasks.
205 Parameters
206 ----------
207 description : `str`
208 A description of what this pipeline does.
209 """
211 def __init__(self, description: str):
212 pipeline_dict = {"description": description, "tasks": {}}
213 self._pipelineIR = pipelineIR.PipelineIR(pipeline_dict)
215 @classmethod
216 def fromFile(cls, filename: str) -> Pipeline:
217 """Load a pipeline defined in a pipeline yaml file.
219 Parameters
220 ----------
221 filename: `str`
222 A path that points to a pipeline defined in yaml format. This
223 filename may also supply additional labels to be used in
224 subsetting the loaded Pipeline. These labels are separated from
225 the path by a \\#, and may be specified as a comma separated
226 list, or a range denoted as beginning..end. Beginning or end may
227 be empty, in which case the range will be a half open interval.
228 Unlike python iteration bounds, end bounds are *INCLUDED*. Note
229 that range based selection is not well defined for pipelines that
230 are not linear in nature, and correct behavior is not guaranteed,
231 or may vary from run to run.
233 Returns
234 -------
235 pipeline: `Pipeline`
236 The pipeline loaded from specified location with appropriate (if
237 any) subsetting
239 Notes
240 -----
241 This method attempts to prune any contracts that contain labels which
242 are not in the declared subset of labels. This pruning is done using
243 string-based matching due to the nature of contracts and may prune more
244 than it should.
245 """
246 return cls.from_uri(filename)
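    # Illustrative fromFile specifiers (the file name and task labels below
    # are hypothetical):
    #   Pipeline.fromFile("DRP.yaml")                 # the full pipeline
    #   Pipeline.fromFile("DRP.yaml#isr,calibrate")   # only the listed labels
    #   Pipeline.fromFile("DRP.yaml#isr..calibrate")  # inclusive label range
    #   Pipeline.fromFile("DRP.yaml#..calibrate")     # everything up to and
    #                                                 # including "calibrate"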
248 @classmethod
249 def from_uri(cls, uri: Union[str, ButlerURI]) -> Pipeline:
250 """Load a pipeline defined in a pipeline yaml file at a location
251 specified by a URI.
253 Parameters
254 ----------
255 uri: `str` or `ButlerURI`
256 If a string is supplied this should be a URI path that points to a
257 pipeline defined in yaml format. This uri may also supply
258 additional labels to be used in subsetting the loaded Pipeline.
259 These labels are separated from the path by a \\#, and may be
260 specified as a comma separated list, or a range denoted as
261 beginning..end. Beginning or end may be empty, in which case the
262 range will be a half open interval. Unlike python iteration
263 bounds, end bounds are *INCLUDED*. Note that range based selection
264 is not well defined for pipelines that are not linear in nature,
265 and correct behavior is not guaranteed, or may vary from run to
266 run. The same specifiers can be used with a ButlerURI object, by
267 supplying them as the sole contents of the fragment attribute.
269 Returns
270 -------
271 pipeline: `Pipeline`
272 The pipeline loaded from specified location with appropriate (if
273 any) subsetting
275 Notes
276 -----
277 This method attempts to prune any contracts that contain labels which
278 are not in the declared subset of labels. This pruning is done using
279 string-based matching due to the nature of contracts and may prune more
280 than it should.
281 """
282 # Split up the uri and any labels that were supplied
283 uri, label_specifier = cls._parse_file_specifier(uri)
284 pipeline: Pipeline = cls.fromIR(pipelineIR.PipelineIR.from_uri(uri))
286 # If there are labels supplied, only keep those
287 if label_specifier is not None:
288 pipeline = pipeline.subsetFromLabels(label_specifier)
289 return pipeline
291 def subsetFromLabels(self, labelSpecifier: LabelSpecifier) -> Pipeline:
292 """Subset a pipeline to contain only labels specified in labelSpecifier
294 Parameters
295 ----------
296 labelSpecifier : `LabelSpecifier`
297 Object containing labels that describe how to subset a pipeline.
299 Returns
300 -------
301 pipeline : `Pipeline`
302 A new pipeline object that is a subset of the old pipeline
304 Raises
305 ------
306 ValueError
307 Raised if there is an issue with specified labels
309 Notes
310 -----
311 This method attempts to prune any contracts that contain labels which
312 are not in the declared subset of labels. This pruning is done using
313 string-based matching due to the nature of contracts and may prune more
314 than it should.
315 """
316 # Labels supplied as a set
317 if labelSpecifier.labels:
318 labelSet = labelSpecifier.labels
319 # Labels supplied as a range, first create a list of all the labels
320 # in the pipeline sorted according to task dependency. Then only
321 # keep labels that lie between the supplied bounds
322 else:
323 # Create a copy of the pipeline to use when assessing the label
324 # ordering. Use a dict for fast searching while preserving order.
325 # Remove contracts so they do not fail in the expansion step. This
326 # is needed because a user may only configure the tasks they intend
327 # to run, which may cause some contracts to fail if they will later
328 # be dropped
329 pipeline = copy.deepcopy(self)
330 pipeline._pipelineIR.contracts = []
331 labels = {taskdef.label: True for taskdef in pipeline.toExpandedPipeline()}
333 # Verify the bounds are in the labels
334 if labelSpecifier.begin is not None:
335 if labelSpecifier.begin not in labels:
336 raise ValueError(
337 f"Beginning of range subset, {labelSpecifier.begin}, not found in "
338 "pipeline definition"
339 )
340 if labelSpecifier.end is not None:
341 if labelSpecifier.end not in labels:
342 raise ValueError(
343 f"End of range subset, {labelSpecifier.end}, not found in pipeline definition"
344 )
346 labelSet = set()
347 for label in labels:
348 if labelSpecifier.begin is not None:
349 if label != labelSpecifier.begin:
350 continue
351 else:
352 labelSpecifier.begin = None
353 labelSet.add(label)
354 if labelSpecifier.end is not None and label == labelSpecifier.end:
355 break
356 return Pipeline.fromIR(self._pipelineIR.subset_from_labels(labelSet))
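    # Illustrative usage (the label names are hypothetical):
    #   subset = pipeline.subsetFromLabels(LabelSpecifier(labels={"isr"}))
    #   tail = pipeline.subsetFromLabels(LabelSpecifier(begin="calibrate"))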
358 @staticmethod
359 def _parse_file_specifier(uri: Union[str, ButlerURI]) -> Tuple[ButlerURI, Optional[LabelSpecifier]]:
360 """Split appart a uri and any possible label subsets"""
361 if isinstance(uri, str):
362 # This is to support legacy pipelines during transition
363 uri, num_replace = re.subn("[:](?!\\/\\/)", "#", uri)
364 if num_replace:
365 warnings.warn(
366 f"The pipeline file {uri} seems to use the legacy : to separate "
367 "labels, this is deprecated and will be removed after June 2021, please use "
368 "# instead.",
369 category=FutureWarning,
370 )
371 if uri.count("#") > 1:
372 raise ValueError("Only one set of labels is allowed when specifying a pipeline to load")
373 uri = ButlerURI(uri)
374 label_subset = uri.fragment or None
376 specifier: Optional[LabelSpecifier]
377 if label_subset is not None:
378 label_subset = urllib.parse.unquote(label_subset)
379 args: Dict[str, Union[Set[str], str, None]]
380 # labels supplied as a list
381 if "," in label_subset:
382 if ".." in label_subset:
383 raise ValueError(
384 "Can only specify a list of labels or a rangewhen loading a Pipline not both"
385 )
386 args = {"labels": set(label_subset.split(","))}
387 # labels supplied as a range
388 elif ".." in label_subset:
389 # Try to de-structure the labelSubset, this will fail if more
390 # than one range is specified
391 begin, end, *rest = label_subset.split("..")
392 if rest:
393 raise ValueError("Only one range can be specified when loading a pipeline")
394 args = {"begin": begin if begin else None, "end": end if end else None}
395 # Assume anything else is a single label
396 else:
397 args = {"labels": {label_subset}}
399 specifier = LabelSpecifier(**args)
400 else:
401 specifier = None
403 return uri, specifier
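    # For reference, the legacy-separator rewrite above behaves like this
    # (the URI is hypothetical): the ":" of a URI scheme is protected by the
    # negative lookahead, while a trailing label separator is rewritten:
    #   re.subn("[:](?!\\/\\/)", "#", "https://host/p.yaml:isr")
    #   -> ("https://host/p.yaml#isr", 1)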
405 @classmethod
406 def fromString(cls, pipeline_string: str) -> Pipeline:
407 """Create a pipeline from string formatted as a pipeline document.
409 Parameters
410 ----------
411 pipeline_string : `str`
412 A string that is formatted like a pipeline document.
414 Returns
415 -------
416 pipeline: `Pipeline`
417 """
418 pipeline = cls.fromIR(pipelineIR.PipelineIR.from_string(pipeline_string))
419 return pipeline
421 @classmethod
422 def fromIR(cls, deserialized_pipeline: pipelineIR.PipelineIR) -> Pipeline:
423 """Create a pipeline from an already created `PipelineIR` object.
425 Parameters
426 ----------
427 deserialized_pipeline: `PipelineIR`
428 An already created pipeline intermediate representation object
430 Returns
431 -------
432 pipeline: `Pipeline`
433 """
434 pipeline = cls.__new__(cls)
435 pipeline._pipelineIR = deserialized_pipeline
436 return pipeline
438 @classmethod
439 def fromPipeline(cls, pipeline: Pipeline) -> Pipeline:
440 """Create a new pipeline by copying an already existing `Pipeline`.
442 Parameters
443 ----------
444 pipeline: `Pipeline`
445 The existing `Pipeline` object to copy.
447 Returns
448 -------
449 pipeline: `Pipeline`
450 """
451 return cls.fromIR(copy.deepcopy(pipeline._pipelineIR))
453 def __str__(self) -> str:
454 # tasks need to be sorted on each call because someone might have added
455 # or removed a task, and caching the ordering does not seem worth the
456 # small overhead
457 labels = [td.label for td in self._toExpandedPipelineImpl(checkContracts=False)]
458 self._pipelineIR.reorder_tasks(labels)
459 return str(self._pipelineIR)
461 def addInstrument(self, instrument: Union[Instrument, str]) -> None:
462 """Add an instrument to the pipeline, or replace an instrument that is
463 already defined.
465 Parameters
466 ----------
467 instrument : `~lsst.daf.butler.instrument.Instrument` or `str`
468 Either a class derived from `lsst.daf.butler.instrument` or
469 a string corresponding to a fully qualified
470 `lsst.daf.butler.instrument` name.
471 """
472 if isinstance(instrument, str):
473 pass
474 else:
475 # TODO: assume that this is a subclass of Instrument, no type
476 # checking
477 instrument = f"{instrument.__module__}.{instrument.__qualname__}"
478 self._pipelineIR.instrument = instrument
480 def getInstrument(self) -> Instrument:
481 """Get the instrument from the pipeline.
483 Returns
484 -------
485 instrument : `~lsst.daf.butler.instrument.Instrument`, `str`, or None
486 A class derived from `lsst.daf.butler.instrument`, a string
487 corresponding to a fully qualified `lsst.daf.butler.instrument`
488 name, or None if the pipeline does not have an instrument.
489 """
490 return self._pipelineIR.instrument
492 def addTask(self, task: Union[PipelineTask, str], label: str) -> None:
493 """Add a new task to the pipeline, or replace a task that is already
494 associated with the supplied label.
496 Parameters
497 ----------
498 task: `PipelineTask` or `str`
499 Either a class derived from `PipelineTask` or a string
500 corresponding to a fully qualified `PipelineTask` name.
501 label: `str`
502 A label that is used to identify the `PipelineTask` being added
503 """
504 if isinstance(task, str):
505 taskName = task
506 elif issubclass(task, PipelineTask):
507 taskName = f"{task.__module__}.{task.__qualname__}"
508 else:
509 raise ValueError(
510 "task must be either a child class of PipelineTask or a string containing"
511 " a fully qualified name to one"
512 )
513 if not label:
514 # in some cases (with a command line-generated pipeline) tasks can
515 # be defined without a label, which is not acceptable; use the task's
516 # _DefaultName in that case
517 if isinstance(task, str):
518 task = doImport(task)
519 label = task._DefaultName
520 self._pipelineIR.tasks[label] = pipelineIR.TaskIR(label, taskName)
522 def removeTask(self, label: str) -> None:
523 """Remove a task from the pipeline.
525 Parameters
526 ----------
527 label : `str`
528 The label used to identify the task that is to be removed
530 Raises
531 ------
532 KeyError
533 If no task with that label exists in the pipeline
535 """
536 self._pipelineIR.tasks.pop(label)
538 def addConfigOverride(self, label: str, key: str, value: object) -> None:
539 """Apply single config override.
541 Parameters
542 ----------
543 label : `str`
544 Label of the task.
545 key: `str`
546 Fully-qualified field name.
547 value : object
548 Value to be given to a field.
549 """
550 self._addConfigImpl(label, pipelineIR.ConfigIR(rest={key: value}))
552 def addConfigFile(self, label: str, filename: str) -> None:
553 """Add overrides from a specified file.
555 Parameters
556 ----------
557 label : `str`
558 The label used to identify the task whose config is to be
559 modified.
560 filename : `str`
561 Path to the override file.
562 """
563 self._addConfigImpl(label, pipelineIR.ConfigIR(file=[filename]))
565 def addConfigPython(self, label: str, pythonString: str) -> None:
566 """Add Overrides by running a snippet of python code against a config.
568 Parameters
569 ----------
570 label : `str`
571 The label used to identify the task whose config is to be
572 modified.
573 pythonString: `str`
574 A string which is valid Python code to be executed. This is done
575 with ``config`` as the only locally accessible value.
576 """
577 self._addConfigImpl(label, pipelineIR.ConfigIR(python=pythonString))
579 def _addConfigImpl(self, label: str, newConfig: pipelineIR.ConfigIR) -> None:
580 if label == "parameters":
581 if newConfig.rest.keys() - self._pipelineIR.parameters.mapping.keys():
582 raise ValueError("Cannot override parameters that are not defined in pipeline")
583 self._pipelineIR.parameters.mapping.update(newConfig.rest)
584 if newConfig.file:
585 raise ValueError("Setting parameters section with config file is not supported")
586 if newConfig.python:
587 raise ValueError("Setting parameters section using python block in unsupported")
588 return
589 if label not in self._pipelineIR.tasks:
590 raise LookupError(f"There are no tasks labeled '{label}' in the pipeline")
591 self._pipelineIR.tasks[label].add_or_update_config(newConfig)
593 def toFile(self, filename: str) -> None:
594 self._pipelineIR.to_file(filename)
596 def write_to_uri(self, uri: Union[str, ButlerURI]) -> None:
597 # tasks need to be sorted on each call because someone might have added
598 # or removed a task, and caching the ordering does not seem worth the
599 # small overhead
600 labels = [td.label for td in self._toExpandedPipelineImpl(checkContracts=False)]
601 self._pipelineIR.reorder_tasks(labels)
602 self._pipelineIR.write_to_uri(uri)
604 def toExpandedPipeline(self) -> Generator[TaskDef, None, None]:
605 """Returns a generator of TaskDefs which can be used to create quantum
606 graphs.
608 Returns
609 -------
610 generator : generator of `TaskDef`
611 The generator returned will be the sorted iterator of tasks which
612 are to be used in constructing a quantum graph.
614 Raises
615 ------
616 NotImplementedError
617 Raised if a dataId is supplied in a config block. This is in place
618 for future use.
619 """
620 yield from self._toExpandedPipelineImpl()
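    # Illustrative iteration (equivalent to ``iter(pipeline)``):
    #   for taskDef in pipeline.toExpandedPipeline():
    #       print(taskDef.label, taskDef.taskName)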
622 def _toExpandedPipelineImpl(self, checkContracts=True) -> Iterable[TaskDef]:
623 taskDefs = []
624 for label in self._pipelineIR.tasks:
625 taskDefs.append(self._buildTaskDef(label))
627 # let's evaluate the contracts
628 if self._pipelineIR.contracts is not None:
629 label_to_config = {x.label: x.config for x in taskDefs}
630 for contract in self._pipelineIR.contracts:
631 # execute this on its own line so it can raise a good error
632 # message if there were problems with the eval
633 success = eval(contract.contract, None, label_to_config)
634 if not success:
635 extra_info = f": {contract.msg}" if contract.msg is not None else ""
636 raise pipelineIR.ContractError(
637 f"Contract(s) '{contract.contract}' were not satisfied{extra_info}"
638 )
640 taskDefs = sorted(taskDefs, key=lambda x: x.label)
641 yield from pipeTools.orderPipeline(taskDefs)
643 def _buildTaskDef(self, label: str) -> TaskDef:
644 if (taskIR := self._pipelineIR.tasks.get(label)) is None:
645 raise NameError(f"Label {label} does not appear in this pipeline")
646 taskClass = doImport(taskIR.klass)
647 taskName = taskClass.__qualname__
648 config = taskClass.ConfigClass()
649 overrides = ConfigOverrides()
650 if self._pipelineIR.instrument is not None:
651 overrides.addInstrumentOverride(self._pipelineIR.instrument, taskClass._DefaultName)
652 if taskIR.config is not None:
653 for configIR in (configIr.formatted(self._pipelineIR.parameters) for configIr in taskIR.config):
654 if configIR.dataId is not None:
655 raise NotImplementedError(
656 "Specializing a config on a partial data id is not yet "
657 "supported in Pipeline definition"
658 )
659 # only apply override if it applies to everything
660 if configIR.dataId is None:
661 if configIR.file:
662 for configFile in configIR.file:
663 overrides.addFileOverride(os.path.expandvars(configFile))
664 if configIR.python is not None:
665 overrides.addPythonOverride(configIR.python)
666 for key, value in configIR.rest.items():
667 overrides.addValueOverride(key, value)
668 overrides.applyTo(config)
669 return TaskDef(taskName=taskName, config=config, taskClass=taskClass, label=label)
671 def __iter__(self) -> Generator[TaskDef, None, None]:
672 return self.toExpandedPipeline()
674 def __getitem__(self, item: str) -> TaskDef:
675 return self._buildTaskDef(item)
677 def __len__(self):
678 return len(self._pipelineIR.tasks)
680 def __eq__(self, other: object):
681 if not isinstance(other, Pipeline):
682 return False
683 return self._pipelineIR == other._pipelineIR
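# A minimal sketch of programmatic Pipeline assembly, added for illustration
# only; the task class path "lsst.example.tasks.MyTask", the label "myTask",
# the config field "someField", and the override file name are all
# hypothetical and would need to exist before the pipeline could be expanded
# or executed.
def _example_build_pipeline() -> Pipeline:
    pipeline = Pipeline("An example pipeline")
    # Tasks can be added by fully qualified class name without importing them.
    pipeline.addTask("lsst.example.tasks.MyTask", "myTask")
    # Config overrides are recorded in the pipeline IR and applied on expansion.
    pipeline.addConfigOverride("myTask", "someField", 42)
    pipeline.addConfigFile("myTask", "myTaskOverrides.py")
    return pipeline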
686@dataclass(frozen=True)
687class TaskDatasetTypes:
688 """An immutable struct that extracts and classifies the dataset types used
689 by a `PipelineTask`
690 """
692 initInputs: NamedValueSet[DatasetType]
693 """Dataset types that are needed as inputs in order to construct this Task.
695 Task-level `initInputs` may be classified as either
696 `~PipelineDatasetTypes.initInputs` or
697 `~PipelineDatasetTypes.initIntermediates` at the Pipeline level.
698 """
700 initOutputs: NamedValueSet[DatasetType]
701 """Dataset types that may be written after constructing this Task.
703 Task-level `initOutputs` may be classified as either
704 `~PipelineDatasetTypes.initOutputs` or
705 `~PipelineDatasetTypes.initIntermediates` at the Pipeline level.
706 """
708 inputs: NamedValueSet[DatasetType]
709 """Dataset types that are regular inputs to this Task.
711 If an input dataset needed for a Quantum cannot be found in the input
712 collection(s) or produced by another Task in the Pipeline, that Quantum
713 (and all dependent Quanta) will not be produced.
715 Task-level `inputs` may be classified as either
716 `~PipelineDatasetTypes.inputs` or `~PipelineDatasetTypes.intermediates`
717 at the Pipeline level.
718 """
720 prerequisites: NamedValueSet[DatasetType]
721 """Dataset types that are prerequisite inputs to this Task.
723 Prerequisite inputs must exist in the input collection(s) before the
724 pipeline is run, but do not constrain the graph - if a prerequisite is
725 missing for a Quantum, `PrerequisiteMissingError` is raised.
727 Prerequisite inputs are not resolved until the second stage of
728 QuantumGraph generation.
729 """
731 outputs: NamedValueSet[DatasetType]
732 """Dataset types that are produced by this Task.
734 Task-level `outputs` may be classified as either
735 `~PipelineDatasetTypes.outputs` or `~PipelineDatasetTypes.intermediates`
736 at the Pipeline level.
737 """
739 @classmethod
740 def fromTaskDef(
741 cls,
742 taskDef: TaskDef,
743 *,
744 registry: Registry,
745 include_configs: bool = True,
746 storage_class_mapping: Optional[Mapping[str, str]] = None,
747 ) -> TaskDatasetTypes:
748 """Extract and classify the dataset types from a single `PipelineTask`.
750 Parameters
751 ----------
752 taskDef: `TaskDef`
753 An instance of a `TaskDef` class for a particular `PipelineTask`.
754 registry: `Registry`
755 Registry used to construct normalized `DatasetType` objects and
756 retrieve those that are incomplete.
757 include_configs : `bool`, optional
758 If `True` (default) include config dataset types as
759 ``initOutputs``.
760 storage_class_mapping : `Mapping` of `str` to `StorageClass`, optional
761 If a taskdef contains a component dataset type that is unknown
762 to the registry, its parent StorageClass will be looked up in this
763 mapping if it is supplied. If the mapping does not contain the
764 composite dataset type, or the mapping is not supplied, an exception
765 will be raised.
767 Returns
768 -------
769 types: `TaskDatasetTypes`
770 The dataset types used by this task.
772 Raises
773 ------
774 ValueError
775 Raised if dataset type connection definition differs from
776 registry definition.
777 LookupError
778 Raised if component parent StorageClass could not be determined
779 and storage_class_mapping does not contain the composite type, or
780 is set to None.
781 """
783 def makeDatasetTypesSet(connectionType: str, freeze: bool = True) -> NamedValueSet[DatasetType]:
784 """Constructs a set of true `DatasetType` objects
786 Parameters
787 ----------
788 connectionType : `str`
789 Name of the connection type to produce a set for; corresponds
790 to an attribute of type `list` on the connection class instance.
791 freeze : `bool`, optional
792 If `True`, call `NamedValueSet.freeze` on the object returned.
794 Returns
795 -------
796 datasetTypes : `NamedValueSet`
797 A set of all datasetTypes which correspond to the input
798 connection type specified in the connection class of this
799 `PipelineTask`
801 Raises
802 ------
803 ValueError
804 Raised if dataset type connection definition differs from
805 registry definition.
806 LookupError
807 Raised if component parent StorageClass could not be determined
808 and storage_class_mapping does not contain the composite type,
809 or is set to None.
811 Notes
812 -----
813 This function is a closure over the variables ``registry``,
814 ``taskDef``, and ``storage_class_mapping``.
815 """
816 datasetTypes = NamedValueSet()
817 for c in iterConnections(taskDef.connections, connectionType):
818 dimensions = set(getattr(c, "dimensions", set()))
819 if "skypix" in dimensions:
820 try:
821 datasetType = registry.getDatasetType(c.name)
822 except LookupError as err:
823 raise LookupError(
824 f"DatasetType '{c.name}' referenced by "
825 f"{type(taskDef.connections).__name__} uses 'skypix' as a dimension "
826 f"placeholder, but does not already exist in the registry. "
827 f"Note that reference catalog names are now used as the dataset "
828 f"type name instead of 'ref_cat'."
829 ) from err
830 rest1 = set(registry.dimensions.extract(dimensions - set(["skypix"])).names)
831 rest2 = set(
832 dim.name for dim in datasetType.dimensions if not isinstance(dim, SkyPixDimension)
833 )
834 if rest1 != rest2:
835 raise ValueError(
836 f"Non-skypix dimensions for dataset type {c.name} declared in "
837 f"connections ({rest1}) are inconsistent with those in "
838 f"registry's version of this dataset ({rest2})."
839 )
840 else:
841 # Component dataset types are not explicitly in the
842 # registry. This complicates consistency checks with
843 # registry and requires we work out the composite storage
844 # class.
845 registryDatasetType = None
846 try:
847 registryDatasetType = registry.getDatasetType(c.name)
848 except KeyError:
849 compositeName, componentName = DatasetType.splitDatasetTypeName(c.name)
850 if componentName:
851 if storage_class_mapping is None or compositeName not in storage_class_mapping:
852 raise LookupError(
853 "Component parent class cannot be determined, and "
854 "composite name was not in storage class mapping, or no "
855 "storage_class_mapping was supplied"
856 )
857 else:
858 parentStorageClass = storage_class_mapping[compositeName]
859 else:
860 parentStorageClass = None
861 datasetType = c.makeDatasetType(
862 registry.dimensions, parentStorageClass=parentStorageClass
863 )
864 registryDatasetType = datasetType
865 else:
866 datasetType = c.makeDatasetType(
867 registry.dimensions, parentStorageClass=registryDatasetType.parentStorageClass
868 )
870 if registryDatasetType and datasetType != registryDatasetType:
871 try:
872 # Explicitly check for storage class just to make
873 # more specific message.
874 _ = datasetType.storageClass
875 except KeyError:
876 raise ValueError(
877 "Storage class does not exist for supplied dataset type "
878 f"{datasetType} for {taskDef.label}."
879 ) from None
880 raise ValueError(
881 f"Supplied dataset type ({datasetType}) inconsistent with "
882 f"registry definition ({registryDatasetType}) "
883 f"for {taskDef.label}."
884 )
885 datasetTypes.add(datasetType)
886 if freeze:
887 datasetTypes.freeze()
888 return datasetTypes
890 # optionally add initOutput dataset for config
891 initOutputs = makeDatasetTypesSet("initOutputs", freeze=False)
892 if include_configs:
893 initOutputs.add(
894 DatasetType(
895 taskDef.configDatasetName,
896 registry.dimensions.empty,
897 storageClass="Config",
898 )
899 )
900 initOutputs.freeze()
902 # optionally add output dataset for metadata
903 outputs = makeDatasetTypesSet("outputs", freeze=False)
904 if taskDef.metadataDatasetName is not None:
905 # Metadata is supposed to be of the TaskMetadata type, its
906 # dimensions correspond to a task quantum.
907 dimensions = registry.dimensions.extract(taskDef.connections.dimensions)
909 # Allow the storage class definition to be read from the existing
910 # dataset type definition if present.
911 try:
912 current = registry.getDatasetType(taskDef.metadataDatasetName)
913 except KeyError:
914 # No previous definition so use the default.
915 storageClass = "TaskMetadata" if _TASK_METADATA_TYPE is TaskMetadata else "PropertySet"
916 else:
917 storageClass = current.storageClass.name
919 outputs |= {DatasetType(taskDef.metadataDatasetName, dimensions, storageClass)}
920 if taskDef.logOutputDatasetName is not None:
921 # Log output dimensions correspond to a task quantum.
922 dimensions = registry.dimensions.extract(taskDef.connections.dimensions)
923 outputs |= {DatasetType(taskDef.logOutputDatasetName, dimensions, "ButlerLogRecords")}
925 outputs.freeze()
927 return cls(
928 initInputs=makeDatasetTypesSet("initInputs"),
929 initOutputs=initOutputs,
930 inputs=makeDatasetTypesSet("inputs"),
931 prerequisites=makeDatasetTypesSet("prerequisiteInputs"),
932 outputs=outputs,
933 )
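# Sketch of how a caller might classify a single task's dataset types, added
# for illustration only; it assumes a real TaskDef and butler Registry are
# available at the call site.
def _example_task_dataset_types(taskDef: TaskDef, registry: Registry) -> None:
    types = TaskDatasetTypes.fromTaskDef(taskDef, registry=registry)
    for group in ("initInputs", "initOutputs", "inputs", "prerequisites", "outputs"):
        # Each attribute is a frozen NamedValueSet of DatasetType objects.
        _LOG.debug("%s for %s: %s", group, taskDef.label, {t.name for t in getattr(types, group)})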
936@dataclass(frozen=True)
937class PipelineDatasetTypes:
938 """An immutable struct that classifies the dataset types used in a
939 `Pipeline`.
940 """
942 packagesDatasetName: ClassVar[str] = "packages"
943 """Name of a dataset type used to save package versions.
944 """
946 initInputs: NamedValueSet[DatasetType]
947 """Dataset types that are needed as inputs in order to construct the Tasks
948 in this Pipeline.
950 This does not include dataset types that are produced when constructing
951 other Tasks in the Pipeline (these are classified as `initIntermediates`).
952 """
954 initOutputs: NamedValueSet[DatasetType]
955 """Dataset types that may be written after constructing the Tasks in this
956 Pipeline.
958 This does not include dataset types that are also used as inputs when
959 constructing other Tasks in the Pipeline (these are classified as
960 `initIntermediates`).
961 """
963 initIntermediates: NamedValueSet[DatasetType]
964 """Dataset types that are both used when constructing one or more Tasks
965 in the Pipeline and produced as a side-effect of constructing another
966 Task in the Pipeline.
967 """
969 inputs: NamedValueSet[DatasetType]
970 """Dataset types that are regular inputs for the full pipeline.
972 If an input dataset needed for a Quantum cannot be found in the input
973 collection(s), that Quantum (and all dependent Quanta) will not be
974 produced.
975 """
977 prerequisites: NamedValueSet[DatasetType]
978 """Dataset types that are prerequisite inputs for the full Pipeline.
980 Prerequisite inputs must exist in the input collection(s) before the
981 pipeline is run, but do not constrain the graph - if a prerequisite is
982 missing for a Quantum, `PrerequisiteMissingError` is raised.
984 Prerequisite inputs are not resolved until the second stage of
985 QuantumGraph generation.
986 """
988 intermediates: NamedValueSet[DatasetType]
989 """Dataset types that are output by one Task in the Pipeline and consumed
990 as inputs by one or more other Tasks in the Pipeline.
991 """
993 outputs: NamedValueSet[DatasetType]
994 """Dataset types that are output by a Task in the Pipeline and not consumed
995 by any other Task in the Pipeline.
996 """
998 byTask: Mapping[str, TaskDatasetTypes]
999 """Per-Task dataset types, keyed by label in the `Pipeline`.
1001 This is guaranteed to be zip-iterable with the `Pipeline` itself (assuming
1002 neither has been modified since the dataset types were extracted, of
1003 course).
1004 """
1006 @classmethod
1007 def fromPipeline(
1008 cls,
1009 pipeline: Union[Pipeline, Iterable[TaskDef]],
1010 *,
1011 registry: Registry,
1012 include_configs: bool = True,
1013 include_packages: bool = True,
1014 ) -> PipelineDatasetTypes:
1015 """Extract and classify the dataset types from all tasks in a
1016 `Pipeline`.
1018 Parameters
1019 ----------
1020 pipeline: `Pipeline` or `Iterable` [ `TaskDef` ]
1021 A collection of tasks that can be run together.
1022 registry: `Registry`
1023 Registry used to construct normalized `DatasetType` objects and
1024 retrieve those that are incomplete.
1025 include_configs : `bool`, optional
1026 If `True` (default) include config dataset types as
1027 ``initOutputs``.
1028 include_packages : `bool`, optional
1029 If `True` (default) include the dataset type for software package
1030 versions in ``initOutputs``.
1032 Returns
1033 -------
1034 types: `PipelineDatasetTypes`
1035 The dataset types used by this `Pipeline`.
1037 Raises
1038 ------
1039 ValueError
1040 Raised if Tasks are inconsistent about which datasets are marked
1041 prerequisite. This indicates that the Tasks cannot be run as part
1042 of the same `Pipeline`.
1043 """
1044 allInputs = NamedValueSet()
1045 allOutputs = NamedValueSet()
1046 allInitInputs = NamedValueSet()
1047 allInitOutputs = NamedValueSet()
1048 prerequisites = NamedValueSet()
1049 byTask = dict()
1050 if include_packages:
1051 allInitOutputs.add(
1052 DatasetType(
1053 cls.packagesDatasetName,
1054 registry.dimensions.empty,
1055 storageClass="Packages",
1056 )
1057 )
1058 # create a list of TaskDefs in case the input is a generator
1059 pipeline = list(pipeline)
1061 # collect all the output dataset types
1062 typeStorageclassMap: Dict[str, str] = {}
1063 for taskDef in pipeline:
1064 for outConnection in iterConnections(taskDef.connections, "outputs"):
1065 typeStorageclassMap[outConnection.name] = outConnection.storageClass
1067 for taskDef in pipeline:
1068 thisTask = TaskDatasetTypes.fromTaskDef(
1069 taskDef,
1070 registry=registry,
1071 include_configs=include_configs,
1072 storage_class_mapping=typeStorageclassMap,
1073 )
1074 allInitInputs |= thisTask.initInputs
1075 allInitOutputs |= thisTask.initOutputs
1076 allInputs |= thisTask.inputs
1077 prerequisites |= thisTask.prerequisites
1078 allOutputs |= thisTask.outputs
1079 byTask[taskDef.label] = thisTask
1080 if not prerequisites.isdisjoint(allInputs):
1081 raise ValueError(
1082 "{} marked as both prerequisites and regular inputs".format(
1083 {dt.name for dt in allInputs & prerequisites}
1084 )
1085 )
1086 if not prerequisites.isdisjoint(allOutputs):
1087 raise ValueError(
1088 "{} marked as both prerequisites and outputs".format(
1089 {dt.name for dt in allOutputs & prerequisites}
1090 )
1091 )
1092 # Make sure that components which are marked as inputs get treated as
1093 # intermediates if there is an output which produces the composite
1094 # containing the component
1095 intermediateComponents = NamedValueSet()
1096 intermediateComposites = NamedValueSet()
1097 outputNameMapping = {dsType.name: dsType for dsType in allOutputs}
1098 for dsType in allInputs:
1099 # get the name of a possible component
1100 name, component = dsType.nameAndComponent()
1101 # if there is a component name, that means this is a component
1102 # DatasetType, if there is an output which produces the parent of
1103 # this component, treat this input as an intermediate
1104 if component is not None:
1105 # This needs to be in this if block, because someone might have
1106 # a composite that is a pure input from existing data
1107 if name in outputNameMapping:
1108 intermediateComponents.add(dsType)
1109 intermediateComposites.add(outputNameMapping[name])
1111 def checkConsistency(a: NamedValueSet, b: NamedValueSet):
1112 common = a.names & b.names
1113 for name in common:
1114 if a[name] != b[name]:
1115 raise ValueError(f"Conflicting definitions for dataset type: {a[name]} != {b[name]}.")
1117 checkConsistency(allInitInputs, allInitOutputs)
1118 checkConsistency(allInputs, allOutputs)
1119 checkConsistency(allInputs, intermediateComposites)
1120 checkConsistency(allOutputs, intermediateComposites)
1122 def frozen(s: NamedValueSet) -> NamedValueSet:
1123 s.freeze()
1124 return s
1126 return cls(
1127 initInputs=frozen(allInitInputs - allInitOutputs),
1128 initIntermediates=frozen(allInitInputs & allInitOutputs),
1129 initOutputs=frozen(allInitOutputs - allInitInputs),
1130 inputs=frozen(allInputs - allOutputs - intermediateComponents),
1131 intermediates=frozen(allInputs & allOutputs | intermediateComponents),
1132 outputs=frozen(allOutputs - allInputs - intermediateComposites),
1133 prerequisites=frozen(prerequisites),
1134 byTask=MappingProxyType(byTask), # MappingProxyType -> frozen view of dict for immutability
1135 )
1137 @classmethod
1138 def initOutputNames(
1139 cls,
1140 pipeline: Union[Pipeline, Iterable[TaskDef]],
1141 *,
1142 include_configs: bool = True,
1143 include_packages: bool = True,
1144 ) -> Iterator[str]:
1145 """Return the names of dataset types ot task initOutputs, Configs,
1146 and package versions for a pipeline.
1148 Parameters
1149 ----------
1150 pipeline: `Pipeline` or `Iterable` [ `TaskDef` ]
1151 A `Pipeline` instance or collection of `TaskDef` instances.
1152 include_configs : `bool`, optional
1153 If `True` (default) include config dataset types.
1154 include_packages : `bool`, optional
1155 If `True` (default) include the dataset type for package versions.
1157 Yields
1158 ------
1159 datasetTypeName : `str`
1160 Name of the dataset type.
1161 """
1162 if include_packages:
1163 # Package versions dataset type
1164 yield cls.packagesDatasetName
1166 if isinstance(pipeline, Pipeline):
1167 pipeline = pipeline.toExpandedPipeline()
1169 for taskDef in pipeline:
1171 # all task InitOutputs
1172 for name in taskDef.connections.initOutputs:
1173 attribute = getattr(taskDef.connections, name)
1174 yield attribute.name
1176 # config dataset name
1177 if include_configs:
1178 yield taskDef.configDatasetName
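# Sketch of classifying dataset types for a whole pipeline, added for
# illustration only; it assumes a fully configured Pipeline and a butler
# Registry are available at the call site.
def _example_pipeline_dataset_types(pipeline: Pipeline, registry: Registry) -> None:
    dataset_types = PipelineDatasetTypes.fromPipeline(pipeline, registry=registry)
    _LOG.debug("overall inputs: %s", set(dataset_types.inputs.names))
    _LOG.debug("intermediates: %s", set(dataset_types.intermediates.names))
    _LOG.debug("overall outputs: %s", set(dataset_types.outputs.names))
    # Config, package-version, and other init-output dataset type names.
    _LOG.debug("init outputs: %s", list(PipelineDatasetTypes.initOutputNames(pipeline)))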