Coverage for python/lsst/pipe/base/pipeline.py: 19%
1# This file is part of pipe_base.
2#
3# Developed for the LSST Data Management System.
4# This product includes software developed by the LSST Project
5# (http://www.lsst.org).
6# See the COPYRIGHT file at the top-level directory of this distribution
7# for details of code ownership.
8#
9# This program is free software: you can redistribute it and/or modify
10# it under the terms of the GNU General Public License as published by
11# the Free Software Foundation, either version 3 of the License, or
12# (at your option) any later version.
13#
14# This program is distributed in the hope that it will be useful,
15# but WITHOUT ANY WARRANTY; without even the implied warranty of
16# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
17# GNU General Public License for more details.
18#
19# You should have received a copy of the GNU General Public License
20# along with this program. If not, see <http://www.gnu.org/licenses/>.
21from __future__ import annotations
23"""Module defining Pipeline class and related methods.
24"""
26__all__ = ["Pipeline", "TaskDef", "TaskDatasetTypes", "PipelineDatasetTypes", "LabelSpecifier"]
28import copy
29import logging
30import os
31import re
32import urllib.parse
33import warnings
35# -------------------------------
36# Imports of standard modules --
37# -------------------------------
38from dataclasses import dataclass
39from types import MappingProxyType
40from typing import (
41 TYPE_CHECKING,
42 ClassVar,
43 Dict,
44 Generator,
45 Iterable,
46 Iterator,
47 Mapping,
48 Optional,
49 Set,
50 Tuple,
51 Union,
52)
54# -----------------------------
55# Imports for other modules --
56from lsst.daf.butler import DatasetType, NamedValueSet, Registry, SkyPixDimension
57from lsst.resources import ResourcePath, ResourcePathExpression
58from lsst.utils import doImport
59from lsst.utils.introspection import get_full_type_name
61from . import pipelineIR, pipeTools
62from ._task_metadata import TaskMetadata
63from .configOverrides import ConfigOverrides
64from .connections import iterConnections
65from .pipelineTask import PipelineTask
66from .task import _TASK_METADATA_TYPE
if TYPE_CHECKING:  # Imports needed only for type annotations; may be circular.
69 from lsst.obs.base import Instrument
71# ----------------------------------
72# Local non-exported definitions --
73# ----------------------------------
75_LOG = logging.getLogger(__name__)
77# ------------------------
78# Exported definitions --
79# ------------------------
82@dataclass
83class LabelSpecifier:
84 """A structure to specify a subset of labels to load
86 This structure may contain a set of labels to be used in subsetting a
87 pipeline, or a beginning and end point. Beginning or end may be empty,
88 in which case the range will be a half open interval. Unlike python
89 iteration bounds, end bounds are *INCLUDED*. Note that range based
90 selection is not well defined for pipelines that are not linear in nature,
91 and correct behavior is not guaranteed, or may vary from run to run.
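
    Examples
    --------
    A minimal sketch of the two ways to build a specifier; the labels used
    here are hypothetical::

        # An explicit set of labels.
        subset = LabelSpecifier(labels={"taskA", "taskB"})

        # A contiguous range of labels (the end bound is included).
        bounded = LabelSpecifier(begin="taskA", end="taskC")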
92 """
94 labels: Optional[Set[str]] = None
95 begin: Optional[str] = None
96 end: Optional[str] = None
98 def __post_init__(self):
99 if self.labels is not None and (self.begin or self.end):
100 raise ValueError(
101 "This struct can only be initialized with a labels set or a begin (and/or) end specifier"
102 )
105class TaskDef:
    """TaskDef is a collection of information about a task needed by a Pipeline.
108 The information includes task name, configuration object and optional
109 task class. This class is just a collection of attributes and it exposes
110 all of them so that attributes could potentially be modified in place
111 (e.g. if configuration needs extra overrides).
113 Attributes
114 ----------
115 taskName : `str`, optional
116 `PipelineTask` class name, currently it is not specified whether this
117 is a fully-qualified name or partial name (e.g. ``module.TaskClass``).
118 Framework should be prepared to handle all cases. If not provided,
119 ``taskClass`` must be, and ``taskClass.__name__`` is used.
120 config : `lsst.pex.config.Config`, optional
121 Instance of the configuration class corresponding to this task class,
122 usually with all overrides applied. This config will be frozen. If
123 not provided, ``taskClass`` must be provided and
124 ``taskClass.ConfigClass()`` will be used.
125 taskClass : `type`, optional
126 `PipelineTask` class object, can be ``None``. If ``None`` then
127 framework will have to locate and load class.
128 label : `str`, optional
129 Task label, usually a short string unique in a pipeline. If not
130 provided, ``taskClass`` must be, and ``taskClass._DefaultName`` will
131 be used.
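
    Examples
    --------
    A minimal sketch, assuming a hypothetical `PipelineTask` subclass named
    ``ExampleTask``; every argument omitted in the first form is derived from
    the task class as described above::

        task_def = TaskDef(taskClass=ExampleTask)

        # The same thing with every argument spelled out explicitly.
        task_def = TaskDef(
            taskName=ExampleTask.__name__,
            config=ExampleTask.ConfigClass(),
            taskClass=ExampleTask,
            label=ExampleTask._DefaultName,
        )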
132 """
134 def __init__(self, taskName=None, config=None, taskClass=None, label=None):
135 if taskName is None:
136 if taskClass is None:
137 raise ValueError("At least one of `taskName` and `taskClass` must be provided.")
138 taskName = taskClass.__name__
139 if config is None:
140 if taskClass is None:
141 raise ValueError("`taskClass` must be provided if `config` is not.")
142 config = taskClass.ConfigClass()
143 if label is None:
144 if taskClass is None:
145 raise ValueError("`taskClass` must be provided if `label` is not.")
146 label = taskClass._DefaultName
147 self.taskName = taskName
148 try:
149 config.validate()
150 except Exception:
151 _LOG.error("Configuration validation failed for task %s (%s)", label, taskName)
152 raise
153 config.freeze()
154 self.config = config
155 self.taskClass = taskClass
156 self.label = label
157 self.connections = config.connections.ConnectionsClass(config=config)
159 @property
160 def configDatasetName(self) -> str:
161 """Name of a dataset type for configuration of this task (`str`)"""
162 return self.label + "_config"
164 @property
165 def metadataDatasetName(self) -> Optional[str]:
166 """Name of a dataset type for metadata of this task, `None` if
167 metadata is not to be saved (`str`)
168 """
169 if self.config.saveMetadata:
170 return self.label + "_metadata"
171 else:
172 return None
174 @property
175 def logOutputDatasetName(self) -> Optional[str]:
176 """Name of a dataset type for log output from this task, `None` if
177 logs are not to be saved (`str`)
178 """
179 if self.config.saveLogOutput:
180 return self.label + "_log"
181 else:
182 return None
184 def __str__(self):
185 rep = "TaskDef(" + self.taskName
186 if self.label:
187 rep += ", label=" + self.label
188 rep += ")"
189 return rep
191 def __eq__(self, other: object) -> bool:
192 if not isinstance(other, TaskDef):
193 return False
194 # This does not consider equality of configs when determining equality
195 # as config equality is a difficult thing to define. Should be updated
196 # after DM-27847
197 return self.taskClass == other.taskClass and self.label == other.label
199 def __hash__(self):
200 return hash((self.taskClass, self.label))
203class Pipeline:
204 """A `Pipeline` is a representation of a series of tasks to run, and the
205 configuration for those tasks.
207 Parameters
208 ----------
209 description : `str`
        A description of what this pipeline does.
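
    Examples
    --------
    A minimal sketch of building a pipeline programmatically; the task class,
    label, and config field used here are hypothetical::

        pipeline = Pipeline("A short example pipeline")
        pipeline.addTask("mypackage.tasks.ExampleTask", "example")
        pipeline.addConfigOverride("example", "someField", 42)
        pipeline.write_to_uri("example_pipeline.yaml")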
211 """
213 def __init__(self, description: str):
214 pipeline_dict = {"description": description, "tasks": {}}
215 self._pipelineIR = pipelineIR.PipelineIR(pipeline_dict)
217 @classmethod
218 def fromFile(cls, filename: str) -> Pipeline:
219 """Load a pipeline defined in a pipeline yaml file.
221 Parameters
222 ----------
223 filename: `str`
224 A path that points to a pipeline defined in yaml format. This
225 filename may also supply additional labels to be used in
226 subsetting the loaded Pipeline. These labels are separated from
227 the path by a \\#, and may be specified as a comma separated
228 list, or a range denoted as beginning..end. Beginning or end may
229 be empty, in which case the range will be a half open interval.
230 Unlike python iteration bounds, end bounds are *INCLUDED*. Note
231 that range based selection is not well defined for pipelines that
232 are not linear in nature, and correct behavior is not guaranteed,
233 or may vary from run to run.
235 Returns
236 -------
237 pipeline: `Pipeline`
238 The pipeline loaded from specified location with appropriate (if
239 any) subsetting
241 Notes
242 -----
243 This method attempts to prune any contracts that contain labels which
244 are not in the declared subset of labels. This pruning is done using a
245 string based matching due to the nature of contracts and may prune more
246 than it should.
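
        Examples
        --------
        A sketch of the label syntax described above; the file name and
        labels are hypothetical::

            # Load the whole pipeline.
            pipeline = Pipeline.fromFile("pipeline.yaml")

            # Load only two labelled tasks.
            subset = Pipeline.fromFile("pipeline.yaml#taskA,taskB")

            # Load an inclusive range of tasks (both endpoints included).
            bounded = Pipeline.fromFile("pipeline.yaml#taskA..taskC")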
247 """
248 return cls.from_uri(filename)
250 @classmethod
251 def from_uri(cls, uri: ResourcePathExpression) -> Pipeline:
252 """Load a pipeline defined in a pipeline yaml file at a location
253 specified by a URI.
255 Parameters
256 ----------
257 uri: convertible to `ResourcePath`
258 If a string is supplied this should be a URI path that points to a
259 pipeline defined in yaml format, either as a direct path to the
260 yaml file, or as a directory containing a "pipeline.yaml" file (the
261 form used by `write_to_uri` with ``expand=True``). This uri may
262 also supply additional labels to be used in subsetting the loaded
263 Pipeline. These labels are separated from the path by a \\#, and
264 may be specified as a comma separated list, or a range denoted as
265 beginning..end. Beginning or end may be empty, in which case the
266 range will be a half open interval. Unlike python iteration bounds,
267 end bounds are *INCLUDED*. Note that range based selection is not
268 well defined for pipelines that are not linear in nature, and
269 correct behavior is not guaranteed, or may vary from run to run.
270 The same specifiers can be used with a `ResourcePath` object, by
            being the sole contents of the fragment attribute.
273 Returns
274 -------
275 pipeline: `Pipeline`
276 The pipeline loaded from specified location with appropriate (if
277 any) subsetting
279 Notes
280 -----
281 This method attempts to prune any contracts that contain labels which
282 are not in the declared subset of labels. This pruning is done using a
283 string based matching due to the nature of contracts and may prune more
284 than it should.
285 """
286 # Split up the uri and any labels that were supplied
287 uri, label_specifier = cls._parse_file_specifier(uri)
288 pipeline: Pipeline = cls.fromIR(pipelineIR.PipelineIR.from_uri(uri))
290 # If there are labels supplied, only keep those
291 if label_specifier is not None:
292 pipeline = pipeline.subsetFromLabels(label_specifier)
293 return pipeline
295 def subsetFromLabels(self, labelSpecifier: LabelSpecifier) -> Pipeline:
296 """Subset a pipeline to contain only labels specified in labelSpecifier
298 Parameters
299 ----------
        labelSpecifier : `LabelSpecifier`
301 Object containing labels that describes how to subset a pipeline.
303 Returns
304 -------
305 pipeline : `Pipeline`
306 A new pipeline object that is a subset of the old pipeline
308 Raises
309 ------
310 ValueError
311 Raised if there is an issue with specified labels
313 Notes
314 -----
315 This method attempts to prune any contracts that contain labels which
316 are not in the declared subset of labels. This pruning is done using a
317 string based matching due to the nature of contracts and may prune more
318 than it should.
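
        Examples
        --------
        A sketch using hypothetical labels::

            subset = pipeline.subsetFromLabels(
                LabelSpecifier(labels={"taskA", "taskB"})
            )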
319 """
320 # Labels supplied as a set
321 if labelSpecifier.labels:
322 labelSet = labelSpecifier.labels
323 # Labels supplied as a range, first create a list of all the labels
324 # in the pipeline sorted according to task dependency. Then only
325 # keep labels that lie between the supplied bounds
326 else:
327 # Create a copy of the pipeline to use when assessing the label
328 # ordering. Use a dict for fast searching while preserving order.
329 # Remove contracts so they do not fail in the expansion step. This
330 # is needed because a user may only configure the tasks they intend
331 # to run, which may cause some contracts to fail if they will later
332 # be dropped
333 pipeline = copy.deepcopy(self)
334 pipeline._pipelineIR.contracts = []
335 labels = {taskdef.label: True for taskdef in pipeline.toExpandedPipeline()}
337 # Verify the bounds are in the labels
338 if labelSpecifier.begin is not None:
339 if labelSpecifier.begin not in labels:
340 raise ValueError(
341 f"Beginning of range subset, {labelSpecifier.begin}, not found in "
342 "pipeline definition"
343 )
344 if labelSpecifier.end is not None:
345 if labelSpecifier.end not in labels:
346 raise ValueError(
347 f"End of range subset, {labelSpecifier.end}, not found in pipeline definition"
348 )
350 labelSet = set()
351 for label in labels:
352 if labelSpecifier.begin is not None:
353 if label != labelSpecifier.begin:
354 continue
355 else:
356 labelSpecifier.begin = None
357 labelSet.add(label)
358 if labelSpecifier.end is not None and label == labelSpecifier.end:
359 break
360 return Pipeline.fromIR(self._pipelineIR.subset_from_labels(labelSet))
362 @staticmethod
363 def _parse_file_specifier(uri: ResourcePathExpression) -> Tuple[ResourcePath, Optional[LabelSpecifier]]:
        """Split apart a URI and any possible label subsets."""
365 if isinstance(uri, str):
366 # This is to support legacy pipelines during transition
367 uri, num_replace = re.subn("[:](?!\\/\\/)", "#", uri)
368 if num_replace:
369 warnings.warn(
370 f"The pipeline file {uri} seems to use the legacy : to separate "
371 "labels, this is deprecated and will be removed after June 2021, please use "
372 "# instead.",
373 category=FutureWarning,
374 )
375 if uri.count("#") > 1:
376 raise ValueError("Only one set of labels is allowed when specifying a pipeline to load")
377 # Everything else can be converted directly to ResourcePath.
378 uri = ResourcePath(uri)
379 label_subset = uri.fragment or None
381 specifier: Optional[LabelSpecifier]
382 if label_subset is not None:
383 label_subset = urllib.parse.unquote(label_subset)
384 args: Dict[str, Union[Set[str], str, None]]
385 # labels supplied as a list
386 if "," in label_subset:
387 if ".." in label_subset:
388 raise ValueError(
                        "Can only specify a list of labels or a range when loading a Pipeline, not both"
390 )
391 args = {"labels": set(label_subset.split(","))}
392 # labels supplied as a range
393 elif ".." in label_subset:
394 # Try to de-structure the labelSubset, this will fail if more
395 # than one range is specified
396 begin, end, *rest = label_subset.split("..")
397 if rest:
398 raise ValueError("Only one range can be specified when loading a pipeline")
399 args = {"begin": begin if begin else None, "end": end if end else None}
400 # Assume anything else is a single label
401 else:
402 args = {"labels": {label_subset}}
404 specifier = LabelSpecifier(**args)
405 else:
406 specifier = None
408 return uri, specifier
410 @classmethod
411 def fromString(cls, pipeline_string: str) -> Pipeline:
412 """Create a pipeline from string formatted as a pipeline document.
414 Parameters
415 ----------
416 pipeline_string : `str`
            A string formatted like a pipeline document.
419 Returns
420 -------
421 pipeline: `Pipeline`
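
        Examples
        --------
        A minimal sketch of a pipeline document; the task class and label are
        hypothetical, and only the simplest form of the format is shown::

            import textwrap

            pipeline = Pipeline.fromString(textwrap.dedent('''
                description: A short example pipeline
                tasks:
                  example:
                    class: mypackage.tasks.ExampleTask
            '''))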
422 """
423 pipeline = cls.fromIR(pipelineIR.PipelineIR.from_string(pipeline_string))
424 return pipeline
426 @classmethod
427 def fromIR(cls, deserialized_pipeline: pipelineIR.PipelineIR) -> Pipeline:
428 """Create a pipeline from an already created `PipelineIR` object.
430 Parameters
431 ----------
432 deserialized_pipeline: `PipelineIR`
433 An already created pipeline intermediate representation object
435 Returns
436 -------
437 pipeline: `Pipeline`
438 """
439 pipeline = cls.__new__(cls)
440 pipeline._pipelineIR = deserialized_pipeline
441 return pipeline
443 @classmethod
    def fromPipeline(cls, pipeline: Pipeline) -> Pipeline:
445 """Create a new pipeline by copying an already existing `Pipeline`.
447 Parameters
448 ----------
449 pipeline: `Pipeline`
            The existing `Pipeline` object to copy.
452 Returns
453 -------
454 pipeline: `Pipeline`
455 """
456 return cls.fromIR(copy.deepcopy(pipeline._pipelineIR))
458 def __str__(self) -> str:
        # Tasks need to be sorted on each call because someone might have
        # added or removed a task; caching the ordering does not seem worth
        # the small overhead.
462 labels = [td.label for td in self._toExpandedPipelineImpl(checkContracts=False)]
463 self._pipelineIR.reorder_tasks(labels)
464 return str(self._pipelineIR)
466 def addInstrument(self, instrument: Union[Instrument, str]) -> None:
467 """Add an instrument to the pipeline, or replace an instrument that is
468 already defined.
470 Parameters
471 ----------
        instrument : `~lsst.obs.base.Instrument` or `str`
            Either an `~lsst.obs.base.Instrument` subclass or a string
            corresponding to a fully qualified `~lsst.obs.base.Instrument`
            name.
476 """
477 if isinstance(instrument, str):
478 pass
479 else:
480 # TODO: assume that this is a subclass of Instrument, no type
481 # checking
482 instrument = get_full_type_name(instrument)
483 self._pipelineIR.instrument = instrument
485 def getInstrument(self) -> Instrument:
486 """Get the instrument from the pipeline.
488 Returns
489 -------
        instrument : `~lsst.obs.base.Instrument`, `str`, or `None`
            An `~lsst.obs.base.Instrument` subclass, a string corresponding
            to a fully qualified `~lsst.obs.base.Instrument` name, or `None`
            if the pipeline does not have an instrument.
494 """
495 return self._pipelineIR.instrument
497 def addTask(self, task: Union[PipelineTask, str], label: str) -> None:
498 """Add a new task to the pipeline, or replace a task that is already
499 associated with the supplied label.
501 Parameters
502 ----------
503 task: `PipelineTask` or `str`
504 Either a derived class object of a `PipelineTask` or a string
505 corresponding to a fully qualified `PipelineTask` name.
506 label: `str`
507 A label that is used to identify the `PipelineTask` being added
508 """
509 if isinstance(task, str):
510 taskName = task
511 elif issubclass(task, PipelineTask):
512 taskName = f"{task.__module__}.{task.__qualname__}"
513 else:
514 raise ValueError(
515 "task must be either a child class of PipelineTask or a string containing"
516 " a fully qualified name to one"
517 )
518 if not label:
            # In some cases (e.g. with a command line-generated pipeline)
            # tasks can be defined without a label, which is not acceptable;
            # use the task's _DefaultName in that case.
522 if isinstance(task, str):
523 task = doImport(task)
524 label = task._DefaultName
525 self._pipelineIR.tasks[label] = pipelineIR.TaskIR(label, taskName)
527 def removeTask(self, label: str) -> None:
528 """Remove a task from the pipeline.
530 Parameters
531 ----------
532 label : `str`
533 The label used to identify the task that is to be removed
535 Raises
536 ------
537 KeyError
538 If no task with that label exists in the pipeline
540 """
541 self._pipelineIR.tasks.pop(label)
543 def addConfigOverride(self, label: str, key: str, value: object) -> None:
544 """Apply single config override.
546 Parameters
547 ----------
548 label : `str`
549 Label of the task.
550 key: `str`
551 Fully-qualified field name.
552 value : object
553 Value to be given to a field.
554 """
555 self._addConfigImpl(label, pipelineIR.ConfigIR(rest={key: value}))
557 def addConfigFile(self, label: str, filename: str) -> None:
558 """Add overrides from a specified file.
560 Parameters
561 ----------
562 label : `str`
563 The label used to identify the task associated with config to
564 modify
565 filename : `str`
566 Path to the override file.
567 """
568 self._addConfigImpl(label, pipelineIR.ConfigIR(file=[filename]))
570 def addConfigPython(self, label: str, pythonString: str) -> None:
571 """Add Overrides by running a snippet of python code against a config.
573 Parameters
574 ----------
575 label : `str`
            The label used to identify the task associated with the config to
577 modify.
578 pythonString: `str`
579 A string which is valid python code to be executed. This is done
580 with config as the only local accessible value.
581 """
582 self._addConfigImpl(label, pipelineIR.ConfigIR(python=pythonString))
584 def _addConfigImpl(self, label: str, newConfig: pipelineIR.ConfigIR) -> None:
585 if label == "parameters":
586 if newConfig.rest.keys() - self._pipelineIR.parameters.mapping.keys():
587 raise ValueError("Cannot override parameters that are not defined in pipeline")
588 self._pipelineIR.parameters.mapping.update(newConfig.rest)
589 if newConfig.file:
590 raise ValueError("Setting parameters section with config file is not supported")
591 if newConfig.python:
                raise ValueError("Setting parameters section using python block is unsupported")
593 return
594 if label not in self._pipelineIR.tasks:
595 raise LookupError(f"There are no tasks labeled '{label}' in the pipeline")
596 self._pipelineIR.tasks[label].add_or_update_config(newConfig)
598 def toFile(self, filename: str) -> None:
599 self._pipelineIR.to_file(filename)
601 def write_to_uri(self, uri: ResourcePathExpression) -> None:
602 """Write the pipeline to a file or directory.
604 Parameters
605 ----------
606 uri : convertible to `ResourcePath`
607 URI to write to; may have any scheme with `ResourcePath` write
            support or no scheme for a local file/directory.  Should have a
            ``.yaml`` extension.
610 """
611 labels = [td.label for td in self._toExpandedPipelineImpl(checkContracts=False)]
612 self._pipelineIR.reorder_tasks(labels)
613 self._pipelineIR.write_to_uri(uri)
615 def toExpandedPipeline(self) -> Generator[TaskDef, None, None]:
616 """Returns a generator of TaskDefs which can be used to create quantum
617 graphs.
619 Returns
620 -------
621 generator : generator of `TaskDef`
622 The generator returned will be the sorted iterator of tasks which
623 are to be used in constructing a quantum graph.
625 Raises
626 ------
627 NotImplementedError
            Raised if a dataId is supplied in a config block; this is in
            place for future use.
630 """
631 yield from self._toExpandedPipelineImpl()
633 def _toExpandedPipelineImpl(self, checkContracts=True) -> Iterable[TaskDef]:
634 taskDefs = []
635 for label in self._pipelineIR.tasks:
636 taskDefs.append(self._buildTaskDef(label))
        # Let's evaluate the contracts.
639 if self._pipelineIR.contracts is not None:
640 label_to_config = {x.label: x.config for x in taskDefs}
641 for contract in self._pipelineIR.contracts:
                # Execute this on its own line so it can raise a good error
                # message if there were problems with the eval.
644 success = eval(contract.contract, None, label_to_config)
645 if not success:
646 extra_info = f": {contract.msg}" if contract.msg is not None else ""
647 raise pipelineIR.ContractError(
648 f"Contract(s) '{contract.contract}' were not satisfied{extra_info}"
649 )
651 taskDefs = sorted(taskDefs, key=lambda x: x.label)
652 yield from pipeTools.orderPipeline(taskDefs)
654 def _buildTaskDef(self, label: str) -> TaskDef:
655 if (taskIR := self._pipelineIR.tasks.get(label)) is None:
656 raise NameError(f"Label {label} does not appear in this pipeline")
657 taskClass = doImport(taskIR.klass)
658 taskName = taskClass.__qualname__
659 config = taskClass.ConfigClass()
660 overrides = ConfigOverrides()
661 if self._pipelineIR.instrument is not None:
662 overrides.addInstrumentOverride(self._pipelineIR.instrument, taskClass._DefaultName)
663 if taskIR.config is not None:
664 for configIR in (configIr.formatted(self._pipelineIR.parameters) for configIr in taskIR.config):
665 if configIR.dataId is not None:
666 raise NotImplementedError(
667 "Specializing a config on a partial data id is not yet "
668 "supported in Pipeline definition"
669 )
670 # only apply override if it applies to everything
671 if configIR.dataId is None:
672 if configIR.file:
673 for configFile in configIR.file:
674 overrides.addFileOverride(os.path.expandvars(configFile))
675 if configIR.python is not None:
676 overrides.addPythonOverride(configIR.python)
677 for key, value in configIR.rest.items():
678 overrides.addValueOverride(key, value)
679 overrides.applyTo(config)
680 return TaskDef(taskName=taskName, config=config, taskClass=taskClass, label=label)
682 def __iter__(self) -> Generator[TaskDef, None, None]:
683 return self.toExpandedPipeline()
685 def __getitem__(self, item: str) -> TaskDef:
686 return self._buildTaskDef(item)
688 def __len__(self):
689 return len(self._pipelineIR.tasks)
691 def __eq__(self, other: object):
692 if not isinstance(other, Pipeline):
693 return False
694 return self._pipelineIR == other._pipelineIR
697@dataclass(frozen=True)
698class TaskDatasetTypes:
699 """An immutable struct that extracts and classifies the dataset types used
700 by a `PipelineTask`
701 """
703 initInputs: NamedValueSet[DatasetType]
704 """Dataset types that are needed as inputs in order to construct this Task.
706 Task-level `initInputs` may be classified as either
707 `~PipelineDatasetTypes.initInputs` or
708 `~PipelineDatasetTypes.initIntermediates` at the Pipeline level.
709 """
711 initOutputs: NamedValueSet[DatasetType]
712 """Dataset types that may be written after constructing this Task.
714 Task-level `initOutputs` may be classified as either
715 `~PipelineDatasetTypes.initOutputs` or
716 `~PipelineDatasetTypes.initIntermediates` at the Pipeline level.
717 """
719 inputs: NamedValueSet[DatasetType]
720 """Dataset types that are regular inputs to this Task.
722 If an input dataset needed for a Quantum cannot be found in the input
723 collection(s) or produced by another Task in the Pipeline, that Quantum
724 (and all dependent Quanta) will not be produced.
726 Task-level `inputs` may be classified as either
727 `~PipelineDatasetTypes.inputs` or `~PipelineDatasetTypes.intermediates`
728 at the Pipeline level.
729 """
731 prerequisites: NamedValueSet[DatasetType]
732 """Dataset types that are prerequisite inputs to this Task.
734 Prerequisite inputs must exist in the input collection(s) before the
735 pipeline is run, but do not constrain the graph - if a prerequisite is
736 missing for a Quantum, `PrerequisiteMissingError` is raised.
738 Prerequisite inputs are not resolved until the second stage of
739 QuantumGraph generation.
740 """
742 outputs: NamedValueSet[DatasetType]
743 """Dataset types that are produced by this Task.
745 Task-level `outputs` may be classified as either
746 `~PipelineDatasetTypes.outputs` or `~PipelineDatasetTypes.intermediates`
747 at the Pipeline level.
748 """
750 @classmethod
751 def fromTaskDef(
752 cls,
753 taskDef: TaskDef,
754 *,
755 registry: Registry,
756 include_configs: bool = True,
757 storage_class_mapping: Optional[Mapping[str, str]] = None,
758 ) -> TaskDatasetTypes:
759 """Extract and classify the dataset types from a single `PipelineTask`.
761 Parameters
762 ----------
763 taskDef: `TaskDef`
764 An instance of a `TaskDef` class for a particular `PipelineTask`.
765 registry: `Registry`
766 Registry used to construct normalized `DatasetType` objects and
767 retrieve those that are incomplete.
768 include_configs : `bool`, optional
769 If `True` (default) include config dataset types as
770 ``initOutputs``.
        storage_class_mapping : `Mapping` of `str` to `str`, optional
            If a taskdef contains a component dataset type that is unknown
            to the registry, the name of its parent storage class will be
            looked up in this mapping if it is supplied.  If the mapping does
            not contain the composite dataset type, or the mapping is not
            supplied, an exception will be raised.
778 Returns
779 -------
780 types: `TaskDatasetTypes`
781 The dataset types used by this task.
783 Raises
784 ------
785 ValueError
786 Raised if dataset type connection definition differs from
787 registry definition.
788 LookupError
789 Raised if component parent StorageClass could not be determined
790 and storage_class_mapping does not contain the composite type, or
791 is set to None.
792 """
794 def makeDatasetTypesSet(connectionType: str, freeze: bool = True) -> NamedValueSet[DatasetType]:
795 """Constructs a set of true `DatasetType` objects
797 Parameters
798 ----------
799 connectionType : `str`
800 Name of the connection type to produce a set for, corresponds
801 to an attribute of type `list` on the connection class instance
802 freeze : `bool`, optional
803 If `True`, call `NamedValueSet.freeze` on the object returned.
805 Returns
806 -------
807 datasetTypes : `NamedValueSet`
808 A set of all datasetTypes which correspond to the input
809 connection type specified in the connection class of this
810 `PipelineTask`
812 Raises
813 ------
814 ValueError
815 Raised if dataset type connection definition differs from
816 registry definition.
817 LookupError
818 Raised if component parent StorageClass could not be determined
819 and storage_class_mapping does not contain the composite type,
820 or is set to None.
822 Notes
823 -----
            This function is a closure over the variables ``registry``,
            ``taskDef``, and ``storage_class_mapping``.
826 """
827 datasetTypes = NamedValueSet()
828 for c in iterConnections(taskDef.connections, connectionType):
829 dimensions = set(getattr(c, "dimensions", set()))
830 if "skypix" in dimensions:
831 try:
832 datasetType = registry.getDatasetType(c.name)
833 except LookupError as err:
834 raise LookupError(
835 f"DatasetType '{c.name}' referenced by "
836 f"{type(taskDef.connections).__name__} uses 'skypix' as a dimension "
837 f"placeholder, but does not already exist in the registry. "
838 f"Note that reference catalog names are now used as the dataset "
839 f"type name instead of 'ref_cat'."
840 ) from err
841 rest1 = set(registry.dimensions.extract(dimensions - set(["skypix"])).names)
842 rest2 = set(
843 dim.name for dim in datasetType.dimensions if not isinstance(dim, SkyPixDimension)
844 )
845 if rest1 != rest2:
846 raise ValueError(
847 f"Non-skypix dimensions for dataset type {c.name} declared in "
848 f"connections ({rest1}) are inconsistent with those in "
849 f"registry's version of this dataset ({rest2})."
850 )
851 else:
852 # Component dataset types are not explicitly in the
853 # registry. This complicates consistency checks with
854 # registry and requires we work out the composite storage
855 # class.
856 registryDatasetType = None
857 try:
858 registryDatasetType = registry.getDatasetType(c.name)
859 except KeyError:
860 compositeName, componentName = DatasetType.splitDatasetTypeName(c.name)
861 if componentName:
862 if storage_class_mapping is None or compositeName not in storage_class_mapping:
863 raise LookupError(
864 "Component parent class cannot be determined, and "
865 "composite name was not in storage class mapping, or no "
866 "storage_class_mapping was supplied"
867 )
868 else:
869 parentStorageClass = storage_class_mapping[compositeName]
870 else:
871 parentStorageClass = None
872 datasetType = c.makeDatasetType(
873 registry.dimensions, parentStorageClass=parentStorageClass
874 )
875 registryDatasetType = datasetType
876 else:
877 datasetType = c.makeDatasetType(
878 registry.dimensions, parentStorageClass=registryDatasetType.parentStorageClass
879 )
881 if registryDatasetType and datasetType != registryDatasetType:
882 try:
883 # Explicitly check for storage class just to make
884 # more specific message.
885 _ = datasetType.storageClass
886 except KeyError:
887 raise ValueError(
888 "Storage class does not exist for supplied dataset type "
889 f"{datasetType} for {taskDef.label}."
890 ) from None
891 raise ValueError(
892 f"Supplied dataset type ({datasetType}) inconsistent with "
893 f"registry definition ({registryDatasetType}) "
894 f"for {taskDef.label}."
895 )
896 datasetTypes.add(datasetType)
897 if freeze:
898 datasetTypes.freeze()
899 return datasetTypes
901 # optionally add initOutput dataset for config
902 initOutputs = makeDatasetTypesSet("initOutputs", freeze=False)
903 if include_configs:
904 initOutputs.add(
905 DatasetType(
906 taskDef.configDatasetName,
907 registry.dimensions.empty,
908 storageClass="Config",
909 )
910 )
911 initOutputs.freeze()
913 # optionally add output dataset for metadata
914 outputs = makeDatasetTypesSet("outputs", freeze=False)
915 if taskDef.metadataDatasetName is not None:
916 # Metadata is supposed to be of the TaskMetadata type, its
917 # dimensions correspond to a task quantum.
918 dimensions = registry.dimensions.extract(taskDef.connections.dimensions)
920 # Allow the storage class definition to be read from the existing
921 # dataset type definition if present.
922 try:
923 current = registry.getDatasetType(taskDef.metadataDatasetName)
924 except KeyError:
925 # No previous definition so use the default.
926 storageClass = "TaskMetadata" if _TASK_METADATA_TYPE is TaskMetadata else "PropertySet"
927 else:
928 storageClass = current.storageClass.name
930 outputs |= {DatasetType(taskDef.metadataDatasetName, dimensions, storageClass)}
931 if taskDef.logOutputDatasetName is not None:
932 # Log output dimensions correspond to a task quantum.
933 dimensions = registry.dimensions.extract(taskDef.connections.dimensions)
934 outputs |= {DatasetType(taskDef.logOutputDatasetName, dimensions, "ButlerLogRecords")}
936 outputs.freeze()
938 return cls(
939 initInputs=makeDatasetTypesSet("initInputs"),
940 initOutputs=initOutputs,
941 inputs=makeDatasetTypesSet("inputs"),
942 prerequisites=makeDatasetTypesSet("prerequisiteInputs"),
943 outputs=outputs,
944 )
947@dataclass(frozen=True)
948class PipelineDatasetTypes:
949 """An immutable struct that classifies the dataset types used in a
950 `Pipeline`.
951 """
953 packagesDatasetName: ClassVar[str] = "packages"
954 """Name of a dataset type used to save package versions.
955 """
957 initInputs: NamedValueSet[DatasetType]
958 """Dataset types that are needed as inputs in order to construct the Tasks
959 in this Pipeline.
961 This does not include dataset types that are produced when constructing
962 other Tasks in the Pipeline (these are classified as `initIntermediates`).
963 """
965 initOutputs: NamedValueSet[DatasetType]
966 """Dataset types that may be written after constructing the Tasks in this
967 Pipeline.
969 This does not include dataset types that are also used as inputs when
970 constructing other Tasks in the Pipeline (these are classified as
971 `initIntermediates`).
972 """
974 initIntermediates: NamedValueSet[DatasetType]
975 """Dataset types that are both used when constructing one or more Tasks
976 in the Pipeline and produced as a side-effect of constructing another
977 Task in the Pipeline.
978 """
980 inputs: NamedValueSet[DatasetType]
981 """Dataset types that are regular inputs for the full pipeline.
983 If an input dataset needed for a Quantum cannot be found in the input
984 collection(s), that Quantum (and all dependent Quanta) will not be
985 produced.
986 """
988 prerequisites: NamedValueSet[DatasetType]
989 """Dataset types that are prerequisite inputs for the full Pipeline.
991 Prerequisite inputs must exist in the input collection(s) before the
992 pipeline is run, but do not constrain the graph - if a prerequisite is
993 missing for a Quantum, `PrerequisiteMissingError` is raised.
995 Prerequisite inputs are not resolved until the second stage of
996 QuantumGraph generation.
997 """
999 intermediates: NamedValueSet[DatasetType]
1000 """Dataset types that are output by one Task in the Pipeline and consumed
1001 as inputs by one or more other Tasks in the Pipeline.
1002 """
1004 outputs: NamedValueSet[DatasetType]
1005 """Dataset types that are output by a Task in the Pipeline and not consumed
1006 by any other Task in the Pipeline.
1007 """
1009 byTask: Mapping[str, TaskDatasetTypes]
1010 """Per-Task dataset types, keyed by label in the `Pipeline`.
1012 This is guaranteed to be zip-iterable with the `Pipeline` itself (assuming
1013 neither has been modified since the dataset types were extracted, of
1014 course).
1015 """
1017 @classmethod
1018 def fromPipeline(
1019 cls,
1020 pipeline: Union[Pipeline, Iterable[TaskDef]],
1021 *,
1022 registry: Registry,
1023 include_configs: bool = True,
1024 include_packages: bool = True,
1025 ) -> PipelineDatasetTypes:
1026 """Extract and classify the dataset types from all tasks in a
1027 `Pipeline`.
1029 Parameters
1030 ----------
1031 pipeline: `Pipeline` or `Iterable` [ `TaskDef` ]
1032 A collection of tasks that can be run together.
1033 registry: `Registry`
1034 Registry used to construct normalized `DatasetType` objects and
1035 retrieve those that are incomplete.
1036 include_configs : `bool`, optional
1037 If `True` (default) include config dataset types as
1038 ``initOutputs``.
1039 include_packages : `bool`, optional
1040 If `True` (default) include the dataset type for software package
1041 versions in ``initOutputs``.
1043 Returns
1044 -------
1045 types: `PipelineDatasetTypes`
1046 The dataset types used by this `Pipeline`.
1048 Raises
1049 ------
1050 ValueError
1051 Raised if Tasks are inconsistent about which datasets are marked
1052 prerequisite. This indicates that the Tasks cannot be run as part
1053 of the same `Pipeline`.
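
        Examples
        --------
        A sketch of typical use, assuming a `Registry` instance named
        ``registry`` and a hypothetical task label ``"exampleTask"``::

            dataset_types = PipelineDatasetTypes.fromPipeline(
                pipeline, registry=registry
            )
            input_names = {dt.name for dt in dataset_types.inputs}
            per_task = dataset_types.byTask["exampleTask"]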
1054 """
1055 allInputs = NamedValueSet()
1056 allOutputs = NamedValueSet()
1057 allInitInputs = NamedValueSet()
1058 allInitOutputs = NamedValueSet()
1059 prerequisites = NamedValueSet()
1060 byTask = dict()
1061 if include_packages:
1062 allInitOutputs.add(
1063 DatasetType(
1064 cls.packagesDatasetName,
1065 registry.dimensions.empty,
1066 storageClass="Packages",
1067 )
1068 )
1069 # create a list of TaskDefs in case the input is a generator
1070 pipeline = list(pipeline)
1072 # collect all the output dataset types
1073 typeStorageclassMap: Dict[str, str] = {}
1074 for taskDef in pipeline:
1075 for outConnection in iterConnections(taskDef.connections, "outputs"):
1076 typeStorageclassMap[outConnection.name] = outConnection.storageClass
1078 for taskDef in pipeline:
1079 thisTask = TaskDatasetTypes.fromTaskDef(
1080 taskDef,
1081 registry=registry,
1082 include_configs=include_configs,
1083 storage_class_mapping=typeStorageclassMap,
1084 )
1085 allInitInputs |= thisTask.initInputs
1086 allInitOutputs |= thisTask.initOutputs
1087 allInputs |= thisTask.inputs
1088 prerequisites |= thisTask.prerequisites
1089 allOutputs |= thisTask.outputs
1090 byTask[taskDef.label] = thisTask
1091 if not prerequisites.isdisjoint(allInputs):
1092 raise ValueError(
1093 "{} marked as both prerequisites and regular inputs".format(
1094 {dt.name for dt in allInputs & prerequisites}
1095 )
1096 )
1097 if not prerequisites.isdisjoint(allOutputs):
1098 raise ValueError(
1099 "{} marked as both prerequisites and outputs".format(
1100 {dt.name for dt in allOutputs & prerequisites}
1101 )
1102 )
1103 # Make sure that components which are marked as inputs get treated as
1104 # intermediates if there is an output which produces the composite
1105 # containing the component
1106 intermediateComponents = NamedValueSet()
1107 intermediateComposites = NamedValueSet()
1108 outputNameMapping = {dsType.name: dsType for dsType in allOutputs}
1109 for dsType in allInputs:
1110 # get the name of a possible component
1111 name, component = dsType.nameAndComponent()
1112 # if there is a component name, that means this is a component
1113 # DatasetType, if there is an output which produces the parent of
1114 # this component, treat this input as an intermediate
1115 if component is not None:
1116 # This needs to be in this if block, because someone might have
1117 # a composite that is a pure input from existing data
1118 if name in outputNameMapping:
1119 intermediateComponents.add(dsType)
1120 intermediateComposites.add(outputNameMapping[name])
1122 def checkConsistency(a: NamedValueSet, b: NamedValueSet):
1123 common = a.names & b.names
1124 for name in common:
1125 if a[name] != b[name]:
1126 raise ValueError(f"Conflicting definitions for dataset type: {a[name]} != {b[name]}.")
1128 checkConsistency(allInitInputs, allInitOutputs)
1129 checkConsistency(allInputs, allOutputs)
1130 checkConsistency(allInputs, intermediateComposites)
1131 checkConsistency(allOutputs, intermediateComposites)
1133 def frozen(s: NamedValueSet) -> NamedValueSet:
1134 s.freeze()
1135 return s
1137 return cls(
1138 initInputs=frozen(allInitInputs - allInitOutputs),
1139 initIntermediates=frozen(allInitInputs & allInitOutputs),
1140 initOutputs=frozen(allInitOutputs - allInitInputs),
1141 inputs=frozen(allInputs - allOutputs - intermediateComponents),
1142 intermediates=frozen(allInputs & allOutputs | intermediateComponents),
1143 outputs=frozen(allOutputs - allInputs - intermediateComposites),
1144 prerequisites=frozen(prerequisites),
1145 byTask=MappingProxyType(byTask), # MappingProxyType -> frozen view of dict for immutability
1146 )
1148 @classmethod
1149 def initOutputNames(
1150 cls,
1151 pipeline: Union[Pipeline, Iterable[TaskDef]],
1152 *,
1153 include_configs: bool = True,
1154 include_packages: bool = True,
1155 ) -> Iterator[str]:
        """Return the names of dataset types of task initOutputs, Configs,
1157 and package versions for a pipeline.
1159 Parameters
1160 ----------
1161 pipeline: `Pipeline` or `Iterable` [ `TaskDef` ]
1162 A `Pipeline` instance or collection of `TaskDef` instances.
1163 include_configs : `bool`, optional
1164 If `True` (default) include config dataset types.
1165 include_packages : `bool`, optional
1166 If `True` (default) include the dataset type for package versions.
1168 Yields
1169 ------
1170 datasetTypeName : `str`
1171 Name of the dataset type.
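
        Examples
        --------
        A sketch of collecting all the names for an existing pipeline::

            names = set(PipelineDatasetTypes.initOutputNames(pipeline))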
1172 """
1173 if include_packages:
1174 # Package versions dataset type
1175 yield cls.packagesDatasetName
1177 if isinstance(pipeline, Pipeline):
1178 pipeline = pipeline.toExpandedPipeline()
1180 for taskDef in pipeline:
1182 # all task InitOutputs
1183 for name in taskDef.connections.initOutputs:
1184 attribute = getattr(taskDef.connections, name)
1185 yield attribute.name
1187 # config dataset name
1188 if include_configs:
1189 yield taskDef.configDatasetName