Coverage for python/lsst/pipe/base/pipeline.py: 20%
# This file is part of pipe_base.
#
# Developed for the LSST Data Management System.
# This product includes software developed by the LSST Project
# (http://www.lsst.org).
# See the COPYRIGHT file at the top-level directory of this distribution
# for details of code ownership.
#
# This program is free software: you can redistribute it and/or modify
# it under the terms of the GNU General Public License as published by
# the Free Software Foundation, either version 3 of the License, or
# (at your option) any later version.
#
# This program is distributed in the hope that it will be useful,
# but WITHOUT ANY WARRANTY; without even the implied warranty of
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
# GNU General Public License for more details.
#
# You should have received a copy of the GNU General Public License
# along with this program. If not, see <http://www.gnu.org/licenses/>.
from __future__ import annotations

"""Module defining Pipeline class and related methods.
"""

__all__ = ["Pipeline", "TaskDef", "TaskDatasetTypes", "PipelineDatasetTypes", "LabelSpecifier"]

# -------------------------------
# Imports of standard modules --
# -------------------------------
from dataclasses import dataclass
import logging
from types import MappingProxyType
from typing import (ClassVar, Dict, Iterable, Iterator, Mapping, Set, Union,
                    Generator, TYPE_CHECKING, Optional, Tuple)

import copy
import re
import os
import urllib.parse
import warnings

# -----------------------------
# Imports for other modules --
from lsst.daf.butler import DatasetType, NamedValueSet, Registry, SkyPixDimension, ButlerURI
from lsst.utils import doImport
from .configOverrides import ConfigOverrides
from .connections import iterConnections
from .pipelineTask import PipelineTask

from . import pipelineIR
from . import pipeTools

if TYPE_CHECKING:  # Imports needed only for type annotations; may be circular.
    from lsst.obs.base import Instrument

# ----------------------------------
# Local non-exported definitions --
# ----------------------------------

_LOG = logging.getLogger(__name__)

# ------------------------
# Exported definitions --
# ------------------------


@dataclass
class LabelSpecifier:
    """A structure to specify a subset of labels to load

    This structure may contain a set of labels to be used in subsetting a
    pipeline, or a beginning and end point. Beginning or end may be empty,
    in which case the range will be a half open interval. Unlike python
    iteration bounds, end bounds are *INCLUDED*. Note that range based
    selection is not well defined for pipelines that are not linear in nature,
    and correct behavior is not guaranteed, or may vary from run to run.
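
    Examples
    --------
    A minimal sketch; the task labels are illustrative::

        # Select an explicit set of labels.
        spec = LabelSpecifier(labels={"isr", "calibrate"})

        # Or select an inclusive range of labels (the end label is included).
        spec = LabelSpecifier(begin="isr", end="calibrate")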
    """
    labels: Optional[Set[str]] = None
    begin: Optional[str] = None
    end: Optional[str] = None

    def __post_init__(self):
        if self.labels is not None and (self.begin or self.end):
            raise ValueError("This struct can only be initialized with a labels set or "
                             "a begin (and/or) end specifier")


class TaskDef:
    """TaskDef is a collection of information about a task needed by a Pipeline.

    The information includes the task name, a configuration object, and an
    optional task class. This class is just a collection of attributes and it
    exposes all of them so that attributes can be modified in place if needed
    (e.g. if the configuration needs extra overrides).

    Attributes
    ----------
    taskName : `str`
        `PipelineTask` class name; currently it is not specified whether this
        is a fully-qualified name or a partial name (e.g. ``module.TaskClass``).
        The framework should be prepared to handle all cases.
    config : `lsst.pex.config.Config`
        Instance of the configuration class corresponding to this task class,
        usually with all overrides applied. This config will be frozen.
    taskClass : `type` or ``None``
        `PipelineTask` class object; can be ``None``. If ``None``, the
        framework will have to locate and load the class.
    label : `str`, optional
        Task label, usually a short string unique in a pipeline.
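
    Examples
    --------
    A minimal sketch, assuming ``MyTask`` is a `PipelineTask` subclass defined
    elsewhere; the label is illustrative::

        config = MyTask.ConfigClass()
        taskDef = TaskDef("mypackage.MyTask", config=config,
                          taskClass=MyTask, label="myTask")
        taskDef.configDatasetName  # "myTask_config"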
    """
    def __init__(self, taskName, config, taskClass=None, label=""):
        self.taskName = taskName
        config.freeze()
        self.config = config
        self.taskClass = taskClass
        self.label = label
        self.connections = config.connections.ConnectionsClass(config=config)

    @property
    def configDatasetName(self) -> str:
        """Name of a dataset type for configuration of this task (`str`)
        """
        return self.label + "_config"

    @property
    def metadataDatasetName(self) -> Optional[str]:
        """Name of a dataset type for metadata of this task, `None` if
        metadata is not to be saved (`str`)
        """
        if self.config.saveMetadata:
            return self.label + "_metadata"
        else:
            return None

    @property
    def logOutputDatasetName(self) -> Optional[str]:
        """Name of a dataset type for log output from this task, `None` if
        logs are not to be saved (`str`)
        """
        if self.config.saveLogOutput:
            return self.label + "_log"
        else:
            return None

    def __str__(self):
        rep = "TaskDef(" + self.taskName
        if self.label:
            rep += ", label=" + self.label
        rep += ")"
        return rep

    def __eq__(self, other: object) -> bool:
        if not isinstance(other, TaskDef):
            return False
        # This does not consider equality of configs when determining equality
        # as config equality is a difficult thing to define. Should be updated
        # after DM-27847
        return self.taskClass == other.taskClass and self.label == other.label

    def __hash__(self):
        return hash((self.taskClass, self.label))


class Pipeline:
    """A `Pipeline` is a representation of a series of tasks to run, and the
    configuration for those tasks.

    Parameters
    ----------
    description : `str`
        A description of what this pipeline does.
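
    Examples
    --------
    A minimal sketch; the task class path, label, and config field are
    hypothetical::

        pipeline = Pipeline("A short demonstration pipeline")
        pipeline.addTask("mypackage.tasks.MyTask", label="myTask")
        pipeline.addConfigOverride("myTask", "someField", 42)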
    """
    def __init__(self, description: str):
        pipeline_dict = {"description": description, "tasks": {}}
        self._pipelineIR = pipelineIR.PipelineIR(pipeline_dict)

    @classmethod
    def fromFile(cls, filename: str) -> Pipeline:
        """Load a pipeline defined in a pipeline yaml file.

        Parameters
        ----------
        filename: `str`
            A path that points to a pipeline defined in yaml format. This
            filename may also supply additional labels to be used in
            subsetting the loaded Pipeline. These labels are separated from
            the path by a \\#, and may be specified as a comma separated
            list, or a range denoted as beginning..end. Beginning or end may
            be empty, in which case the range will be a half open interval.
            Unlike python iteration bounds, end bounds are *INCLUDED*. Note
            that range based selection is not well defined for pipelines that
            are not linear in nature, and correct behavior is not guaranteed,
            or may vary from run to run.

        Returns
        -------
        pipeline: `Pipeline`
            The pipeline loaded from specified location with appropriate (if
            any) subsetting

        Notes
        -----
        This method attempts to prune any contracts that contain labels which
        are not in the declared subset of labels. This pruning is done using a
        string based matching due to the nature of contracts and may prune more
        than it should.
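
        Examples
        --------
        A minimal sketch; the file name and task labels are illustrative::

            # Load the full pipeline.
            pipeline = Pipeline.fromFile("pipeline.yaml")

            # Load only two labeled tasks.
            pipeline = Pipeline.fromFile("pipeline.yaml#isr,calibrate")

            # Load an inclusive range of tasks.
            pipeline = Pipeline.fromFile("pipeline.yaml#isr..calibrate")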
        """
        return cls.from_uri(filename)

    @classmethod
    def from_uri(cls, uri: Union[str, ButlerURI]) -> Pipeline:
        """Load a pipeline defined in a pipeline yaml file at a location
        specified by a URI.

        Parameters
        ----------
        uri: `str` or `ButlerURI`
            If a string is supplied this should be a URI path that points to a
            pipeline defined in yaml format. This uri may also supply
            additional labels to be used in subsetting the loaded Pipeline.
            These labels are separated from the path by a \\#, and may be
            specified as a comma separated list, or a range denoted as
            beginning..end. Beginning or end may be empty, in which case the
            range will be a half open interval. Unlike python iteration
            bounds, end bounds are *INCLUDED*. Note that range based selection
            is not well defined for pipelines that are not linear in nature,
            and correct behavior is not guaranteed, or may vary from run to
            run. The same specifiers can be used with a ButlerURI object, by
            being the sole contents in the fragment attribute.

        Returns
        -------
        pipeline: `Pipeline`
            The pipeline loaded from specified location with appropriate (if
            any) subsetting

        Notes
        -----
        This method attempts to prune any contracts that contain labels which
        are not in the declared subset of labels. This pruning is done using a
        string based matching due to the nature of contracts and may prune more
        than it should.
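
        Examples
        --------
        A minimal sketch; the URIs and task labels are illustrative::

            pipeline = Pipeline.from_uri("/path/to/pipeline.yaml#isr,calibrate")
            pipeline = Pipeline.from_uri(
                ButlerURI("s3://bucket/pipeline.yaml#isr..calibrate")
            )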
        """
        # Split up the uri and any labels that were supplied
        uri, label_specifier = cls._parse_file_specifier(uri)
        pipeline: Pipeline = cls.fromIR(pipelineIR.PipelineIR.from_uri(uri))

        # If there are labels supplied, only keep those
        if label_specifier is not None:
            pipeline = pipeline.subsetFromLabels(label_specifier)
        return pipeline

    def subsetFromLabels(self, labelSpecifier: LabelSpecifier) -> Pipeline:
        """Subset a pipeline to contain only labels specified in labelSpecifier

        Parameters
        ----------
        labelSpecifier : `LabelSpecifier`
            Object containing labels that describes how to subset a pipeline.

        Returns
        -------
        pipeline : `Pipeline`
            A new pipeline object that is a subset of the old pipeline

        Raises
        ------
        ValueError
            Raised if there is an issue with specified labels

        Notes
        -----
        This method attempts to prune any contracts that contain labels which
        are not in the declared subset of labels. This pruning is done using a
        string based matching due to the nature of contracts and may prune more
        than it should.
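
        Examples
        --------
        A minimal sketch; the task labels are illustrative::

            subset = pipeline.subsetFromLabels(
                LabelSpecifier(labels={"isr", "calibrate"})
            )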
        """
        # Labels supplied as a set
        if labelSpecifier.labels:
            labelSet = labelSpecifier.labels
        # Labels supplied as a range, first create a list of all the labels
        # in the pipeline sorted according to task dependency. Then only
        # keep labels that lie between the supplied bounds
        else:
            # Create a copy of the pipeline to use when assessing the label
            # ordering. Use a dict for fast searching while preserving order.
            # Remove contracts so they do not fail in the expansion step. This
            # is needed because a user may only configure the tasks they intend
            # to run, which may cause some contracts to fail if they will later
            # be dropped
            pipeline = copy.deepcopy(self)
            pipeline._pipelineIR.contracts = []
            labels = {taskdef.label: True for taskdef in pipeline.toExpandedPipeline()}

            # Verify the bounds are in the labels
            if labelSpecifier.begin is not None:
                if labelSpecifier.begin not in labels:
                    raise ValueError(f"Beginning of range subset, {labelSpecifier.begin}, not found in "
                                     "pipeline definition")
            if labelSpecifier.end is not None:
                if labelSpecifier.end not in labels:
                    raise ValueError(f"End of range subset, {labelSpecifier.end}, not found in pipeline "
                                     "definition")

            labelSet = set()
            for label in labels:
                if labelSpecifier.begin is not None:
                    if label != labelSpecifier.begin:
                        continue
                    else:
                        labelSpecifier.begin = None
                labelSet.add(label)
                if labelSpecifier.end is not None and label == labelSpecifier.end:
                    break
        return Pipeline.fromIR(self._pipelineIR.subset_from_labels(labelSet))

    @staticmethod
    def _parse_file_specifier(uri: Union[str, ButlerURI]
                              ) -> Tuple[ButlerURI, Optional[LabelSpecifier]]:
        """Split apart a URI and any possible label subsets
        """
        if isinstance(uri, str):
            # This is to support legacy pipelines during transition
            uri, num_replace = re.subn("[:](?!\\/\\/)", "#", uri)
            if num_replace:
                warnings.warn(f"The pipeline file {uri} seems to use the legacy : to separate "
                              "labels, this is deprecated and will be removed after June 2021, please use "
                              "# instead.",
                              category=FutureWarning)
            if uri.count("#") > 1:
                raise ValueError("Only one set of labels is allowed when specifying a pipeline to load")
            uri = ButlerURI(uri)
        label_subset = uri.fragment or None

        specifier: Optional[LabelSpecifier]
        if label_subset is not None:
            label_subset = urllib.parse.unquote(label_subset)
            args: Dict[str, Union[Set[str], str, None]]
            # labels supplied as a list
            if ',' in label_subset:
                if '..' in label_subset:
                    raise ValueError("Can only specify a list of labels or a range "
                                     "when loading a Pipeline, not both")
                args = {"labels": set(label_subset.split(","))}
            # labels supplied as a range
            elif '..' in label_subset:
                # Try to de-structure the labelSubset, this will fail if more
                # than one range is specified
                begin, end, *rest = label_subset.split("..")
                if rest:
                    raise ValueError("Only one range can be specified when loading a pipeline")
                args = {"begin": begin if begin else None, "end": end if end else None}
            # Assume anything else is a single label
            else:
                args = {"labels": {label_subset}}

            specifier = LabelSpecifier(**args)
        else:
            specifier = None

        return uri, specifier

    @classmethod
    def fromString(cls, pipeline_string: str) -> Pipeline:
        """Create a pipeline from a string formatted as a pipeline document.

        Parameters
        ----------
        pipeline_string : `str`
            A string that is formatted like a pipeline document

        Returns
        -------
        pipeline: `Pipeline`
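
        Examples
        --------
        A minimal sketch of a pipeline document; the task class path is
        hypothetical::

            import textwrap

            pipeline_string = textwrap.dedent('''
                description: A tiny example pipeline
                tasks:
                  myTask:
                    class: mypackage.tasks.MyTask
            ''')
            pipeline = Pipeline.fromString(pipeline_string)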
        """
        pipeline = cls.fromIR(pipelineIR.PipelineIR.from_string(pipeline_string))
        return pipeline

    @classmethod
    def fromIR(cls, deserialized_pipeline: pipelineIR.PipelineIR) -> Pipeline:
        """Create a pipeline from an already created `PipelineIR` object.

        Parameters
        ----------
        deserialized_pipeline: `PipelineIR`
            An already created pipeline intermediate representation object

        Returns
        -------
        pipeline: `Pipeline`
        """
        pipeline = cls.__new__(cls)
        pipeline._pipelineIR = deserialized_pipeline
        return pipeline

    @classmethod
    def fromPipeline(cls, pipeline: Pipeline) -> Pipeline:
        """Create a new pipeline by copying an already existing `Pipeline`.

        Parameters
        ----------
        pipeline: `Pipeline`
            An already created `Pipeline` object to copy

        Returns
        -------
        pipeline: `Pipeline`
        """
        return cls.fromIR(copy.deepcopy(pipeline._pipelineIR))

    def __str__(self) -> str:
        return str(self._pipelineIR)

    def addInstrument(self, instrument: Union[Instrument, str]) -> None:
        """Add an instrument to the pipeline, or replace an instrument that is
        already defined.

        Parameters
        ----------
        instrument : `~lsst.obs.base.Instrument` or `str`
            Either a derived class object of a `lsst.obs.base.Instrument` or
            a string corresponding to a fully qualified
            `lsst.obs.base.Instrument` name.
        """
        if isinstance(instrument, str):
            pass
        else:
            # TODO: assume that this is a subclass of Instrument, no type
            # checking
            instrument = f"{instrument.__module__}.{instrument.__qualname__}"
        self._pipelineIR.instrument = instrument

    def getInstrument(self) -> Instrument:
        """Get the instrument from the pipeline.

        Returns
        -------
        instrument : `~lsst.obs.base.Instrument`, `str`, or None
            A derived class object of a `lsst.obs.base.Instrument`, a string
            corresponding to a fully qualified `lsst.obs.base.Instrument`
            name, or None if the pipeline does not have an instrument.
        """
        return self._pipelineIR.instrument

    def addTask(self, task: Union[PipelineTask, str], label: str) -> None:
        """Add a new task to the pipeline, or replace a task that is already
        associated with the supplied label.

        Parameters
        ----------
        task: `PipelineTask` or `str`
            Either a derived class object of a `PipelineTask` or a string
            corresponding to a fully qualified `PipelineTask` name.
        label: `str`
            A label that is used to identify the `PipelineTask` being added
        """
        if isinstance(task, str):
            taskName = task
        elif issubclass(task, PipelineTask):
            taskName = f"{task.__module__}.{task.__qualname__}"
        else:
            raise ValueError("task must be either a child class of PipelineTask or a string containing"
                             " a fully qualified name to one")
        if not label:
            # in some cases (with command line-generated pipeline) tasks can
            # be defined without label which is not acceptable, use task
            # _DefaultName in that case
            if isinstance(task, str):
                task = doImport(task)
            label = task._DefaultName
        self._pipelineIR.tasks[label] = pipelineIR.TaskIR(label, taskName)

    def removeTask(self, label: str) -> None:
        """Remove a task from the pipeline.

        Parameters
        ----------
        label : `str`
            The label used to identify the task that is to be removed

        Raises
        ------
        KeyError
            If no task with that label exists in the pipeline

        """
        self._pipelineIR.tasks.pop(label)

    def addConfigOverride(self, label: str, key: str, value: object) -> None:
        """Apply single config override.

        Parameters
        ----------
        label : `str`
            Label of the task.
        key: `str`
            Fully-qualified field name.
        value : object
            Value to be given to a field.
        """
        self._addConfigImpl(label, pipelineIR.ConfigIR(rest={key: value}))

    def addConfigFile(self, label: str, filename: str) -> None:
        """Add overrides from a specified file.

        Parameters
        ----------
        label : `str`
            The label used to identify the task associated with config to
            modify
        filename : `str`
            Path to the override file.
        """
        self._addConfigImpl(label, pipelineIR.ConfigIR(file=[filename]))

    def addConfigPython(self, label: str, pythonString: str) -> None:
        """Add overrides by running a snippet of python code against a config.

        Parameters
        ----------
        label : `str`
            The label used to identify the task associated with config to
            modify.
        pythonString: `str`
            A string which is valid python code to be executed. This is done
            with config as the only local accessible value.
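
        Examples
        --------
        A minimal sketch; the label and config field are hypothetical::

            pipeline.addConfigPython("myTask", "config.someField = 2 * 21")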
        """
        self._addConfigImpl(label, pipelineIR.ConfigIR(python=pythonString))

    def _addConfigImpl(self, label: str, newConfig: pipelineIR.ConfigIR) -> None:
        if label == "parameters":
            if newConfig.rest.keys() - self._pipelineIR.parameters.mapping.keys():
                raise ValueError("Cannot override parameters that are not defined in pipeline")
            self._pipelineIR.parameters.mapping.update(newConfig.rest)
            if newConfig.file:
                raise ValueError("Setting parameters section with config file is not supported")
            if newConfig.python:
                raise ValueError("Setting parameters section using python block is unsupported")
            return
        if label not in self._pipelineIR.tasks:
            raise LookupError(f"There are no tasks labeled '{label}' in the pipeline")
        self._pipelineIR.tasks[label].add_or_update_config(newConfig)

    def toFile(self, filename: str) -> None:
        self._pipelineIR.to_file(filename)

    def write_to_uri(self, uri: Union[str, ButlerURI]) -> None:
        self._pipelineIR.write_to_uri(uri)

    def toExpandedPipeline(self) -> Generator[TaskDef, None, None]:
        """Returns a generator of TaskDefs which can be used to create quantum
        graphs.

        Returns
        -------
        generator : generator of `TaskDef`
            The generator returned will be the sorted iterator of tasks which
            are to be used in constructing a quantum graph.

        Raises
        ------
        NotImplementedError
            If a dataId is supplied in a config block. This is in place for
            future use
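
        Examples
        --------
        A minimal sketch, assuming ``pipeline`` is an existing `Pipeline`; the
        attributes printed here are defined on `TaskDef`::

            for taskDef in pipeline.toExpandedPipeline():
                print(taskDef.label, taskDef.taskName)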
        """
        taskDefs = []
        for label, taskIR in self._pipelineIR.tasks.items():
            taskClass = doImport(taskIR.klass)
            taskName = taskClass.__qualname__
            config = taskClass.ConfigClass()
            overrides = ConfigOverrides()
            if self._pipelineIR.instrument is not None:
                overrides.addInstrumentOverride(self._pipelineIR.instrument, taskClass._DefaultName)
            if taskIR.config is not None:
                for configIR in (configIr.formatted(self._pipelineIR.parameters)
                                 for configIr in taskIR.config):
                    if configIR.dataId is not None:
                        raise NotImplementedError("Specializing a config on a partial data id is not yet "
                                                  "supported in Pipeline definition")
                    # only apply override if it applies to everything
                    if configIR.dataId is None:
                        if configIR.file:
                            for configFile in configIR.file:
                                overrides.addFileOverride(os.path.expandvars(configFile))
                        if configIR.python is not None:
                            overrides.addPythonOverride(configIR.python)
                        for key, value in configIR.rest.items():
                            overrides.addValueOverride(key, value)
            overrides.applyTo(config)
            # This may need to be revisited
            try:
                config.validate()
            except Exception:
                _LOG.error("Configuration validation failed for task %s (%s)", label, taskName)
                raise
            taskDefs.append(TaskDef(taskName=taskName, config=config, taskClass=taskClass, label=label))

        # Let's evaluate the contracts
        if self._pipelineIR.contracts is not None:
            label_to_config = {x.label: x.config for x in taskDefs}
            for contract in self._pipelineIR.contracts:
                # execute this in its own line so it can raise a good error
                # message if there were problems with the eval
                success = eval(contract.contract, None, label_to_config)
                if not success:
                    extra_info = f": {contract.msg}" if contract.msg is not None else ""
                    raise pipelineIR.ContractError(f"Contract(s) '{contract.contract}' were not "
                                                   f"satisfied{extra_info}")

        yield from pipeTools.orderPipeline(taskDefs)

    def __len__(self):
        return len(self._pipelineIR.tasks)

    def __eq__(self, other: object):
        if not isinstance(other, Pipeline):
            return False
        return self._pipelineIR == other._pipelineIR


@dataclass(frozen=True)
class TaskDatasetTypes:
    """An immutable struct that extracts and classifies the dataset types used
    by a `PipelineTask`.
    """

    initInputs: NamedValueSet[DatasetType]
    """Dataset types that are needed as inputs in order to construct this Task.

    Task-level `initInputs` may be classified as either
    `~PipelineDatasetTypes.initInputs` or
    `~PipelineDatasetTypes.initIntermediates` at the Pipeline level.
    """

    initOutputs: NamedValueSet[DatasetType]
    """Dataset types that may be written after constructing this Task.

    Task-level `initOutputs` may be classified as either
    `~PipelineDatasetTypes.initOutputs` or
    `~PipelineDatasetTypes.initIntermediates` at the Pipeline level.
    """

    inputs: NamedValueSet[DatasetType]
    """Dataset types that are regular inputs to this Task.

    If an input dataset needed for a Quantum cannot be found in the input
    collection(s) or produced by another Task in the Pipeline, that Quantum
    (and all dependent Quanta) will not be produced.

    Task-level `inputs` may be classified as either
    `~PipelineDatasetTypes.inputs` or `~PipelineDatasetTypes.intermediates`
    at the Pipeline level.
    """

    prerequisites: NamedValueSet[DatasetType]
    """Dataset types that are prerequisite inputs to this Task.

    Prerequisite inputs must exist in the input collection(s) before the
    pipeline is run, but do not constrain the graph - if a prerequisite is
    missing for a Quantum, `PrerequisiteMissingError` is raised.

    Prerequisite inputs are not resolved until the second stage of
    QuantumGraph generation.
    """

    outputs: NamedValueSet[DatasetType]
    """Dataset types that are produced by this Task.

    Task-level `outputs` may be classified as either
    `~PipelineDatasetTypes.outputs` or `~PipelineDatasetTypes.intermediates`
    at the Pipeline level.
    """

    @classmethod
    def fromTaskDef(
        cls,
        taskDef: TaskDef,
        *,
        registry: Registry,
        include_configs: bool = True,
    ) -> TaskDatasetTypes:
        """Extract and classify the dataset types from a single `PipelineTask`.

        Parameters
        ----------
        taskDef: `TaskDef`
            An instance of a `TaskDef` class for a particular `PipelineTask`.
        registry: `Registry`
            Registry used to construct normalized `DatasetType` objects and
            retrieve those that are incomplete.
        include_configs : `bool`, optional
            If `True` (default) include config dataset types as
            ``initOutputs``.

        Returns
        -------
        types: `TaskDatasetTypes`
            The dataset types used by this task.
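
        Examples
        --------
        A minimal sketch, assuming ``butler`` is an existing
        `~lsst.daf.butler.Butler` and ``pipeline`` an existing `Pipeline`::

            taskDef = next(iter(pipeline.toExpandedPipeline()))
            types = TaskDatasetTypes.fromTaskDef(taskDef, registry=butler.registry)
            print(types.inputs.names, types.outputs.names)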
        """
        def makeDatasetTypesSet(connectionType: str, freeze: bool = True) -> NamedValueSet[DatasetType]:
            """Constructs a set of true `DatasetType` objects

            Parameters
            ----------
            connectionType : `str`
                Name of the connection type to produce a set for, corresponds
                to an attribute of type `list` on the connection class instance
            freeze : `bool`, optional
                If `True`, call `NamedValueSet.freeze` on the object returned.

            Returns
            -------
            datasetTypes : `NamedValueSet`
                A set of all datasetTypes which correspond to the input
                connection type specified in the connection class of this
                `PipelineTask`

            Notes
            -----
            This function is a closure over the variables ``registry`` and
            ``taskDef``.
            """
            datasetTypes = NamedValueSet()
            for c in iterConnections(taskDef.connections, connectionType):
                dimensions = set(getattr(c, 'dimensions', set()))
                if "skypix" in dimensions:
                    try:
                        datasetType = registry.getDatasetType(c.name)
                    except LookupError as err:
                        raise LookupError(
                            f"DatasetType '{c.name}' referenced by "
                            f"{type(taskDef.connections).__name__} uses 'skypix' as a dimension "
                            f"placeholder, but does not already exist in the registry. "
                            f"Note that reference catalog names are now used as the dataset "
                            f"type name instead of 'ref_cat'."
                        ) from err
                    rest1 = set(registry.dimensions.extract(dimensions - set(["skypix"])).names)
                    rest2 = set(dim.name for dim in datasetType.dimensions
                                if not isinstance(dim, SkyPixDimension))
                    if rest1 != rest2:
                        raise ValueError(f"Non-skypix dimensions for dataset type {c.name} declared in "
                                         f"connections ({rest1}) are inconsistent with those in "
                                         f"registry's version of this dataset ({rest2}).")
                else:
                    # Component dataset types are not explicitly in the
                    # registry. This complicates consistency checks with
                    # registry and requires we work out the composite storage
                    # class.
                    registryDatasetType = None
                    try:
                        registryDatasetType = registry.getDatasetType(c.name)
                    except KeyError:
                        compositeName, componentName = DatasetType.splitDatasetTypeName(c.name)
                        parentStorageClass = DatasetType.PlaceholderParentStorageClass \
                            if componentName else None
                        datasetType = c.makeDatasetType(
                            registry.dimensions,
                            parentStorageClass=parentStorageClass
                        )
                        registryDatasetType = datasetType
                    else:
                        datasetType = c.makeDatasetType(
                            registry.dimensions,
                            parentStorageClass=registryDatasetType.parentStorageClass
                        )

                if registryDatasetType and datasetType != registryDatasetType:
                    try:
                        # Explicitly check for storage class just to make
                        # more specific message.
                        _ = datasetType.storageClass
                    except KeyError:
                        raise ValueError("Storage class does not exist for supplied dataset type "
                                         f"{datasetType} for {taskDef.label}.") from None
                    raise ValueError(f"Supplied dataset type ({datasetType}) inconsistent with "
                                     f"registry definition ({registryDatasetType}) "
                                     f"for {taskDef.label}.")
                datasetTypes.add(datasetType)
            if freeze:
                datasetTypes.freeze()
            return datasetTypes

        # optionally add initOutput dataset for config
        initOutputs = makeDatasetTypesSet("initOutputs", freeze=False)
        if include_configs:
            initOutputs.add(
                DatasetType(
                    taskDef.configDatasetName,
                    registry.dimensions.empty,
                    storageClass="Config",
                )
            )
        initOutputs.freeze()

        # optionally add output dataset for metadata
        outputs = makeDatasetTypesSet("outputs", freeze=False)
        if taskDef.metadataDatasetName is not None:
            # Metadata is supposed to be of the PropertySet type, its
            # dimensions correspond to a task quantum
            dimensions = registry.dimensions.extract(taskDef.connections.dimensions)
            outputs |= {DatasetType(taskDef.metadataDatasetName, dimensions, "PropertySet")}
        if taskDef.logOutputDatasetName is not None:
            # Log output dimensions correspond to a task quantum.
            dimensions = registry.dimensions.extract(taskDef.connections.dimensions)
            outputs |= {DatasetType(taskDef.logOutputDatasetName, dimensions, "ButlerLogRecords")}

        outputs.freeze()

        return cls(
            initInputs=makeDatasetTypesSet("initInputs"),
            initOutputs=initOutputs,
            inputs=makeDatasetTypesSet("inputs"),
            prerequisites=makeDatasetTypesSet("prerequisiteInputs"),
            outputs=outputs,
        )


@dataclass(frozen=True)
class PipelineDatasetTypes:
    """An immutable struct that classifies the dataset types used in a
    `Pipeline`.
    """

    packagesDatasetName: ClassVar[str] = "packages"
    """Name of a dataset type used to save package versions.
    """

    initInputs: NamedValueSet[DatasetType]
    """Dataset types that are needed as inputs in order to construct the Tasks
    in this Pipeline.

    This does not include dataset types that are produced when constructing
    other Tasks in the Pipeline (these are classified as `initIntermediates`).
    """

    initOutputs: NamedValueSet[DatasetType]
    """Dataset types that may be written after constructing the Tasks in this
    Pipeline.

    This does not include dataset types that are also used as inputs when
    constructing other Tasks in the Pipeline (these are classified as
    `initIntermediates`).
    """

    initIntermediates: NamedValueSet[DatasetType]
    """Dataset types that are both used when constructing one or more Tasks
    in the Pipeline and produced as a side-effect of constructing another
    Task in the Pipeline.
    """

    inputs: NamedValueSet[DatasetType]
    """Dataset types that are regular inputs for the full pipeline.

    If an input dataset needed for a Quantum cannot be found in the input
    collection(s), that Quantum (and all dependent Quanta) will not be
    produced.
    """

    prerequisites: NamedValueSet[DatasetType]
    """Dataset types that are prerequisite inputs for the full Pipeline.

    Prerequisite inputs must exist in the input collection(s) before the
    pipeline is run, but do not constrain the graph - if a prerequisite is
    missing for a Quantum, `PrerequisiteMissingError` is raised.

    Prerequisite inputs are not resolved until the second stage of
    QuantumGraph generation.
    """

    intermediates: NamedValueSet[DatasetType]
    """Dataset types that are output by one Task in the Pipeline and consumed
    as inputs by one or more other Tasks in the Pipeline.
    """

    outputs: NamedValueSet[DatasetType]
    """Dataset types that are output by a Task in the Pipeline and not consumed
    by any other Task in the Pipeline.
    """

    byTask: Mapping[str, TaskDatasetTypes]
    """Per-Task dataset types, keyed by label in the `Pipeline`.

    This is guaranteed to be zip-iterable with the `Pipeline` itself (assuming
    neither has been modified since the dataset types were extracted, of
    course).
    """

    @classmethod
    def fromPipeline(
        cls,
        pipeline: Union[Pipeline, Iterable[TaskDef]],
        *,
        registry: Registry,
        include_configs: bool = True,
        include_packages: bool = True,
    ) -> PipelineDatasetTypes:
        """Extract and classify the dataset types from all tasks in a
        `Pipeline`.

        Parameters
        ----------
        pipeline: `Pipeline` or `Iterable` [ `TaskDef` ]
            A dependency-ordered collection of tasks that can be run
            together.
        registry: `Registry`
            Registry used to construct normalized `DatasetType` objects and
            retrieve those that are incomplete.
        include_configs : `bool`, optional
            If `True` (default) include config dataset types as
            ``initOutputs``.
        include_packages : `bool`, optional
            If `True` (default) include the dataset type for software package
            versions in ``initOutputs``.

        Returns
        -------
        types: `PipelineDatasetTypes`
            The dataset types used by this `Pipeline`.

        Raises
        ------
        ValueError
            Raised if Tasks are inconsistent about which datasets are marked
            prerequisite. This indicates that the Tasks cannot be run as part
            of the same `Pipeline`.
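
        Examples
        --------
        A minimal sketch, assuming ``pipeline`` is an existing `Pipeline`,
        ``butler`` an existing `~lsst.daf.butler.Butler`, and ``"myTask"`` an
        illustrative task label::

            dataset_types = PipelineDatasetTypes.fromPipeline(
                pipeline, registry=butler.registry
            )
            print(dataset_types.inputs.names)
            print(dataset_types.byTask["myTask"].outputs.names)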
        """
        allInputs = NamedValueSet()
        allOutputs = NamedValueSet()
        allInitInputs = NamedValueSet()
        allInitOutputs = NamedValueSet()
        prerequisites = NamedValueSet()
        byTask = dict()
        if include_packages:
            allInitOutputs.add(
                DatasetType(
                    cls.packagesDatasetName,
                    registry.dimensions.empty,
                    storageClass="Packages",
                )
            )
        if isinstance(pipeline, Pipeline):
            pipeline = pipeline.toExpandedPipeline()
        for taskDef in pipeline:
            thisTask = TaskDatasetTypes.fromTaskDef(
                taskDef,
                registry=registry,
                include_configs=include_configs,
            )
            allInitInputs |= thisTask.initInputs
            allInitOutputs |= thisTask.initOutputs
            allInputs |= thisTask.inputs
            prerequisites |= thisTask.prerequisites
            allOutputs |= thisTask.outputs
            byTask[taskDef.label] = thisTask
        if not prerequisites.isdisjoint(allInputs):
            raise ValueError("{} marked as both prerequisites and regular inputs".format(
                {dt.name for dt in allInputs & prerequisites}
            ))
        if not prerequisites.isdisjoint(allOutputs):
            raise ValueError("{} marked as both prerequisites and outputs".format(
                {dt.name for dt in allOutputs & prerequisites}
            ))
        # Make sure that components which are marked as inputs get treated as
        # intermediates if there is an output which produces the composite
        # containing the component
        intermediateComponents = NamedValueSet()
        intermediateComposites = NamedValueSet()
        outputNameMapping = {dsType.name: dsType for dsType in allOutputs}
        for dsType in allInputs:
            # get the name of a possible component
            name, component = dsType.nameAndComponent()
            # if there is a component name, that means this is a component
            # DatasetType, if there is an output which produces the parent of
            # this component, treat this input as an intermediate
            if component is not None:
                if name in outputNameMapping:
                    if outputNameMapping[name].dimensions != dsType.dimensions:
                        raise ValueError(f"Component dataset type {dsType.name} has different "
                                         f"dimensions ({dsType.dimensions}) than its parent "
                                         f"({outputNameMapping[name].dimensions}).")
                    composite = DatasetType(name, dsType.dimensions, outputNameMapping[name].storageClass,
                                            universe=registry.dimensions)
                    intermediateComponents.add(dsType)
                    intermediateComposites.add(composite)

        def checkConsistency(a: NamedValueSet, b: NamedValueSet):
            common = a.names & b.names
            for name in common:
                if a[name] != b[name]:
                    raise ValueError(f"Conflicting definitions for dataset type: {a[name]} != {b[name]}.")

        checkConsistency(allInitInputs, allInitOutputs)
        checkConsistency(allInputs, allOutputs)
        checkConsistency(allInputs, intermediateComposites)
        checkConsistency(allOutputs, intermediateComposites)

        def frozen(s: NamedValueSet) -> NamedValueSet:
            s.freeze()
            return s

        return cls(
            initInputs=frozen(allInitInputs - allInitOutputs),
            initIntermediates=frozen(allInitInputs & allInitOutputs),
            initOutputs=frozen(allInitOutputs - allInitInputs),
            inputs=frozen(allInputs - allOutputs - intermediateComponents),
            intermediates=frozen(allInputs & allOutputs | intermediateComponents),
            outputs=frozen(allOutputs - allInputs - intermediateComposites),
            prerequisites=frozen(prerequisites),
            byTask=MappingProxyType(byTask),  # MappingProxyType -> frozen view of dict for immutability
        )

    @classmethod
    def initOutputNames(cls, pipeline: Union[Pipeline, Iterable[TaskDef]], *,
                        include_configs: bool = True, include_packages: bool = True) -> Iterator[str]:
        """Return the names of dataset types of task initOutputs, Configs,
        and package versions for a pipeline.

        Parameters
        ----------
        pipeline: `Pipeline` or `Iterable` [ `TaskDef` ]
            A `Pipeline` instance or collection of `TaskDef` instances.
        include_configs : `bool`, optional
            If `True` (default) include config dataset types.
        include_packages : `bool`, optional
            If `True` (default) include the dataset type for package versions.

        Yields
        ------
        datasetTypeName : `str`
            Name of the dataset type.
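
        Examples
        --------
        A minimal sketch, assuming ``pipeline`` is an existing `Pipeline`::

            names = list(PipelineDatasetTypes.initOutputNames(pipeline))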
        """
        if include_packages:
            # Package versions dataset type
            yield cls.packagesDatasetName

        if isinstance(pipeline, Pipeline):
            pipeline = pipeline.toExpandedPipeline()

        for taskDef in pipeline:

            # all task InitOutputs
            for name in taskDef.connections.initOutputs:
                attribute = getattr(taskDef.connections, name)
                yield attribute.name

            # config dataset name
            if include_configs:
                yield taskDef.configDatasetName
1049 yield taskDef.configDatasetName