Coverage for python/lsst/pipe/base/pipeline.py: 21%
439 statements
coverage.py v7.2.7, created at 2023-07-12 11:14 -0700
1# This file is part of pipe_base.
2#
3# Developed for the LSST Data Management System.
4# This product includes software developed by the LSST Project
5# (http://www.lsst.org).
6# See the COPYRIGHT file at the top-level directory of this distribution
7# for details of code ownership.
8#
9# This program is free software: you can redistribute it and/or modify
10# it under the terms of the GNU General Public License as published by
11# the Free Software Foundation, either version 3 of the License, or
12# (at your option) any later version.
13#
14# This program is distributed in the hope that it will be useful,
15# but WITHOUT ANY WARRANTY; without even the implied warranty of
16# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
17# GNU General Public License for more details.
18#
19# You should have received a copy of the GNU General Public License
20# along with this program. If not, see <http://www.gnu.org/licenses/>.
22"""Module defining Pipeline class and related methods.
23"""
25from __future__ import annotations
27__all__ = ["Pipeline", "TaskDef", "TaskDatasetTypes", "PipelineDatasetTypes", "LabelSpecifier"]
29import copy
30import logging
31import re
32import urllib.parse
34# -------------------------------
35# Imports of standard modules --
36# -------------------------------
37from collections.abc import Callable, Generator, Iterable, Iterator, Mapping, Set
38from dataclasses import dataclass
39from types import MappingProxyType
40from typing import TYPE_CHECKING, ClassVar, cast
42# -----------------------------
43# Imports for other modules --
44from lsst.daf.butler import (
45 DataCoordinate,
46 DatasetType,
47 DimensionUniverse,
48 NamedValueSet,
49 Registry,
50 SkyPixDimension,
51)
52from lsst.resources import ResourcePath, ResourcePathExpression
53from lsst.utils import doImportType
54from lsst.utils.introspection import get_full_type_name
56from . import automatic_connection_constants as acc
57from . import pipelineIR, pipeTools
58from ._instrument import Instrument as PipeBaseInstrument
59from ._task_metadata import TaskMetadata
60from .config import PipelineTaskConfig
61from .connections import iterConnections
62from .connectionTypes import Input
63from .pipelineTask import PipelineTask
64from .task import _TASK_METADATA_TYPE
66if TYPE_CHECKING: # Imports needed only for type annotations; may be circular.
67 from lsst.obs.base import Instrument
68 from lsst.pex.config import Config
70# ----------------------------------
71# Local non-exported definitions --
72# ----------------------------------
74_LOG = logging.getLogger(__name__)
76# ------------------------
77# Exported definitions --
78# ------------------------
81@dataclass
82class LabelSpecifier:
83 """A structure to specify a subset of labels to load
85 This structure may contain a set of labels to be used in subsetting a
86 pipeline, or a beginning and end point. Beginning or end may be empty,
87 in which case the range will be a half open interval. Unlike python
88 iteration bounds, end bounds are *INCLUDED*. Note that range based
89 selection is not well defined for pipelines that are not linear in nature,
90 and correct behavior is not guaranteed, or may vary from run to run.
91 """
93 labels: set[str] | None = None
94 begin: str | None = None
95 end: str | None = None
97 def __post_init__(self) -> None:
98 if self.labels is not None and (self.begin or self.end):
99 raise ValueError(
100 "This struct can only be initialized with a labels set or a begin (and/or) end specifier"
101 )
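
# A minimal sketch of constructing LabelSpecifier; the task labels used here
# ("isr", "calibrate") are hypothetical.

from lsst.pipe.base.pipeline import LabelSpecifier

# Subset by an explicit set of labels.
by_labels = LabelSpecifier(labels={"isr", "calibrate"})

# Subset by an inclusive range of labels; either bound may be omitted.
by_range = LabelSpecifier(begin="isr", end="calibrate")

# Mixing the two forms is rejected in __post_init__.
try:
    LabelSpecifier(labels={"isr"}, end="calibrate")
except ValueError:
    pass
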
104class TaskDef:
105 """TaskDef is a collection of information about a task needed by a Pipeline.
107 The information includes task name, configuration object and optional
108 task class. This class is just a collection of attributes and it exposes
109 all of them so that attributes could potentially be modified in place
110 (e.g. if configuration needs extra overrides).
112 Attributes
113 ----------
114 taskName : `str`, optional
115 The fully-qualified `PipelineTask` class name. If not provided,
116 ``taskClass`` must be.
117 config : `lsst.pipe.base.config.PipelineTaskConfig`, optional
118 Instance of the configuration class corresponding to this task class,
119 usually with all overrides applied. This config will be frozen. If
120 not provided, ``taskClass`` must be provided and
121 ``taskClass.ConfigClass()`` will be used.
122 taskClass : `type`, optional
123 `PipelineTask` class object; if provided and ``taskName`` is as well,
124 the caller guarantees that they are consistent. If not provided,
125 ``taskName`` is used to import the type.
126 label : `str`, optional
127 Task label, usually a short string unique in a pipeline. If not
128 provided, ``taskClass`` must be, and ``taskClass._DefaultName`` will
129 be used.
130 """
132 def __init__(
133 self,
134 taskName: str | None = None,
135 config: PipelineTaskConfig | None = None,
136 taskClass: type[PipelineTask] | None = None,
137 label: str | None = None,
138 ):
139 if taskName is None:
140 if taskClass is None:
141 raise ValueError("At least one of `taskName` and `taskClass` must be provided.")
142 taskName = get_full_type_name(taskClass)
143 elif taskClass is None:
144 taskClass = doImportType(taskName)
145 if config is None:
146 if taskClass is None:
147 raise ValueError("`taskClass` must be provided if `config` is not.")
148 config = taskClass.ConfigClass()
149 if label is None:
150 if taskClass is None:
151 raise ValueError("`taskClass` must be provided if `label` is not.")
152 label = taskClass._DefaultName
153 self.taskName = taskName
154 try:
155 config.validate()
156 except Exception:
157 _LOG.error("Configuration validation failed for task %s (%s)", label, taskName)
158 raise
159 config.freeze()
160 self.config = config
161 self.taskClass = taskClass
162 self.label = label
163 self.connections = config.connections.ConnectionsClass(config=config)
165 @property
166 def configDatasetName(self) -> str:
167 """Name of a dataset type for configuration of this task (`str`)"""
168 return acc.CONFIG_INIT_OUTPUT_TEMPLATE.format(label=self.label)
170 @property
171 def metadataDatasetName(self) -> str:
172 """Name of a dataset type for metadata of this task (`str`)"""
173 return self.makeMetadataDatasetName(self.label)
175 @classmethod
176 def makeMetadataDatasetName(cls, label: str) -> str:
177 """Construct the name of the dataset type for metadata for a task.
179 Parameters
180 ----------
181 label : `str`
182 Label for the task within its pipeline.
184 Returns
185 -------
186 name : `str`
187 Name of the task's metadata dataset type.
188 """
189 return acc.METADATA_OUTPUT_TEMPLATE.format(label=label)
191 @property
192 def logOutputDatasetName(self) -> str | None:
193 """Name of a dataset type for log output from this task, `None` if
194 logs are not to be saved (`str`)
195 """
196 if self.config.saveLogOutput:
197 return acc.LOG_OUTPUT_TEMPLATE.format(label=self.label)
198 else:
199 return None
201 def __str__(self) -> str:
202 rep = "TaskDef(" + self.taskName
203 if self.label:
204 rep += ", label=" + self.label
205 rep += ")"
206 return rep
208 def __eq__(self, other: object) -> bool:
209 if not isinstance(other, TaskDef):
210 return False
211 # This does not consider equality of configs when determining equality
212 # as config equality is a difficult thing to define. Should be updated
213 # after DM-27847
214 return self.taskClass == other.taskClass and self.label == other.label
216 def __hash__(self) -> int:
217 return hash((self.taskClass, self.label))
219 @classmethod
220 def _unreduce(cls, taskName: str, config: PipelineTaskConfig, label: str) -> TaskDef:
221 """Custom callable for unpickling.
223 All arguments are forwarded directly to the constructor; this
224 trampoline is only needed because ``__reduce__`` callables can't be
225 called with keyword arguments.
226 """
227 return cls(taskName=taskName, config=config, label=label)
229 def __reduce__(self) -> tuple[Callable[[str, PipelineTaskConfig, str], TaskDef], tuple[str, Config, str]]:
230 return (self._unreduce, (self.taskName, self.config, self.label))
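
# A minimal sketch of building a TaskDef directly from a task class;
# ``MyTask`` and its module path are hypothetical stand-ins for a real
# PipelineTask subclass.

from lsst.pipe.base.pipeline import TaskDef
from mypackage.tasks import MyTask  # hypothetical PipelineTask subclass

task_def = TaskDef(taskClass=MyTask, label="myTask")
print(task_def.taskName)             # fully-qualified class name derived from MyTask
print(task_def.configDatasetName)    # per-label config init-output dataset type name
print(task_def.metadataDatasetName)  # per-label metadata output dataset type name
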
233class Pipeline:
234 """A `Pipeline` is a representation of a series of tasks to run, and the
235 configuration for those tasks.
237 Parameters
238 ----------
239 description : `str`
240 A description of what this pipeline does.
241 """
243 def __init__(self, description: str):
244 pipeline_dict = {"description": description, "tasks": {}}
245 self._pipelineIR = pipelineIR.PipelineIR(pipeline_dict)
247 @classmethod
248 def fromFile(cls, filename: str) -> Pipeline:
249 """Load a pipeline defined in a pipeline yaml file.
251 Parameters
252 ----------
253 filename: `str`
254 A path that points to a pipeline defined in yaml format. This
255 filename may also supply additional labels to be used in
256 subsetting the loaded Pipeline. These labels are separated from
257 the path by a ``#``, and may be specified as a comma separated
258 list, or a range denoted as beginning..end. Beginning or end may
259 be empty, in which case the range will be a half open interval.
260 Unlike python iteration bounds, end bounds are *INCLUDED*. Note
261 that range based selection is not well defined for pipelines that
262 are not linear in nature, and correct behavior is not guaranteed,
263 or may vary from run to run.
265 Returns
266 -------
267 pipeline: `Pipeline`
268 The pipeline loaded from specified location with appropriate (if
269 any) subsetting.
271 Notes
272 -----
273 This method attempts to prune any contracts that contain labels which
274 are not in the declared subset of labels. This pruning is done using a
275 string based matching due to the nature of contracts and may prune more
276 than it should.
277 """
278 return cls.from_uri(filename)
280 @classmethod
281 def from_uri(cls, uri: ResourcePathExpression) -> Pipeline:
282 """Load a pipeline defined in a pipeline yaml file at a location
283 specified by a URI.
285 Parameters
286 ----------
287 uri : convertible to `~lsst.resources.ResourcePath`
288 If a string is supplied this should be a URI path that points to a
289 pipeline defined in yaml format, either as a direct path to the
290 yaml file, or as a directory containing a ``pipeline.yaml`` file
291 (the form used by `write_to_uri` with ``expand=True``). This URI may
292 also supply additional labels to be used in subsetting the loaded
293 `Pipeline`. These labels are separated from the path by a ``#``,
294 and may be specified as a comma separated list, or a range denoted
295 as beginning..end. Beginning or end may be empty, in which case the
296 range will be a half open interval. Unlike python iteration bounds,
297 end bounds are *INCLUDED*. Note that range based selection is not
298 well defined for pipelines that are not linear in nature, and
299 correct behavior is not guaranteed, or may vary from run to run.
300 The same specifiers can be used with a
301 `~lsst.resources.ResourcePath` object, by being the sole contents
302 in the fragments attribute.
304 Returns
305 -------
306 pipeline : `Pipeline`
307 The pipeline loaded from specified location with appropriate (if
308 any) subsetting.
310 Notes
311 -----
312 This method attempts to prune any contracts that contain labels which
313 are not in the declared subset of labels. This pruning is done using a
314 string based matching due to the nature of contracts and may prune more
315 than it should.
316 """
317 # Split up the uri and any labels that were supplied
318 uri, label_specifier = cls._parse_file_specifier(uri)
319 pipeline: Pipeline = cls.fromIR(pipelineIR.PipelineIR.from_uri(uri))
321 # If there are labels supplied, only keep those
322 if label_specifier is not None:
323 pipeline = pipeline.subsetFromLabels(label_specifier)
324 return pipeline
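
# A minimal sketch of the URI fragment syntax accepted above; the file path
# and labels are hypothetical.

from lsst.pipe.base.pipeline import Pipeline

full = Pipeline.from_uri("pipelines/DRP.yaml")                   # whole pipeline
pair = Pipeline.from_uri("pipelines/DRP.yaml#isr,calibrate")     # explicit label list
ranged = Pipeline.from_uri("pipelines/DRP.yaml#isr..calibrate")  # inclusive label range
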
326 def subsetFromLabels(self, labelSpecifier: LabelSpecifier) -> Pipeline:
327 """Subset a pipeline to contain only labels specified in ``labelSpecifier``.
329 Parameters
330 ----------
331 labelSpecifier : `LabelSpecifier`
332 Object containing labels that describes how to subset a pipeline.
334 Returns
335 -------
336 pipeline : `Pipeline`
337 A new pipeline object that is a subset of the old pipeline
339 Raises
340 ------
341 ValueError
342 Raised if there is an issue with specified labels
344 Notes
345 -----
346 This method attempts to prune any contracts that contain labels which
347 are not in the declared subset of labels. This pruning is done using a
348 string based matching due to the nature of contracts and may prune more
349 than it should.
350 """
351 # Labels supplied as a set
352 if labelSpecifier.labels:
353 labelSet = labelSpecifier.labels
354 # Labels supplied as a range, first create a list of all the labels
355 # in the pipeline sorted according to task dependency. Then only
356 # keep labels that lie between the supplied bounds
357 else:
358 # Create a copy of the pipeline to use when assessing the label
359 # ordering. Use a dict for fast searching while preserving order.
360 # Remove contracts so they do not fail in the expansion step. This
361 # is needed because a user may only configure the tasks they intend
362 # to run, which may cause some contracts to fail if they will later
363 # be dropped
364 pipeline = copy.deepcopy(self)
365 pipeline._pipelineIR.contracts = []
366 labels = {taskdef.label: True for taskdef in pipeline.toExpandedPipeline()}
368 # Verify the bounds are in the labels
369 if labelSpecifier.begin is not None:
370 if labelSpecifier.begin not in labels:
371 raise ValueError(
372 f"Beginning of range subset, {labelSpecifier.begin}, not found in pipeline definition"
373 )
374 if labelSpecifier.end is not None:
375 if labelSpecifier.end not in labels:
376 raise ValueError(
377 f"End of range subset, {labelSpecifier.end}, not found in pipeline definition"
378 )
380 labelSet = set()
381 for label in labels:
382 if labelSpecifier.begin is not None:
383 if label != labelSpecifier.begin:
384 continue
385 else:
386 labelSpecifier.begin = None
387 labelSet.add(label)
388 if labelSpecifier.end is not None and label == labelSpecifier.end:
389 break
390 return Pipeline.fromIR(self._pipelineIR.subset_from_labels(labelSet))
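
# The same subsetting can be done on an in-memory pipeline; the labels and
# file path remain hypothetical.

from lsst.pipe.base.pipeline import LabelSpecifier, Pipeline

pipeline = Pipeline.from_uri("pipelines/DRP.yaml")
subset = pipeline.subsetFromLabels(LabelSpecifier(begin="isr", end="calibrate"))
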
392 @staticmethod
393 def _parse_file_specifier(uri: ResourcePathExpression) -> tuple[ResourcePath, LabelSpecifier | None]:
394 """Split apart a URI and any possible label subsets."""
395 if isinstance(uri, str):
396 # This is to support legacy pipelines during transition
397 uri, num_replace = re.subn("[:](?!\\/\\/)", "#", uri)
398 if num_replace:
399 raise ValueError(
400 f"The pipeline file {uri} seems to use the legacy :"
401 " to separate labels, please use # instead."
402 )
403 if uri.count("#") > 1:
404 raise ValueError("Only one set of labels is allowed when specifying a pipeline to load")
405 # Everything else can be converted directly to ResourcePath.
406 uri = ResourcePath(uri)
407 label_subset = uri.fragment or None
409 specifier: LabelSpecifier | None
410 if label_subset is not None:
411 label_subset = urllib.parse.unquote(label_subset)
412 args: dict[str, set[str] | str | None]
413 # labels supplied as a list
414 if "," in label_subset:
415 if ".." in label_subset:
416 raise ValueError(
417 "Can only specify a list of labels or a range when loading a Pipeline, not both"
418 )
419 args = {"labels": set(label_subset.split(","))}
420 # labels supplied as a range
421 elif ".." in label_subset:
422 # Try to de-structure the labelSubset, this will fail if more
423 # than one range is specified
424 begin, end, *rest = label_subset.split("..")
425 if rest:
426 raise ValueError("Only one range can be specified when loading a pipeline")
427 args = {"begin": begin if begin else None, "end": end if end else None}
428 # Assume anything else is a single label
429 else:
430 args = {"labels": {label_subset}}
432 # MyPy doesn't like how cavalier kwarg construction is with types.
433 specifier = LabelSpecifier(**args) # type: ignore
434 else:
435 specifier = None
437 return uri, specifier
439 @classmethod
440 def fromString(cls, pipeline_string: str) -> Pipeline:
441 """Create a pipeline from string formatted as a pipeline document.
443 Parameters
444 ----------
445 pipeline_string : `str`
446 A string formatted like a pipeline document.
448 Returns
449 -------
450 pipeline: `Pipeline`
451 """
452 pipeline = cls.fromIR(pipelineIR.PipelineIR.from_string(pipeline_string))
453 return pipeline
455 @classmethod
456 def fromIR(cls, deserialized_pipeline: pipelineIR.PipelineIR) -> Pipeline:
457 """Create a pipeline from an already created `PipelineIR` object.
459 Parameters
460 ----------
461 deserialized_pipeline: `PipelineIR`
462 An already created pipeline intermediate representation object
464 Returns
465 -------
466 pipeline: `Pipeline`
467 """
468 pipeline = cls.__new__(cls)
469 pipeline._pipelineIR = deserialized_pipeline
470 return pipeline
472 @classmethod
473 def fromPipeline(cls, pipeline: Pipeline) -> Pipeline:
474 """Create a new pipeline by copying an already existing `Pipeline`.
476 Parameters
477 ----------
478 pipeline : `Pipeline`
479 The existing `Pipeline` object to copy.
481 Returns
482 -------
483 pipeline: `Pipeline`
484 """
485 return cls.fromIR(copy.deepcopy(pipeline._pipelineIR))
487 def __str__(self) -> str:
488 return str(self._pipelineIR)
490 def mergePipeline(self, pipeline: Pipeline) -> None:
491 """Merge another in-memory `Pipeline` object into this one.
493 This merges another pipeline into this object, as if it were declared
494 in the import block of the yaml definition of this pipeline. This
495 modifies this pipeline in place.
497 Parameters
498 ----------
499 pipeline : `Pipeline`
500 The `Pipeline` object that is to be merged into this object.
501 """
502 self._pipelineIR.merge_pipelines((pipeline._pipelineIR,))
504 def addLabelToSubset(self, subset: str, label: str) -> None:
505 """Add a task label to the specified subset.
507 Parameters
508 ----------
509 subset : `str`
510 The labeled subset to modify
511 label : `str`
512 The task label to add to the specified subset.
514 Raises
515 ------
516 ValueError
517 Raised if the specified subset does not exist within the pipeline.
518 Raised if the specified label does not exist within the pipeline.
519 """
520 if label not in self._pipelineIR.tasks:
521 raise ValueError(f"Label {label} does not appear within the pipeline")
522 if subset not in self._pipelineIR.labeled_subsets:
523 raise ValueError(f"Subset {subset} does not appear within the pipeline")
524 self._pipelineIR.labeled_subsets[subset].subset.add(label)
526 def removeLabelFromSubset(self, subset: str, label: str) -> None:
527 """Remove a task label from the specified subset.
529 Parameters
530 ----------
531 subset : `str`
532 The labeled subset to modify
533 label : `str`
534 The task label to remove from the specified subset.
536 Raises
537 ------
538 ValueError
539 Raised if the specified subset does not exist in the pipeline.
540 Raised if the specified label does not exist within the specified
541 subset.
542 """
543 if subset not in self._pipelineIR.labeled_subsets:
544 raise ValueError(f"Subset {subset} does not appear within the pipeline")
545 if label not in self._pipelineIR.labeled_subsets[subset].subset:
546 raise ValueError(f"Label {label} does not appear within the pipeline")
547 self._pipelineIR.labeled_subsets[subset].subset.remove(label)
549 def findSubsetsWithLabel(self, label: str) -> set[str]:
550 """Find any subsets which may contain the specified label.
552 This function returns the names of subsets which contain the specified
553 label. It may return an empty set if there are no subsets, or no
554 subsets containing the specified label.
556 Parameters
557 ----------
558 label : `str`
559 The task label to use in membership check
561 Returns
562 -------
563 subsets : `set` of `str`
564 Returns a set (possibly empty) of subset names which contain the
565 specified label.
567 Raises
568 ------
569 ValueError
570 Raised if the specified label does not exist within this pipeline.
571 """
572 results = set()
573 if label not in self._pipelineIR.tasks:
574 raise ValueError(f"Label {label} does not appear within the pipeline")
575 for subset in self._pipelineIR.labeled_subsets.values():
576 if label in subset.subset:
577 results.add(subset.label)
578 return results
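
# A sketch of labeled-subset bookkeeping; the subset name, task label, and
# file path are hypothetical and must exist in the pipeline definition.

from lsst.pipe.base.pipeline import Pipeline

pipeline = Pipeline.from_uri("pipelines/DRP.yaml")
pipeline.addLabelToSubset("nightlyQA", "calibrate")
print(pipeline.findSubsetsWithLabel("calibrate"))  # set of subset names, e.g. {"nightlyQA"}
pipeline.removeLabelFromSubset("nightlyQA", "calibrate")
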
580 def addInstrument(self, instrument: Instrument | str) -> None:
581 """Add an instrument to the pipeline, or replace an instrument that is
582 already defined.
584 Parameters
585 ----------
586 instrument : `~lsst.obs.base.Instrument` or `str`
587 Either a subclass of `lsst.obs.base.Instrument` or
588 a string corresponding to a fully qualified
589 `lsst.obs.base.Instrument` name.
590 """
591 if isinstance(instrument, str):
592 pass
593 else:
594 # TODO: assume that this is a subclass of Instrument, no type
595 # checking
596 instrument = get_full_type_name(instrument)
597 self._pipelineIR.instrument = instrument
599 def getInstrument(self) -> str | None:
600 """Get the instrument from the pipeline.
602 Returns
603 -------
604 instrument : `str` or `None`
605 The fully qualified name of a `lsst.obs.base.Instrument` subclass,
606 or `None` if the pipeline does not have an instrument.
607 """
608 return self._pipelineIR.instrument
610 def get_data_id(self, universe: DimensionUniverse) -> DataCoordinate:
611 """Return a data ID with all dimension constraints embedded in the
612 pipeline.
614 Parameters
615 ----------
616 universe : `lsst.daf.butler.DimensionUniverse`
617 Object that defines all dimensions.
619 Returns
620 -------
621 data_id : `lsst.daf.butler.DataCoordinate`
622 Data ID with all dimension constraints embedded in the
623 pipeline.
624 """
625 instrument_class_name = self._pipelineIR.instrument
626 if instrument_class_name is not None:
627 instrument_class = cast(PipeBaseInstrument, doImportType(instrument_class_name))
628 if instrument_class is not None:
629 return DataCoordinate.standardize(instrument=instrument_class.getName(), universe=universe)
630 return DataCoordinate.makeEmpty(universe)
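
# A sketch of deriving the pipeline-level data ID; assumes the default
# daf_butler dimension universe can be constructed directly and uses a
# hypothetical pipeline file.

from lsst.daf.butler import DimensionUniverse
from lsst.pipe.base.pipeline import Pipeline

pipeline = Pipeline.from_uri("pipelines/DRP.yaml")
data_id = pipeline.get_data_id(DimensionUniverse())
print(data_id)  # empty unless the pipeline declares an instrument
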
632 def addTask(self, task: type[PipelineTask] | str, label: str) -> None:
633 """Add a new task to the pipeline, or replace a task that is already
634 associated with the supplied label.
636 Parameters
637 ----------
638 task: `PipelineTask` or `str`
639 Either a derived class object of a `PipelineTask` or a string
640 corresponding to a fully qualified `PipelineTask` name.
641 label: `str`
642 A label that is used to identify the `PipelineTask` being added
643 """
644 if isinstance(task, str):
645 taskName = task
646 elif issubclass(task, PipelineTask):
647 taskName = get_full_type_name(task)
648 else:
649 raise ValueError(
650 "task must be either a child class of PipelineTask or a string containing"
651 " a fully qualified name to one"
652 )
653 if not label:
654 # In some cases (e.g. a command-line-generated pipeline) tasks can
655 # be defined without a label, which is not acceptable; use the task's
656 # _DefaultName in that case.
657 if isinstance(task, str):
658 task_class = cast(PipelineTask, doImportType(task))
659 label = task_class._DefaultName
660 self._pipelineIR.tasks[label] = pipelineIR.TaskIR(label, taskName)
662 def removeTask(self, label: str) -> None:
663 """Remove a task from the pipeline.
665 Parameters
666 ----------
667 label : `str`
668 The label used to identify the task that is to be removed
670 Raises
671 ------
672 KeyError
673 If no task with that label exists in the pipeline
675 """
676 self._pipelineIR.tasks.pop(label)
678 def addConfigOverride(self, label: str, key: str, value: object) -> None:
679 """Apply single config override.
681 Parameters
682 ----------
683 label : `str`
684 Label of the task.
685 key: `str`
686 Fully-qualified field name.
687 value : object
688 Value to be given to a field.
689 """
690 self._addConfigImpl(label, pipelineIR.ConfigIR(rest={key: value}))
692 def addConfigFile(self, label: str, filename: str) -> None:
693 """Add overrides from a specified file.
695 Parameters
696 ----------
697 label : `str`
698 The label used to identify the task associated with config to
699 modify
700 filename : `str`
701 Path to the override file.
702 """
703 self._addConfigImpl(label, pipelineIR.ConfigIR(file=[filename]))
705 def addConfigPython(self, label: str, pythonString: str) -> None:
706 """Add Overrides by running a snippet of python code against a config.
708 Parameters
709 ----------
710 label : `str`
711 The label used to identify the task associated with the config to
712 modify.
713 pythonString: `str`
714 A string of valid Python code to be executed. This is done
715 with ``config`` as the only accessible local value.
716 """
717 self._addConfigImpl(label, pipelineIR.ConfigIR(python=pythonString))
719 def _addConfigImpl(self, label: str, newConfig: pipelineIR.ConfigIR) -> None:
720 if label == "parameters":
721 self._pipelineIR.parameters.mapping.update(newConfig.rest)
722 if newConfig.file:
723 raise ValueError("Setting parameters section with config file is not supported")
724 if newConfig.python:
725 raise ValueError("Setting parameters section using python block is unsupported")
726 return
727 if label not in self._pipelineIR.tasks:
728 raise LookupError(f"There are no tasks labeled '{label}' in the pipeline")
729 self._pipelineIR.tasks[label].add_or_update_config(newConfig)
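
# A sketch of building and configuring a pipeline programmatically; the task
# class path, config fields, override file, and output path are hypothetical.

from lsst.pipe.base.pipeline import Pipeline

pipeline = Pipeline("Example pipeline built in memory")
pipeline.addTask("mypackage.tasks.MyTask", label="myTask")
pipeline.addConfigOverride("myTask", "threshold", 5.0)
pipeline.addConfigFile("myTask", "overrides/myTask.py")
pipeline.addConfigPython("myTask", "config.doSomething = True")
pipeline.write_to_uri("pipelines/example.yaml")
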
731 def write_to_uri(self, uri: ResourcePathExpression) -> None:
732 """Write the pipeline to a file or directory.
734 Parameters
735 ----------
736 uri : convertible to `~lsst.resources.ResourcePath`
737 URI to write to; may have any scheme with
738 `~lsst.resources.ResourcePath` write support or no scheme for a
739 local file/directory. Should have a ``.yaml`` extension.
740 """
741 self._pipelineIR.write_to_uri(uri)
743 def toExpandedPipeline(self) -> Generator[TaskDef, None, None]:
744 r"""Return a generator of `TaskDef`\s which can be used to create
745 quantum graphs.
747 Returns
748 -------
749 generator : generator of `TaskDef`
750 The generator returned will be the sorted iterator of tasks which
751 are to be used in constructing a quantum graph.
753 Raises
754 ------
755 NotImplementedError
756 Raised if a dataId is supplied in a config block. This is in place
757 for future use.
758 """
759 taskDefs = []
760 for label in self._pipelineIR.tasks:
761 taskDefs.append(self._buildTaskDef(label))
763 # Let's evaluate the contracts.
764 if self._pipelineIR.contracts is not None:
765 label_to_config = {x.label: x.config for x in taskDefs}
766 for contract in self._pipelineIR.contracts:
767 # Execute this in its own line so it can raise a good error
768 # message if there were problems with the eval.
769 success = eval(contract.contract, None, label_to_config)
770 if not success:
771 extra_info = f": {contract.msg}" if contract.msg is not None else ""
772 raise pipelineIR.ContractError(
773 f"Contract(s) '{contract.contract}' were not satisfied{extra_info}"
774 )
776 taskDefs = sorted(taskDefs, key=lambda x: x.label)
777 yield from pipeTools.orderPipeline(taskDefs)
779 def _buildTaskDef(self, label: str) -> TaskDef:
780 if (taskIR := self._pipelineIR.tasks.get(label)) is None:
781 raise NameError(f"Label {label} does not appear in this pipeline")
782 taskClass: type[PipelineTask] = doImportType(taskIR.klass)
783 taskName = get_full_type_name(taskClass)
784 config = taskClass.ConfigClass()
785 instrument: PipeBaseInstrument | None = None
786 if (instrumentName := self._pipelineIR.instrument) is not None:
787 instrument_cls: type = doImportType(instrumentName)
788 instrument = instrument_cls()
789 config.applyConfigOverrides(
790 instrument,
791 getattr(taskClass, "_DefaultName", ""),
792 taskIR.config,
793 self._pipelineIR.parameters,
794 label,
795 )
796 return TaskDef(taskName=taskName, config=config, taskClass=taskClass, label=label)
798 def __iter__(self) -> Generator[TaskDef, None, None]:
799 return self.toExpandedPipeline()
801 def __getitem__(self, item: str) -> TaskDef:
802 return self._buildTaskDef(item)
804 def __len__(self) -> int:
805 return len(self._pipelineIR.tasks)
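
# A sketch of the container-style access defined above; expanding the
# pipeline imports and validates every task configuration, so the referenced
# task classes must be importable. The file path and label are hypothetical.

from lsst.pipe.base.pipeline import Pipeline

pipeline = Pipeline.from_uri("pipelines/DRP.yaml")
print(len(pipeline))              # number of task labels defined
task_def = pipeline["calibrate"]  # build a single TaskDef by label
for task_def in pipeline:         # dependency-sorted iteration over all TaskDefs
    print(task_def.label, task_def.taskName)
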
807 def __eq__(self, other: object) -> bool:
808 if not isinstance(other, Pipeline):
809 return False
810 elif self._pipelineIR == other._pipelineIR:
811 # Shortcut: if the IR is the same, the expanded pipeline must be
812 # the same as well. But the converse is not true.
813 return True
814 else:
815 self_expanded = {td.label: (td.taskClass,) for td in self}
816 other_expanded = {td.label: (td.taskClass,) for td in other}
817 if self_expanded != other_expanded:
818 return False
819 # After DM-27847, we should compare configuration here, or better,
820 # delegated to TaskDef.__eq__ after making that compare configurations.
821 raise NotImplementedError(
822 "Pipelines cannot be compared because config instances cannot be compared; see DM-27847."
823 )
826@dataclass(frozen=True)
827class TaskDatasetTypes:
828 """An immutable struct that extracts and classifies the dataset types used
829 by a `PipelineTask`
830 """
832 initInputs: NamedValueSet[DatasetType]
833 """Dataset types that are needed as inputs in order to construct this Task.
835 Task-level `initInputs` may be classified as either
836 `~PipelineDatasetTypes.initInputs` or
837 `~PipelineDatasetTypes.initIntermediates` at the Pipeline level.
838 """
840 initOutputs: NamedValueSet[DatasetType]
841 """Dataset types that may be written after constructing this Task.
843 Task-level `initOutputs` may be classified as either
844 `~PipelineDatasetTypes.initOutputs` or
845 `~PipelineDatasetTypes.initIntermediates` at the Pipeline level.
846 """
848 inputs: NamedValueSet[DatasetType]
849 """Dataset types that are regular inputs to this Task.
851 If an input dataset needed for a Quantum cannot be found in the input
852 collection(s) or produced by another Task in the Pipeline, that Quantum
853 (and all dependent Quanta) will not be produced.
855 Task-level `inputs` may be classified as either
856 `~PipelineDatasetTypes.inputs` or `~PipelineDatasetTypes.intermediates`
857 at the Pipeline level.
858 """
860 queryConstraints: NamedValueSet[DatasetType]
861 """Regular inputs that should be used as constraints on the initial
862 QuantumGraph generation data ID query, according to their tasks
863 (`NamedValueSet`).
864 """
866 prerequisites: NamedValueSet[DatasetType]
867 """Dataset types that are prerequisite inputs to this Task.
869 Prerequisite inputs must exist in the input collection(s) before the
870 pipeline is run, but do not constrain the graph - if a prerequisite is
871 missing for a Quantum, `PrerequisiteMissingError` is raised.
873 Prerequisite inputs are not resolved until the second stage of
874 QuantumGraph generation.
875 """
877 outputs: NamedValueSet[DatasetType]
878 """Dataset types that are produced by this Task.
880 Task-level `outputs` may be classified as either
881 `~PipelineDatasetTypes.outputs` or `~PipelineDatasetTypes.intermediates`
882 at the Pipeline level.
883 """
885 @classmethod
886 def fromTaskDef(
887 cls,
888 taskDef: TaskDef,
889 *,
890 registry: Registry,
891 include_configs: bool = True,
892 storage_class_mapping: Mapping[str, str] | None = None,
893 ) -> TaskDatasetTypes:
894 """Extract and classify the dataset types from a single `PipelineTask`.
896 Parameters
897 ----------
898 taskDef: `TaskDef`
899 An instance of a `TaskDef` class for a particular `PipelineTask`.
900 registry: `Registry`
901 Registry used to construct normalized
902 `~lsst.daf.butler.DatasetType` objects and retrieve those that are
903 incomplete.
904 include_configs : `bool`, optional
905 If `True` (default) include config dataset types as
906 ``initOutputs``.
907 storage_class_mapping : `~collections.abc.Mapping` of `str` to \
908 `~lsst.daf.butler.StorageClass`, optional
909 If a taskdef contains a component dataset type that is unknown
910 to the registry, its parent `~lsst.daf.butler.StorageClass` will
911 be looked up in this mapping if it is supplied. If the mapping does
912 not contain the composite dataset type, or the mapping is not
913 supplied, an exception will be raised.
915 Returns
916 -------
917 types: `TaskDatasetTypes`
918 The dataset types used by this task.
920 Raises
921 ------
922 ValueError
923 Raised if dataset type connection definition differs from
924 registry definition.
925 LookupError
926 Raised if component parent StorageClass could not be determined
927 and storage_class_mapping does not contain the composite type, or
928 is set to None.
929 """
931 def makeDatasetTypesSet(
932 connectionType: str,
933 is_input: bool,
934 freeze: bool = True,
935 ) -> NamedValueSet[DatasetType]:
936 """Construct a set of true `~lsst.daf.butler.DatasetType` objects.
938 Parameters
939 ----------
940 connectionType : `str`
941 Name of the connection type to produce a set for, corresponds
942 to an attribute of type `list` on the connection class instance
943 is_input : `bool`
944 If `True`, these are input dataset types; otherwise they are
945 output dataset types.
946 freeze : `bool`, optional
947 If `True`, call `NamedValueSet.freeze` on the object returned.
949 Returns
950 -------
951 datasetTypes : `NamedValueSet`
952 A set of all datasetTypes which correspond to the input
953 connection type specified in the connection class of this
954 `PipelineTask`
956 Raises
957 ------
958 ValueError
959 Raised if dataset type connection definition differs from
960 registry definition.
961 LookupError
962 Raised if component parent StorageClass could not be determined
963 and storage_class_mapping does not contain the composite type,
964 or is set to None.
966 Notes
967 -----
968 This function is a closure over the variables ``registry``,
969 ``taskDef``, and ``storage_class_mapping``.
970 """
971 datasetTypes = NamedValueSet[DatasetType]()
972 for c in iterConnections(taskDef.connections, connectionType):
973 dimensions = set(getattr(c, "dimensions", set()))
974 if "skypix" in dimensions:
975 try:
976 datasetType = registry.getDatasetType(c.name)
977 except LookupError as err:
978 raise LookupError(
979 f"DatasetType '{c.name}' referenced by "
980 f"{type(taskDef.connections).__name__} uses 'skypix' as a dimension "
981 "placeholder, but does not already exist in the registry. "
982 "Note that reference catalog names are now used as the dataset "
983 "type name instead of 'ref_cat'."
984 ) from err
985 rest1 = set(registry.dimensions.extract(dimensions - set(["skypix"])).names)
986 rest2 = set(
987 dim.name for dim in datasetType.dimensions if not isinstance(dim, SkyPixDimension)
988 )
989 if rest1 != rest2:
990 raise ValueError(
991 f"Non-skypix dimensions for dataset type {c.name} declared in "
992 f"connections ({rest1}) are inconsistent with those in "
993 f"registry's version of this dataset ({rest2})."
994 )
995 else:
996 # Component dataset types are not explicitly in the
997 # registry. This complicates consistency checks with
998 # registry and requires we work out the composite storage
999 # class.
1000 registryDatasetType = None
1001 try:
1002 registryDatasetType = registry.getDatasetType(c.name)
1003 except KeyError:
1004 compositeName, componentName = DatasetType.splitDatasetTypeName(c.name)
1005 if componentName:
1006 if storage_class_mapping is None or compositeName not in storage_class_mapping:
1007 raise LookupError(
1008 "Component parent class cannot be determined, and "
1009 "composite name was not in storage class mapping, or no "
1010 "storage_class_mapping was supplied"
1011 )
1012 else:
1013 parentStorageClass = storage_class_mapping[compositeName]
1014 else:
1015 parentStorageClass = None
1016 datasetType = c.makeDatasetType(
1017 registry.dimensions, parentStorageClass=parentStorageClass
1018 )
1019 registryDatasetType = datasetType
1020 else:
1021 datasetType = c.makeDatasetType(
1022 registry.dimensions, parentStorageClass=registryDatasetType.parentStorageClass
1023 )
1025 if registryDatasetType and datasetType != registryDatasetType:
1026 # The dataset types differ but first check to see if
1027 # they are compatible before raising.
1028 if is_input:
1029 # This DatasetType must be compatible on get.
1030 is_compatible = datasetType.is_compatible_with(registryDatasetType)
1031 else:
1032 # Has to be able to be converted to the expected type
1033 # on put.
1034 is_compatible = registryDatasetType.is_compatible_with(datasetType)
1035 if is_compatible:
1036 # For inputs we want the pipeline to use the
1037 # pipeline definition, for outputs it should use
1038 # the registry definition.
1039 if not is_input:
1040 datasetType = registryDatasetType
1041 _LOG.debug(
1042 "Dataset types differ (task %s != registry %s) but are compatible"
1043 " for %s in %s.",
1044 datasetType,
1045 registryDatasetType,
1046 "input" if is_input else "output",
1047 taskDef.label,
1048 )
1049 else:
1050 try:
1051 # Explicitly check for storage class just to
1052 # make more specific message.
1053 _ = datasetType.storageClass
1054 except KeyError:
1055 raise ValueError(
1056 "Storage class does not exist for supplied dataset type "
1057 f"{datasetType} for {taskDef.label}."
1058 ) from None
1059 raise ValueError(
1060 f"Supplied dataset type ({datasetType}) inconsistent with "
1061 f"registry definition ({registryDatasetType}) "
1062 f"for {taskDef.label}."
1063 )
1064 datasetTypes.add(datasetType)
1065 if freeze:
1066 datasetTypes.freeze()
1067 return datasetTypes
1069 # optionally add initOutput dataset for config
1070 initOutputs = makeDatasetTypesSet("initOutputs", is_input=False, freeze=False)
1071 if include_configs:
1072 initOutputs.add(
1073 DatasetType(
1074 taskDef.configDatasetName,
1075 registry.dimensions.empty,
1076 storageClass="Config",
1077 )
1078 )
1079 initOutputs.freeze()
1081 # optionally add output dataset for metadata
1082 outputs = makeDatasetTypesSet("outputs", is_input=False, freeze=False)
1084 # Metadata is supposed to be of the TaskMetadata type, its dimensions
1085 # correspond to a task quantum.
1086 dimensions = registry.dimensions.extract(taskDef.connections.dimensions)
1088 # Allow the storage class definition to be read from the existing
1089 # dataset type definition if present.
1090 try:
1091 current = registry.getDatasetType(taskDef.metadataDatasetName)
1092 except KeyError:
1093 # No previous definition so use the default.
1094 storageClass = "TaskMetadata" if _TASK_METADATA_TYPE is TaskMetadata else "PropertySet"
1095 else:
1096 storageClass = current.storageClass.name
1097 outputs.update({DatasetType(taskDef.metadataDatasetName, dimensions, storageClass)})
1099 if taskDef.logOutputDatasetName is not None:
1100 # Log output dimensions correspond to a task quantum.
1101 dimensions = registry.dimensions.extract(taskDef.connections.dimensions)
1102 outputs.update({DatasetType(taskDef.logOutputDatasetName, dimensions, "ButlerLogRecords")})
1104 outputs.freeze()
1106 inputs = makeDatasetTypesSet("inputs", is_input=True)
1107 queryConstraints = NamedValueSet(
1108 inputs[c.name]
1109 for c in cast(Iterable[Input], iterConnections(taskDef.connections, "inputs"))
1110 if not c.deferGraphConstraint
1111 )
1113 return cls(
1114 initInputs=makeDatasetTypesSet("initInputs", is_input=True),
1115 initOutputs=initOutputs,
1116 inputs=inputs,
1117 queryConstraints=queryConstraints,
1118 prerequisites=makeDatasetTypesSet("prerequisiteInputs", is_input=True),
1119 outputs=outputs,
1120 )
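
# A sketch of classifying one task's dataset types against a butler registry;
# the repository path, pipeline file, and task label are hypothetical.

from lsst.daf.butler import Butler
from lsst.pipe.base.pipeline import Pipeline, TaskDatasetTypes

butler = Butler("/path/to/repo")
pipeline = Pipeline.from_uri("pipelines/DRP.yaml")
task_def = pipeline["calibrate"]
dataset_types = TaskDatasetTypes.fromTaskDef(task_def, registry=butler.registry)
print(sorted(dataset_types.inputs.names))
print(sorted(dataset_types.outputs.names))
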
1123@dataclass(frozen=True)
1124class PipelineDatasetTypes:
1125 """An immutable struct that classifies the dataset types used in a
1126 `Pipeline`.
1127 """
1129 packagesDatasetName: ClassVar[str] = "packages"
1130 """Name of a dataset type used to save package versions.
1131 """
1133 initInputs: NamedValueSet[DatasetType]
1134 """Dataset types that are needed as inputs in order to construct the Tasks
1135 in this Pipeline.
1137 This does not include dataset types that are produced when constructing
1138 other Tasks in the Pipeline (these are classified as `initIntermediates`).
1139 """
1141 initOutputs: NamedValueSet[DatasetType]
1142 """Dataset types that may be written after constructing the Tasks in this
1143 Pipeline.
1145 This does not include dataset types that are also used as inputs when
1146 constructing other Tasks in the Pipeline (these are classified as
1147 `initIntermediates`).
1148 """
1150 initIntermediates: NamedValueSet[DatasetType]
1151 """Dataset types that are both used when constructing one or more Tasks
1152 in the Pipeline and produced as a side-effect of constructing another
1153 Task in the Pipeline.
1154 """
1156 inputs: NamedValueSet[DatasetType]
1157 """Dataset types that are regular inputs for the full pipeline.
1159 If an input dataset needed for a Quantum cannot be found in the input
1160 collection(s), that Quantum (and all dependent Quanta) will not be
1161 produced.
1162 """
1164 queryConstraints: NamedValueSet[DatasetType]
1165 """Regular inputs that should be used as constraints on the initial
1166 QuantumGraph generation data ID query, according to their tasks
1167 (`NamedValueSet`).
1168 """
1170 prerequisites: NamedValueSet[DatasetType]
1171 """Dataset types that are prerequisite inputs for the full Pipeline.
1173 Prerequisite inputs must exist in the input collection(s) before the
1174 pipeline is run, but do not constrain the graph - if a prerequisite is
1175 missing for a Quantum, `PrerequisiteMissingError` is raised.
1177 Prerequisite inputs are not resolved until the second stage of
1178 QuantumGraph generation.
1179 """
1181 intermediates: NamedValueSet[DatasetType]
1182 """Dataset types that are output by one Task in the Pipeline and consumed
1183 as inputs by one or more other Tasks in the Pipeline.
1184 """
1186 outputs: NamedValueSet[DatasetType]
1187 """Dataset types that are output by a Task in the Pipeline and not consumed
1188 by any other Task in the Pipeline.
1189 """
1191 byTask: Mapping[str, TaskDatasetTypes]
1192 """Per-Task dataset types, keyed by label in the `Pipeline`.
1194 This is guaranteed to be zip-iterable with the `Pipeline` itself (assuming
1195 neither has been modified since the dataset types were extracted, of
1196 course).
1197 """
1199 @classmethod
1200 def fromPipeline(
1201 cls,
1202 pipeline: Pipeline | Iterable[TaskDef],
1203 *,
1204 registry: Registry,
1205 include_configs: bool = True,
1206 include_packages: bool = True,
1207 ) -> PipelineDatasetTypes:
1208 """Extract and classify the dataset types from all tasks in a
1209 `Pipeline`.
1211 Parameters
1212 ----------
1213 pipeline: `Pipeline` or `~collections.abc.Iterable` [ `TaskDef` ]
1214 A collection of tasks that can be run together.
1215 registry: `Registry`
1216 Registry used to construct normalized
1217 `~lsst.daf.butler.DatasetType` objects and retrieve those that are
1218 incomplete.
1219 include_configs : `bool`, optional
1220 If `True` (default) include config dataset types as
1221 ``initOutputs``.
1222 include_packages : `bool`, optional
1223 If `True` (default) include the dataset type for software package
1224 versions in ``initOutputs``.
1226 Returns
1227 -------
1228 types: `PipelineDatasetTypes`
1229 The dataset types used by this `Pipeline`.
1231 Raises
1232 ------
1233 ValueError
1234 Raised if Tasks are inconsistent about which datasets are marked
1235 prerequisite. This indicates that the Tasks cannot be run as part
1236 of the same `Pipeline`.
1237 """
1238 allInputs = NamedValueSet[DatasetType]()
1239 allOutputs = NamedValueSet[DatasetType]()
1240 allInitInputs = NamedValueSet[DatasetType]()
1241 allInitOutputs = NamedValueSet[DatasetType]()
1242 prerequisites = NamedValueSet[DatasetType]()
1243 queryConstraints = NamedValueSet[DatasetType]()
1244 byTask = dict()
1245 if include_packages:
1246 allInitOutputs.add(
1247 DatasetType(
1248 cls.packagesDatasetName,
1249 registry.dimensions.empty,
1250 storageClass="Packages",
1251 )
1252 )
1253 # create a list of TaskDefs in case the input is a generator
1254 pipeline = list(pipeline)
1256 # collect all the output dataset types
1257 typeStorageclassMap: dict[str, str] = {}
1258 for taskDef in pipeline:
1259 for outConnection in iterConnections(taskDef.connections, "outputs"):
1260 typeStorageclassMap[outConnection.name] = outConnection.storageClass
1262 for taskDef in pipeline:
1263 thisTask = TaskDatasetTypes.fromTaskDef(
1264 taskDef,
1265 registry=registry,
1266 include_configs=include_configs,
1267 storage_class_mapping=typeStorageclassMap,
1268 )
1269 allInitInputs.update(thisTask.initInputs)
1270 allInitOutputs.update(thisTask.initOutputs)
1271 allInputs.update(thisTask.inputs)
1272 # Inputs are query constraints if any task considers them a query
1273 # constraint.
1274 queryConstraints.update(thisTask.queryConstraints)
1275 prerequisites.update(thisTask.prerequisites)
1276 allOutputs.update(thisTask.outputs)
1277 byTask[taskDef.label] = thisTask
1278 if not prerequisites.isdisjoint(allInputs):
1279 raise ValueError(
1280 "{} marked as both prerequisites and regular inputs".format(
1281 {dt.name for dt in allInputs & prerequisites}
1282 )
1283 )
1284 if not prerequisites.isdisjoint(allOutputs):
1285 raise ValueError(
1286 "{} marked as both prerequisites and outputs".format(
1287 {dt.name for dt in allOutputs & prerequisites}
1288 )
1289 )
1290 # Make sure that components which are marked as inputs get treated as
1291 # intermediates if there is an output which produces the composite
1292 # containing the component
1293 intermediateComponents = NamedValueSet[DatasetType]()
1294 intermediateComposites = NamedValueSet[DatasetType]()
1295 outputNameMapping = {dsType.name: dsType for dsType in allOutputs}
1296 for dsType in allInputs:
1297 # get the name of a possible component
1298 name, component = dsType.nameAndComponent()
1299 # if there is a component name, that means this is a component
1300 # DatasetType, if there is an output which produces the parent of
1301 # this component, treat this input as an intermediate
1302 if component is not None:
1303 # This needs to be in this if block, because someone might have
1304 # a composite that is a pure input from existing data
1305 if name in outputNameMapping:
1306 intermediateComponents.add(dsType)
1307 intermediateComposites.add(outputNameMapping[name])
1309 def checkConsistency(a: NamedValueSet, b: NamedValueSet) -> None:
1310 common = a.names & b.names
1311 for name in common:
1312 # Any compatibility is allowed. This function does not know
1313 # if a dataset type is to be used for input or output.
1314 if not (a[name].is_compatible_with(b[name]) or b[name].is_compatible_with(a[name])):
1315 raise ValueError(f"Conflicting definitions for dataset type: {a[name]} != {b[name]}.")
1317 checkConsistency(allInitInputs, allInitOutputs)
1318 checkConsistency(allInputs, allOutputs)
1319 checkConsistency(allInputs, intermediateComposites)
1320 checkConsistency(allOutputs, intermediateComposites)
1322 def frozen(s: Set[DatasetType]) -> NamedValueSet[DatasetType]:
1323 assert isinstance(s, NamedValueSet)
1324 s.freeze()
1325 return s
1327 inputs = frozen(allInputs - allOutputs - intermediateComponents)
1329 return cls(
1330 initInputs=frozen(allInitInputs - allInitOutputs),
1331 initIntermediates=frozen(allInitInputs & allInitOutputs),
1332 initOutputs=frozen(allInitOutputs - allInitInputs),
1333 inputs=inputs,
1334 queryConstraints=frozen(queryConstraints & inputs),
1335 # If there are storage class differences in inputs and outputs
1336 # the intermediates have to choose priority. Here choose that
1337 # inputs to tasks must match the requested storage class by
1338 # applying the inputs over the top of the outputs.
1339 intermediates=frozen(allOutputs & allInputs | intermediateComponents),
1340 outputs=frozen(allOutputs - allInputs - intermediateComposites),
1341 prerequisites=frozen(prerequisites),
1342 byTask=MappingProxyType(byTask), # MappingProxyType -> frozen view of dict for immutability
1343 )
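
# A sketch of the pipeline-level classification; the repository path and
# pipeline file are the same hypothetical placeholders used above.

from lsst.daf.butler import Butler
from lsst.pipe.base.pipeline import Pipeline, PipelineDatasetTypes

butler = Butler("/path/to/repo")
pipeline = Pipeline.from_uri("pipelines/DRP.yaml")
dataset_types = PipelineDatasetTypes.fromPipeline(pipeline, registry=butler.registry)
print(sorted(dataset_types.inputs.names))         # pure overall inputs
print(sorted(dataset_types.intermediates.names))  # produced and consumed internally
print(sorted(dataset_types.outputs.names))        # final outputs
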
1345 @classmethod
1346 def initOutputNames(
1347 cls,
1348 pipeline: Pipeline | Iterable[TaskDef],
1349 *,
1350 include_configs: bool = True,
1351 include_packages: bool = True,
1352 ) -> Iterator[str]:
1353 """Return the names of dataset types of task initOutputs, Configs,
1354 and package versions for a pipeline.
1356 Parameters
1357 ----------
1358 pipeline: `Pipeline` or `~collections.abc.Iterable` [ `TaskDef` ]
1359 A `Pipeline` instance or collection of `TaskDef` instances.
1360 include_configs : `bool`, optional
1361 If `True` (default) include config dataset types.
1362 include_packages : `bool`, optional
1363 If `True` (default) include the dataset type for package versions.
1365 Yields
1366 ------
1367 datasetTypeName : `str`
1368 Name of the dataset type.
1369 """
1370 if include_packages:
1371 # Package versions dataset type
1372 yield cls.packagesDatasetName
1374 if isinstance(pipeline, Pipeline):
1375 pipeline = pipeline.toExpandedPipeline()
1377 for taskDef in pipeline:
1378 # all task InitOutputs
1379 for name in taskDef.connections.initOutputs:
1380 attribute = getattr(taskDef.connections, name)
1381 yield attribute.name
1383 # config dataset name
1384 if include_configs:
1385 yield taskDef.configDatasetName
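
# A sketch of listing init-output dataset type names without needing a
# registry; the pipeline file path is hypothetical.

from lsst.pipe.base.pipeline import Pipeline, PipelineDatasetTypes

pipeline = Pipeline.from_uri("pipelines/DRP.yaml")
for name in PipelineDatasetTypes.initOutputNames(pipeline):
    print(name)  # per-task init-outputs, config datasets, and the "packages" dataset
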