# This file is part of pipe_base.
#
# Developed for the LSST Data Management System.
# This product includes software developed by the LSST Project
# (http://www.lsst.org).
# See the COPYRIGHT file at the top-level directory of this distribution
# for details of code ownership.
#
# This program is free software: you can redistribute it and/or modify
# it under the terms of the GNU General Public License as published by
# the Free Software Foundation, either version 3 of the License, or
# (at your option) any later version.
#
# This program is distributed in the hope that it will be useful,
# but WITHOUT ANY WARRANTY; without even the implied warranty of
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
# GNU General Public License for more details.
#
# You should have received a copy of the GNU General Public License
# along with this program.  If not, see <http://www.gnu.org/licenses/>.
from __future__ import annotations

"""Module defining GraphBuilder class and related methods.
"""

__all__ = ['GraphBuilder']

# -------------------------------
# Imports of standard modules --
# -------------------------------
import itertools
from collections import ChainMap
from dataclasses import dataclass
from typing import Set, List, Dict, Optional, Iterable
import logging

# -----------------------------
# Imports for other modules --
# -----------------------------
from .pipeline import PipelineDatasetTypes, TaskDatasetTypes, TaskDef, Pipeline
from .graph import QuantumGraph, QuantumGraphTaskNodes
from lsst.daf.butler import (
    DatasetRef,
    DatasetType,
    DimensionGraph,
    DimensionUniverse,
    ExpandedDataCoordinate,
    Quantum,
)
from lsst.daf.butler.core.utils import NamedKeyDict

# ----------------------------------
# Local non-exported definitions --
# ----------------------------------

_LOG = logging.getLogger(__name__.partition(".")[2])


@dataclass
class _DatasetScaffolding:
    """Helper class aggregating information about a `DatasetType`, used when
    constructing a `QuantumGraph`.

    `_DatasetScaffolding` does not hold the `DatasetType` instance itself
    because it is usually used as the value type in `_DatasetScaffoldingDict`,
    which uses `DatasetType` instances as keys.

    See `_PipelineScaffolding` for a top-down description of the full
    scaffolding data structure.

    Parameters
    ----------
    dimensions : `DimensionGraph`
        Dimensions of the `DatasetType`.
    """
    def __init__(self, dimensions: DimensionGraph):
        self.dimensions = dimensions
        self.producer = None
        self.consumers = {}
        self.dataIds = set()
        self.refs = []

    __slots__ = ("dimensions", "producer", "consumers", "dataIds", "refs")

    def __repr__(self):
        # Default dataclass-injected __repr__ gets caught in an infinite loop
        # because of back-references.
        return f"_DatasetScaffolding(dimensions={self.dimensions}, ...)"

    dimensions: DimensionGraph
    """The dimensions of the dataset type (`DimensionGraph`).

    Set during `_PipelineScaffolding` construction.
    """

    producer: Optional[_TaskScaffolding]
    """The scaffolding object for the Task that produces this dataset.

    Set during `_PipelineScaffolding` construction.
    """

    consumers: Dict[str, _TaskScaffolding]
    """The scaffolding objects for the Tasks that consume this dataset,
    keyed by their label in the `Pipeline`.

    Set during `_PipelineScaffolding` construction.
    """

    dataIds: Set[ExpandedDataCoordinate]
    """Data IDs for all instances of this dataset type in the graph.

    Populated after construction by `_PipelineScaffolding.fillDataIds`.
    """

    refs: List[DatasetRef]
    """References for all instances of this dataset type in the graph.

    Populated after construction by `_PipelineScaffolding.fillDatasetRefs`.
    """


class _DatasetScaffoldingDict(NamedKeyDict):
    """Custom dictionary that maps `DatasetType` to `_DatasetScaffolding`.

    See `_PipelineScaffolding` for a top-down description of the full
    scaffolding data structure.

    Parameters
    ----------
    args
        Positional arguments are forwarded to the `dict` constructor.
    universe : `DimensionUniverse`
        Universe of all possible dimensions.
    """
    def __init__(self, *args, universe: DimensionUniverse):
        super().__init__(*args)
        self.universe = universe

    @classmethod
    def fromDatasetTypes(cls, datasetTypes: Iterable[DatasetType], *,
                         universe: DimensionUniverse) -> _DatasetScaffoldingDict:
        """Construct a dictionary from a flat iterable of `DatasetType` keys.

        Parameters
        ----------
        datasetTypes : `iterable` of `DatasetType`
            DatasetTypes to use as keys for the dict.  Values will be
            constructed from the dimensions of the keys.
        universe : `DimensionUniverse`
            Universe of all possible dimensions.

        Returns
        -------
        dictionary : `_DatasetScaffoldingDict`
            A new dictionary instance.
        """
        return cls(((datasetType, _DatasetScaffolding(datasetType.dimensions))
                    for datasetType in datasetTypes),
                   universe=universe)

    @classmethod
    def fromSubset(cls, datasetTypes: Iterable[DatasetType], first: _DatasetScaffoldingDict,
                   *rest) -> _DatasetScaffoldingDict:
        """Return a new dictionary by extracting items corresponding to the
        given keys from one or more existing dictionaries.

        Parameters
        ----------
        datasetTypes : `iterable` of `DatasetType`
            DatasetTypes to use as keys for the dict.  Values will be obtained
            by lookups against ``first`` and ``rest``.
        first : `_DatasetScaffoldingDict`
            Another dictionary from which to extract values.
        rest
            Additional dictionaries from which to extract values.

        Returns
        -------
        dictionary : `_DatasetScaffoldingDict`
            A new dictionary instance.
        """
        combined = ChainMap(first, *rest)
        return cls(((datasetType, combined[datasetType]) for datasetType in datasetTypes),
                   universe=first.universe)

    @property
    def dimensions(self) -> DimensionGraph:
        """The union of all dimensions used by all dataset types in this
        dictionary, including implied dependencies (`DimensionGraph`).
        """
        base = self.universe.empty
        if len(self) == 0:
            return base
        return base.union(*[scaffolding.dimensions for scaffolding in self.values()])

    def unpackRefs(self) -> NamedKeyDict:
        """Unpack nested single-element `DatasetRef` lists into a new
        dictionary.

        This method assumes that each `_DatasetScaffolding.refs` list contains
        exactly one `DatasetRef`, as is the case for all "init" datasets.

        Returns
        -------
        dictionary : `NamedKeyDict`
            Dictionary mapping `DatasetType` to `DatasetRef`, with both
            `DatasetType` instances and string names usable as keys.
        """
        return NamedKeyDict((datasetType, scaffolding.refs[0]) for datasetType, scaffolding in self.items())
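
    # Illustrative sketch (not part of the original module) of how the two
    # constructors above are meant to work together: ``fromDatasetTypes``
    # builds the pipeline-level dictionary from a flat list of `DatasetType`
    # objects, and ``fromSubset`` then carves out task-level views that share
    # the same `_DatasetScaffolding` values.  ``universe``, ``calexpType``,
    # and ``coaddType`` are hypothetical objects obtained from a real
    # `Registry`.
    #
    #     allOfThem = _DatasetScaffoldingDict.fromDatasetTypes(
    #         [calexpType, coaddType], universe=universe)
    #     justCalexp = _DatasetScaffoldingDict.fromSubset([calexpType], allOfThem)
    #     assert justCalexp[calexpType] is allOfThem[calexpType]  # shared value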


@dataclass
class _TaskScaffolding:
    """Helper class aggregating information about a `PipelineTask`, used when
    constructing a `QuantumGraph`.

    See `_PipelineScaffolding` for a top-down description of the full
    scaffolding data structure.

    Parameters
    ----------
    taskDef : `TaskDef`
        Data structure that identifies the task class and its config.
    parent : `_PipelineScaffolding`
        The parent data structure that will hold the instance being
        constructed.
    datasetTypes : `TaskDatasetTypes`
        Data structure that categorizes the dataset types used by this task.

    Raises
    ------
    GraphBuilderError
        Raised if the task's dimensions are not a subset of the union of the
        pipeline's dataset dimensions.
    """
    def __init__(self, taskDef: TaskDef, parent: _PipelineScaffolding, datasetTypes: TaskDatasetTypes):
        universe = parent.dimensions.universe
        self.taskDef = taskDef
        self.dimensions = DimensionGraph(universe, names=taskDef.connections.dimensions)
        if not self.dimensions.issubset(parent.dimensions):
            raise GraphBuilderError(f"Task with label '{taskDef.label}' has dimensions "
                                    f"{self.dimensions} that are not a subset of "
                                    f"the pipeline dimensions {parent.dimensions}.")
        # Initialize _DatasetScaffoldingDicts as subsets of the one or two
        # corresponding dicts in the parent _PipelineScaffolding.
        self.initInputs = _DatasetScaffoldingDict.fromSubset(datasetTypes.initInputs,
                                                             parent.initInputs, parent.initIntermediates)
        self.initOutputs = _DatasetScaffoldingDict.fromSubset(datasetTypes.initOutputs,
                                                              parent.initIntermediates, parent.initOutputs)
        self.inputs = _DatasetScaffoldingDict.fromSubset(datasetTypes.inputs,
                                                         parent.inputs, parent.intermediates)
        self.outputs = _DatasetScaffoldingDict.fromSubset(datasetTypes.outputs,
                                                          parent.intermediates, parent.outputs)
        self.prerequisites = _DatasetScaffoldingDict.fromSubset(datasetTypes.prerequisites,
                                                                parent.prerequisites)
        # Add backreferences to the _DatasetScaffolding objects that point to
        # this Task.
        for dataset in itertools.chain(self.initInputs.values(), self.inputs.values(),
                                       self.prerequisites.values()):
            dataset.consumers[self.taskDef.label] = self
        for dataset in itertools.chain(self.initOutputs.values(), self.outputs.values()):
            assert dataset.producer is None
            dataset.producer = self
        self.dataIds = set()
        self.quanta = []

    def __repr__(self):
        # Default dataclass-injected __repr__ gets caught in an infinite loop
        # because of back-references.
        return f"_TaskScaffolding(taskDef={self.taskDef}, ...)"

    taskDef: TaskDef
    """Data structure that identifies the task class and its config
    (`TaskDef`).
    """

    dimensions: DimensionGraph
    """The dimensions of a single `Quantum` of this task (`DimensionGraph`).
    """

    initInputs: _DatasetScaffoldingDict
    """Dictionary containing information about datasets used to construct this
    task (`_DatasetScaffoldingDict`).
    """

    initOutputs: _DatasetScaffoldingDict
    """Dictionary containing information about datasets produced as a
    side-effect of constructing this task (`_DatasetScaffoldingDict`).
    """

    inputs: _DatasetScaffoldingDict
    """Dictionary containing information about datasets used as regular,
    graph-constraining inputs to this task (`_DatasetScaffoldingDict`).
    """

    outputs: _DatasetScaffoldingDict
    """Dictionary containing information about datasets produced by this task
    (`_DatasetScaffoldingDict`).
    """

    prerequisites: _DatasetScaffoldingDict
    """Dictionary containing information about input datasets that must be
    present in the repository before any Pipeline containing this task is run
    (`_DatasetScaffoldingDict`).
    """

    dataIds: Set[ExpandedDataCoordinate]
    """Data IDs for all quanta for this task in the graph (`set` of
    `ExpandedDataCoordinate`).

    Populated after construction by `_PipelineScaffolding.fillDataIds`.
    """

    quanta: List[Quantum]
    """All quanta for this task in the graph (`list` of `Quantum`).

    Populated after construction by `_PipelineScaffolding.fillQuanta`.
    """

    def addQuantum(self, quantum: Quantum):
        config = self.taskDef.config
        connectionClass = config.connections.ConnectionsClass
        connectionInstance = connectionClass(config=config)
        # This will raise if one of the check conditions is not met, which is
        # the intended behavior.
        result = connectionInstance.adjustQuantum(quantum.predictedInputs)
        quantum._predictedInputs = NamedKeyDict(result)

        # If this function has reached this far, add the quantum.
        self.quanta.append(quantum)

    def makeQuantumGraphTaskNodes(self) -> QuantumGraphTaskNodes:
        """Create a `QuantumGraphTaskNodes` instance from the information in
        ``self``.

        Returns
        -------
        nodes : `QuantumGraphTaskNodes`
            The `QuantumGraph` elements corresponding to this task.
        """
        return QuantumGraphTaskNodes(
            taskDef=self.taskDef,
            quanta=self.quanta,
            initInputs=self.initInputs.unpackRefs(),
            initOutputs=self.initOutputs.unpackRefs(),
        )


@dataclass
class _PipelineScaffolding:
    """A helper data structure that organizes the information involved in
    constructing a `QuantumGraph` for a `Pipeline`.

    Parameters
    ----------
    pipeline : `Pipeline`
        Sequence of tasks from which a graph is to be constructed.  Must
        have nested task classes already imported.
    registry : `lsst.daf.butler.Registry`
        Registry for the data repository; used to look up dataset types and
        dimensions.

    Raises
    ------
    GraphBuilderError
        Raised if a task's dimensions are not a subset of the union of the
        pipeline's dataset dimensions.

    Notes
    -----
    The scaffolding data structure contains nested data structures for both
    tasks (`_TaskScaffolding`) and datasets (`_DatasetScaffolding`), with the
    latter held by `_DatasetScaffoldingDict`.  The dataset data structures are
    shared between the pipeline-level structure (which aggregates all datasets
    and categorizes them from the perspective of the complete pipeline) and
    the individual tasks that use them as inputs and outputs.

    `QuantumGraph` construction proceeds in five steps, with each
    corresponding to a different `_PipelineScaffolding` method:

    1. When `_PipelineScaffolding` is constructed, we extract and categorize
       the DatasetTypes used by the pipeline (delegating to
       `PipelineDatasetTypes.fromPipeline`), then use these to construct the
       nested `_TaskScaffolding` and `_DatasetScaffolding` objects.

    2. In `fillDataIds`, we construct and run the "Big Join Query", which
       returns related tuples of all dimensions used to identify any regular
       input, output, and intermediate datasets (not prerequisites).  We then
       iterate over these tuples of related dimensions, identifying the
       subsets that correspond to distinct data IDs for each task and dataset
       type.

    3. In `fillDatasetRefs`, we run follow-up queries against all of the
       dataset data IDs previously identified, populating the
       `_DatasetScaffolding.refs` lists - except for those for prerequisite
       datasets, which cannot be resolved until distinct quanta are
       identified.

    4. In `fillQuanta`, we extract subsets from the lists of `DatasetRef` into
       the inputs and outputs for each `Quantum` and search for prerequisite
       datasets, populating `_TaskScaffolding.quanta`.

    5. In `makeQuantumGraph`, we construct a `QuantumGraph` from the lists of
       per-task quanta identified in the previous step.
    """
    def __init__(self, pipeline, *, registry):
        self.tasks = []
        # Aggregate and categorize the DatasetTypes in the Pipeline.
        datasetTypes = PipelineDatasetTypes.fromPipeline(pipeline, registry=registry)
        # Construct dictionaries that map those DatasetTypes to structures
        # that will (later) hold additional information about them.
        for attr in ("initInputs", "initIntermediates", "initOutputs",
                     "inputs", "intermediates", "outputs", "prerequisites"):
            setattr(self, attr, _DatasetScaffoldingDict.fromDatasetTypes(getattr(datasetTypes, attr),
                                                                         universe=registry.dimensions))
        # Aggregate all dimensions for all non-init, non-prerequisite
        # DatasetTypes.  These are the ones we'll include in the big join
        # query.
        self.dimensions = self.inputs.dimensions.union(self.intermediates.dimensions,
                                                       self.outputs.dimensions)
        # Construct scaffolding nodes for each Task, and add backreferences
        # to the Task from each DatasetScaffolding node.
        # Note that there's only one scaffolding node for each DatasetType,
        # shared by _PipelineScaffolding and all _TaskScaffoldings that
        # reference it.
        if isinstance(pipeline, Pipeline):
            pipeline = pipeline.toExpandedPipeline()
        self.tasks = [_TaskScaffolding(taskDef=taskDef, parent=self, datasetTypes=taskDatasetTypes)
                      for taskDef, taskDatasetTypes in zip(pipeline,
                                                           datasetTypes.byTask.values())]

    def __repr__(self):
        # Default dataclass-injected __repr__ gets caught in an infinite loop
        # because of back-references.
        return f"_PipelineScaffolding(tasks={self.tasks}, ...)"

    tasks: List[_TaskScaffolding]
    """Scaffolding data structures for each task in the pipeline
    (`list` of `_TaskScaffolding`).
    """

    initInputs: _DatasetScaffoldingDict
    """Datasets consumed but not produced when constructing the tasks in this
    pipeline (`_DatasetScaffoldingDict`).
    """

    initIntermediates: _DatasetScaffoldingDict
    """Datasets that are both consumed and produced when constructing the
    tasks in this pipeline (`_DatasetScaffoldingDict`).
    """

    initOutputs: _DatasetScaffoldingDict
    """Datasets produced but not consumed when constructing the tasks in this
    pipeline (`_DatasetScaffoldingDict`).
    """

    inputs: _DatasetScaffoldingDict
    """Datasets that are consumed but not produced when running this pipeline
    (`_DatasetScaffoldingDict`).
    """

    intermediates: _DatasetScaffoldingDict
    """Datasets that are both produced and consumed when running this pipeline
    (`_DatasetScaffoldingDict`).
    """

    outputs: _DatasetScaffoldingDict
    """Datasets produced but not consumed when running this pipeline
    (`_DatasetScaffoldingDict`).
    """

    prerequisites: _DatasetScaffoldingDict
    """Datasets that are consumed when running this pipeline and looked up
    per-Quantum when generating the graph (`_DatasetScaffoldingDict`).
    """

    dimensions: DimensionGraph
    """All dimensions used by any regular input, intermediate, or output
    (not prerequisite) dataset; the set of dimensions used in the "Big Join
    Query" (`DimensionGraph`).

    This is required to be a superset of all task quantum dimensions.
    """

    def fillDataIds(self, registry, collections, userQuery):
        """Query for the data IDs that connect nodes in the `QuantumGraph`.

        This method populates `_TaskScaffolding.dataIds` and
        `_DatasetScaffolding.dataIds` (except for those in `prerequisites`).

        Parameters
        ----------
        registry : `lsst.daf.butler.Registry`
            Registry for the data repository; used for all data ID queries.
        collections : `lsst.daf.butler.CollectionSearch`
            Object representing the collections to search for input datasets.
        userQuery : `str`, optional
            User-provided expression to limit the data IDs processed.
        """
        # Initialization datasets always have empty data IDs.
        emptyDataId = ExpandedDataCoordinate(registry.dimensions.empty, (), records={})
        for scaffolding in itertools.chain(self.initInputs.values(),
                                           self.initIntermediates.values(),
                                           self.initOutputs.values()):
            scaffolding.dataIds.add(emptyDataId)
        # Run one big query for the data IDs for task dimensions and regular
        # inputs and outputs.  We limit the query to only dimensions that are
        # associated with the input dataset types, but don't (yet) try to
        # obtain the dataset_ids for those inputs.
        resultIter = registry.queryDimensions(
            self.dimensions,
            datasets=list(self.inputs),
            collections=collections,
            where=userQuery,
        )
        # Iterate over query results and populate _TaskScaffolding.dataIds and
        # _DatasetScaffolding.dataIds, extracting the subsets of the common
        # data ID from the query corresponding to the dimensions of each.  By
        # using sets, we remove duplicates caused by query rows in which the
        # dimensions that change are not relevant for that task or dataset
        # type.  For example, if the Big Join Query involves the dimensions
        # (instrument, visit, detector, skymap, tract, patch), we extract
        # "calexp" data IDs from the instrument, visit, and detector values
        # only, and rely on `set.add` to avoid duplications due to result rows
        # in which only skymap, tract, and patch are varying.  The Big Join
        # Query is defined such that only visit+detector and tract+patch
        # combinations that represent spatial overlaps are included in the
        # results.
        for commonDataId in resultIter:
            for taskScaffolding in self.tasks:
                taskScaffolding.dataIds.add(commonDataId.subset(taskScaffolding.dimensions))
            for datasetType, scaffolding in itertools.chain(self.inputs.items(),
                                                            self.intermediates.items(),
                                                            self.outputs.items()):
                scaffolding.dataIds.add(commonDataId.subset(scaffolding.dimensions))

    def fillDatasetRefs(self, registry, collections, run, *, skipExisting=True):
        """Perform follow-up queries for each dataset data ID produced in
        `fillDataIds`.

        This method populates `_DatasetScaffolding.refs` (except for those in
        `prerequisites`).

        Parameters
        ----------
        registry : `lsst.daf.butler.Registry`
            Registry for the data repository; used for all data ID queries.
        collections : `lsst.daf.butler.CollectionSearch`
            Object representing the collections to search for input datasets.
        run : `str`, optional
            Name of the `~lsst.daf.butler.CollectionType.RUN` collection for
            output datasets, if it already exists.
        skipExisting : `bool`, optional
            If `True` (default), a Quantum is not created if all its outputs
            already exist in ``run``.  Ignored if ``run`` is `None`.

        Raises
        ------
        OutputExistsError
            Raised if an output dataset already exists in the output run
            and ``skipExisting`` is `False`.  The case where some but not all
            of a quantum's outputs are present and ``skipExisting`` is `True`
            cannot be identified at this stage, and is handled by `fillQuanta`
            instead.
        """
        # Look up input and initInput datasets in the input collection(s).
        for datasetType, scaffolding in itertools.chain(self.initInputs.items(), self.inputs.items()):
            for dataId in scaffolding.dataIds:
                refs = list(
                    registry.queryDatasets(
                        datasetType,
                        collections=collections,
                        dataId=dataId,
                        deduplicate=True,
                        expand=True,
                    )
                )
                if len(refs) != 1:
                    raise RuntimeError(f"Expected exactly one instance of input {datasetType} "
                                       f"for data ID {dataId}; got {refs}.")
                scaffolding.refs.extend(refs)
        # Look up [init] intermediate and output datasets in the output run,
        # if there is one; if ``skipExisting`` is `False`, finding an existing
        # output there is an error.
        for datasetType, scaffolding in itertools.chain(self.initIntermediates.items(),
                                                        self.initOutputs.items(),
                                                        self.intermediates.items(),
                                                        self.outputs.items()):
            for dataId in scaffolding.dataIds:
                # TODO: we could easily support per-DatasetType skipExisting
                # (it might make sense to put them in originInfo), and I could
                # imagine that being useful - it's probably required in order
                # to support writing initOutputs before QuantumGraph
                # generation.
                if run is not None:
                    ref = registry.findDataset(datasetType=datasetType, dataId=dataId, collections=run)
                else:
                    ref = None
                if ref is None:
                    ref = DatasetRef(datasetType, dataId)
                elif not skipExisting:
                    raise OutputExistsError(f"Output dataset {datasetType.name} already exists in "
                                            f"output RUN collection '{run}' with data ID {dataId}.")
                scaffolding.refs.append(ref)
        # Prerequisite dataset lookups are deferred until fillQuanta.

    def fillQuanta(self, registry, collections, *, skipExisting=True):
        """Define quanta for each task by splitting up the datasets associated
        with each task data ID.

        This method populates `_TaskScaffolding.quanta`.

        Parameters
        ----------
        registry : `lsst.daf.butler.Registry`
            Registry for the data repository; used for all data ID queries.
        collections : `lsst.daf.butler.CollectionSearch`
            Object representing the collections to search for input datasets.
        skipExisting : `bool`, optional
            If `True` (default), a Quantum is not created if all its outputs
            already exist.
        """
        for task in self.tasks:
            for quantumDataId in task.dataIds:
                # Identify the (regular) inputs that correspond to the Quantum
                # with this data ID.  These are those whose data IDs have the
                # same values for all dimensions they have in common.
                # We do this with data IDs expanded to include implied
                # dimensions, which is why _DatasetScaffolding.dimensions is
                # expanded even though DatasetType.dimensions is not.
                inputs = NamedKeyDict()
                for datasetType, scaffolding in task.inputs.items():
                    inputs[datasetType] = [ref for ref, dataId in zip(scaffolding.refs, scaffolding.dataIds)
                                           if registry.relateDataIds(quantumDataId, dataId)]

                _LOG.debug("%s dataId %s has inputs: %s",
                           task.taskDef.taskName, quantumDataId, list(inputs.names))

                # Same for outputs.
                outputs = NamedKeyDict()
                allOutputsPresent = True
                for datasetType, scaffolding in task.outputs.items():
                    outputs[datasetType] = []
                    for ref, dataId in zip(scaffolding.refs, scaffolding.dataIds):
                        if registry.relateDataIds(quantumDataId, dataId):
                            if ref.id is None:
                                allOutputsPresent = False
                            else:
                                assert skipExisting, "Existing outputs should have already been identified."
                                if not allOutputsPresent:
                                    raise OutputExistsError(f"Output {datasetType.name} with data ID "
                                                            f"{dataId} already exists, but other outputs "
                                                            f"for task with label {task.taskDef.label} "
                                                            f"and data ID {quantumDataId} do not.")
                            outputs[datasetType].append(ref)
                if allOutputsPresent and skipExisting:
                    continue

                _LOG.debug("%s dataId %s has outputs: %s",
                           task.taskDef.taskName, quantumDataId, list(outputs.names))

                # Look up prerequisite datasets in the input collection(s).
                # These may have dimensions that extend beyond those we
                # queried for originally, because we want to permit those data
                # ID values to differ across quanta and dataset types.  For
                # example, the same quantum may have a flat and bias with a
                # different calibration_label, or a refcat with a skypix value
                # that overlaps the quantum's data ID's region, but not the
                # user expression used for the initial query.
                connections = task.taskDef.connections
                for con_name in connections.prerequisiteInputs:
                    con = getattr(connections, con_name)
                    # Find the prerequisite DatasetType registered under the
                    # same name as this connection.
                    for datasetType in task.prerequisites:
                        if datasetType.name == con.name:
                            break
                    if con.lookupFunction is not None:
                        refs = list(con.lookupFunction(datasetType, registry,
                                                       quantumDataId, collections))
                    else:
                        refs = list(
                            registry.queryDatasets(
                                datasetType,
                                collections=collections,
                                dataId=quantumDataId,
                                deduplicate=True,
                                expand=True,
                            )
                        )
                    inputs[datasetType] = refs

                _LOG.debug("%s dataId %s has inputs+prereqs: %s",
                           task.taskDef.taskName, quantumDataId, list(inputs.names))

                task.addQuantum(
                    Quantum(
                        taskName=task.taskDef.taskName,
                        taskClass=task.taskDef.taskClass,
                        dataId=quantumDataId,
                        initInputs=task.initInputs.unpackRefs(),
                        predictedInputs=inputs,
                        outputs=outputs,
                    )
                )

    def makeQuantumGraph(self):
        """Create a `QuantumGraph` from the quanta already present in
        the scaffolding data structure.
        """
        graph = QuantumGraph(task.makeQuantumGraphTaskNodes() for task in self.tasks)
        graph.initInputs = self.initInputs.unpackRefs()
        graph.initOutputs = self.initOutputs.unpackRefs()
        graph.initIntermediates = self.initIntermediates.unpackRefs()
        return graph


# ------------------------
# Exported definitions --
# ------------------------


class GraphBuilderError(Exception):
    """Base class for exceptions generated by graph builder.
    """
    pass


class OutputExistsError(GraphBuilderError):
    """Exception generated when output datasets already exist.
    """
    pass


class PrerequisiteMissingError(GraphBuilderError):
    """Exception generated when a prerequisite dataset does not exist.
    """
    pass


class GraphBuilder(object):
    """GraphBuilder class is responsible for building a task execution graph
    from a Pipeline.

    Parameters
    ----------
    registry : `~lsst.daf.butler.Registry`
        Data butler registry instance.
    skipExisting : `bool`, optional
        If `True` (default), a Quantum is not created if all its outputs
        already exist.
    """

    def __init__(self, registry, skipExisting=True):
        self.registry = registry
        self.dimensions = registry.dimensions
        self.skipExisting = skipExisting

    def makeGraph(self, pipeline, collections, run, userQuery):
        """Create execution graph for a pipeline.

        Parameters
        ----------
        pipeline : `Pipeline`
            Pipeline definition, task names/classes and their configs.
        collections : `lsst.daf.butler.CollectionSearch`
            Object representing the collections to search for input datasets.
        run : `str`, optional
            Name of the `~lsst.daf.butler.CollectionType.RUN` collection for
            output datasets, if it already exists.
        userQuery : `str`
            String that defines the user-provided selection expression for the
            registry; should be empty or `None` if there are no restrictions
            on data selection.

        Returns
        -------
        graph : `QuantumGraph`
            The constructed quantum graph.

        Raises
        ------
        UserExpressionError
            Raised when user expression cannot be parsed.
        OutputExistsError
            Raised when output datasets already exist.
        Exception
            Other exception types may be raised by underlying registry
            classes.
        """
        scaffolding = _PipelineScaffolding(pipeline, registry=self.registry)
        scaffolding.fillDataIds(self.registry, collections, userQuery)
        scaffolding.fillDatasetRefs(self.registry, collections, run, skipExisting=self.skipExisting)
        scaffolding.fillQuanta(self.registry, collections, skipExisting=self.skipExisting)
        return scaffolding.makeQuantumGraph()