Coverage for python/lsst/pipe/base/graphBuilder.py : 27%

# This file is part of pipe_base.
#
# Developed for the LSST Data Management System.
# This product includes software developed by the LSST Project
# (http://www.lsst.org).
# See the COPYRIGHT file at the top-level directory of this distribution
# for details of code ownership.
#
# This program is free software: you can redistribute it and/or modify
# it under the terms of the GNU General Public License as published by
# the Free Software Foundation, either version 3 of the License, or
# (at your option) any later version.
#
# This program is distributed in the hope that it will be useful,
# but WITHOUT ANY WARRANTY; without even the implied warranty of
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
# GNU General Public License for more details.
#
# You should have received a copy of the GNU General Public License
# along with this program. If not, see <http://www.gnu.org/licenses/>.
from __future__ import annotations

"""Module defining GraphBuilder class and related methods.
"""

__all__ = ['GraphBuilder']

# -------------------------------
# Imports of standard modules --
# -------------------------------
import itertools
from collections import ChainMap
from dataclasses import dataclass
from typing import Set, List, Dict, Optional, Iterable
import logging

# -----------------------------
# Imports for other modules --
# -----------------------------
from .pipeline import PipelineDatasetTypes, TaskDatasetTypes, TaskDef, Pipeline
from .graph import QuantumGraph, QuantumGraphTaskNodes
from lsst.daf.butler import (
    DatasetRef,
    DatasetType,
    DimensionGraph,
    DimensionUniverse,
    ExpandedDataCoordinate,
    Quantum,
)
from lsst.daf.butler.core.utils import NamedKeyDict

# ----------------------------------
# Local non-exported definitions --
# ----------------------------------

_LOG = logging.getLogger(__name__.partition(".")[2])


@dataclass
class _DatasetScaffolding:
    """Helper class aggregating information about a `DatasetType`, used when
    constructing a `QuantumGraph`.

    `_DatasetScaffolding` does not hold the `DatasetType` instance itself
    because it is usually used as the value type in `_DatasetScaffoldingDict`,
    which uses `DatasetType` instances as keys.

    See `_PipelineScaffolding` for a top-down description of the full
    scaffolding data structure.

    Parameters
    ----------
    dimensions : `DimensionGraph`
        Dimensions of the `DatasetType`.
    """
    def __init__(self, dimensions: DimensionGraph):
        self.dimensions = dimensions
        self.producer = None
        self.consumers = {}
        self.dataIds = set()
        self.refs = []

    __slots__ = ("dimensions", "producer", "consumers", "dataIds", "refs")

    dimensions: DimensionGraph
    """The dimensions of the dataset type (`DimensionGraph`).

    Set during `_PipelineScaffolding` construction.
    """

    producer: Optional[_TaskScaffolding]
    """The scaffolding object for the Task that produces this dataset.

    Set during `_PipelineScaffolding` construction.
    """

    consumers: Dict[str, _TaskScaffolding]
    """The scaffolding objects for the Tasks that consume this dataset,
    keyed by their label in the `Pipeline`.

    Set during `_PipelineScaffolding` construction.
    """

    dataIds: Set[ExpandedDataCoordinate]
    """Data IDs for all instances of this dataset type in the graph.

    Populated after construction by `_PipelineScaffolding.fillDataIds`.
    """

    refs: List[DatasetRef]
    """References for all instances of this dataset type in the graph.

    Populated after construction by `_PipelineScaffolding.fillDatasetRefs`.
    """


class _DatasetScaffoldingDict(NamedKeyDict):
    """Custom dictionary that maps `DatasetType` to `_DatasetScaffolding`.

    See `_PipelineScaffolding` for a top-down description of the full
    scaffolding data structure.

    Parameters
    ----------
    args
        Positional arguments are forwarded to the `dict` constructor.
    universe : `DimensionUniverse`
        Universe of all possible dimensions.
    """
    def __init__(self, *args, universe: DimensionUniverse):
        super().__init__(*args)
        self.universe = universe

    @classmethod
    def fromDatasetTypes(cls, datasetTypes: Iterable[DatasetType], *,
                         universe: DimensionUniverse) -> _DatasetScaffoldingDict:
        """Construct a dictionary from a flat iterable of `DatasetType` keys.

        Parameters
        ----------
        datasetTypes : `iterable` of `DatasetType`
            DatasetTypes to use as keys for the dict. Values will be
            constructed from the dimensions of the keys.
        universe : `DimensionUniverse`
            Universe of all possible dimensions.

        Returns
        -------
        dictionary : `_DatasetScaffoldingDict`
            A new dictionary instance.
        """
        return cls(((datasetType, _DatasetScaffolding(datasetType.dimensions))
                    for datasetType in datasetTypes),
                   universe=universe)

    @classmethod
    def fromSubset(cls, datasetTypes: Iterable[DatasetType], first: _DatasetScaffoldingDict,
                   *rest) -> _DatasetScaffoldingDict:
        """Return a new dictionary by extracting items corresponding to the
        given keys from one or more existing dictionaries.

        Parameters
        ----------
        datasetTypes : `iterable` of `DatasetType`
            DatasetTypes to use as keys for the dict. Values will be obtained
            by lookups against ``first`` and ``rest``.
        first : `_DatasetScaffoldingDict`
            Another dictionary from which to extract values.
        rest
            Additional dictionaries from which to extract values.

        Returns
        -------
        dictionary : `_DatasetScaffoldingDict`
            A new dictionary instance.
        """
        combined = ChainMap(first, *rest)
        return cls(((datasetType, combined[datasetType]) for datasetType in datasetTypes),
                   universe=first.universe)

    @property
    def dimensions(self) -> DimensionGraph:
        """The union of all dimensions used by all dataset types in this
        dictionary, including implied dependencies (`DimensionGraph`).
        """
        base = self.universe.empty
        if len(self) == 0:
            return base
        return base.union(*[scaffolding.dimensions for scaffolding in self.values()])

    def unpackRefs(self) -> NamedKeyDict:
        """Unpack nested single-element `DatasetRef` lists into a new
        dictionary.

        This method assumes that each `_DatasetScaffolding.refs` list contains
        exactly one `DatasetRef`, as is the case for all "init" datasets.

        Returns
        -------
        dictionary : `NamedKeyDict`
            Dictionary mapping `DatasetType` to `DatasetRef`, with both
            `DatasetType` instances and string names usable as keys.
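
        Examples
        --------
        Schematically (a minimal sketch, not an executable doctest), the
        unpacking for each "init" dataset type collapses the single-element
        ``refs`` list to its only element::

            {datasetType: [ref]}  ->  {datasetType: ref}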
        """
        return NamedKeyDict((datasetType, scaffolding.refs[0]) for datasetType, scaffolding in self.items())


@dataclass
class _TaskScaffolding:
    """Helper class aggregating information about a `PipelineTask`, used when
    constructing a `QuantumGraph`.

    See `_PipelineScaffolding` for a top-down description of the full
    scaffolding data structure.

    Parameters
    ----------
    taskDef : `TaskDef`
        Data structure that identifies the task class and its config.
    parent : `_PipelineScaffolding`
        The parent data structure that will hold the instance being
        constructed.
    datasetTypes : `TaskDatasetTypes`
        Data structure that categorizes the dataset types used by this task.

    Raises
    ------
    GraphBuilderError
        Raised if the task's dimensions are not a subset of the union of the
        pipeline's dataset dimensions.
    """
    def __init__(self, taskDef: TaskDef, parent: _PipelineScaffolding, datasetTypes: TaskDatasetTypes):
        universe = parent.dimensions.universe
        self.taskDef = taskDef
        self.dimensions = DimensionGraph(universe, names=taskDef.connections.dimensions)
        if not self.dimensions.issubset(parent.dimensions):
            raise GraphBuilderError(f"Task with label '{taskDef.label}' has dimensions "
                                    f"{self.dimensions} that are not a subset of "
                                    f"the pipeline dimensions {parent.dimensions}.")
        # Initialize _DatasetScaffoldingDicts as subsets of the one or two
        # corresponding dicts in the parent _PipelineScaffolding.
        self.initInputs = _DatasetScaffoldingDict.fromSubset(datasetTypes.initInputs,
                                                             parent.initInputs, parent.initIntermediates)
        self.initOutputs = _DatasetScaffoldingDict.fromSubset(datasetTypes.initOutputs,
                                                              parent.initIntermediates, parent.initOutputs)
        self.inputs = _DatasetScaffoldingDict.fromSubset(datasetTypes.inputs,
                                                         parent.inputs, parent.intermediates)
        self.outputs = _DatasetScaffoldingDict.fromSubset(datasetTypes.outputs,
                                                          parent.intermediates, parent.outputs)
        self.prerequisites = _DatasetScaffoldingDict.fromSubset(datasetTypes.prerequisites,
                                                                parent.prerequisites)
        # Add backreferences to the _DatasetScaffolding objects that point to
        # this Task.
        for dataset in itertools.chain(self.initInputs.values(), self.inputs.values(),
                                       self.prerequisites.values()):
            dataset.consumers[self.taskDef.label] = self
        for dataset in itertools.chain(self.initOutputs.values(), self.outputs.values()):
            assert dataset.producer is None
            dataset.producer = self
        self.dataIds = set()
        self.quanta = []

    taskDef: TaskDef
    """Data structure that identifies the task class and its config
    (`TaskDef`).
    """

    dimensions: DimensionGraph
    """The dimensions of a single `Quantum` of this task (`DimensionGraph`).
    """

    initInputs: _DatasetScaffoldingDict
    """Dictionary containing information about datasets used to construct this
    task (`_DatasetScaffoldingDict`).
    """

    initOutputs: _DatasetScaffoldingDict
    """Dictionary containing information about datasets produced as a
    side-effect of constructing this task (`_DatasetScaffoldingDict`).
    """

    inputs: _DatasetScaffoldingDict
    """Dictionary containing information about datasets used as regular,
    graph-constraining inputs to this task (`_DatasetScaffoldingDict`).
    """

    outputs: _DatasetScaffoldingDict
    """Dictionary containing information about datasets produced by this task
    (`_DatasetScaffoldingDict`).
    """

    prerequisites: _DatasetScaffoldingDict
    """Dictionary containing information about input datasets that must be
    present in the repository before any Pipeline containing this task is run
    (`_DatasetScaffoldingDict`).
    """

    dataIds: Set[ExpandedDataCoordinate]
    """Data IDs for all quanta for this task in the graph (`set` of
    `ExpandedDataCoordinate`).

    Populated after construction by `_PipelineScaffolding.fillDataIds`.
    """

    quanta: List[Quantum]
    """All quanta for this task in the graph (`list` of `Quantum`).

    Populated after construction by `_PipelineScaffolding.fillQuanta`.
    """

    def addQuantum(self, quantum: Quantum):
        config = self.taskDef.config
        connectionClass = config.connections.ConnectionsClass
        connectionInstance = connectionClass(config=config)
        # adjustQuantum will raise if one of its check conditions is not met,
        # which is the intended behavior.
        result = connectionInstance.adjustQuantum(quantum.predictedInputs)
        quantum._predictedInputs = NamedKeyDict(result)

        # If we have reached this point, the adjusted quantum is valid; add it.
        self.quanta.append(quantum)

    def makeQuantumGraphTaskNodes(self) -> QuantumGraphTaskNodes:
        """Create a `QuantumGraphTaskNodes` instance from the information in
        ``self``.

        Returns
        -------
        nodes : `QuantumGraphTaskNodes`
            The `QuantumGraph` elements corresponding to this task.
        """
        return QuantumGraphTaskNodes(
            taskDef=self.taskDef,
            quanta=self.quanta,
            initInputs=self.initInputs.unpackRefs(),
            initOutputs=self.initOutputs.unpackRefs(),
        )


@dataclass
class _PipelineScaffolding:
    """A helper data structure that organizes the information involved in
    constructing a `QuantumGraph` for a `Pipeline`.

    Parameters
    ----------
    pipeline : `Pipeline`
        Sequence of tasks from which a graph is to be constructed. Must
        have nested task classes already imported.
    registry : `lsst.daf.butler.Registry`
        Registry for the data repository; used to look up dataset types
        and dimensions.

    Raises
    ------
    GraphBuilderError
        Raised if a task's dimensions are not a subset of the union of the
        pipeline's dataset dimensions.

    Notes
    -----
    The scaffolding data structure contains nested data structures for both
    tasks (`_TaskScaffolding`) and datasets (`_DatasetScaffolding`), with the
    latter held by `_DatasetScaffoldingDict`. The dataset data structures are
    shared between the pipeline-level structure (which aggregates all datasets
    and categorizes them from the perspective of the complete pipeline) and the
    individual tasks that use them as inputs and outputs.

    `QuantumGraph` construction proceeds in five steps, with each corresponding
    to a different `_PipelineScaffolding` method:

    1. When `_PipelineScaffolding` is constructed, we extract and categorize
       the DatasetTypes used by the pipeline (delegating to
       `PipelineDatasetTypes.fromPipeline`), then use these to construct the
       nested `_TaskScaffolding` and `_DatasetScaffolding` objects.

    2. In `fillDataIds`, we construct and run the "Big Join Query", which
       returns related tuples of all dimensions used to identify any regular
       input, output, and intermediate datasets (not prerequisites). We then
       iterate over these tuples of related dimensions, identifying the subsets
       that correspond to distinct data IDs for each task and dataset type.

    3. In `fillDatasetRefs`, we run follow-up queries against all of the
       dataset data IDs previously identified, populating the
       `_DatasetScaffolding.refs` lists - except for those for prerequisite
       datasets, which cannot be resolved until distinct quanta are
       identified.

    4. In `fillQuanta`, we extract subsets from the lists of `DatasetRef` into
       the inputs and outputs for each `Quantum` and search for prerequisite
       datasets, populating `_TaskScaffolding.quanta`.

    5. In `makeQuantumGraph`, we construct a `QuantumGraph` from the lists of
       per-task quanta identified in the previous step.
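
    As a minimal sketch of how these steps are driven (this mirrors
    `GraphBuilder.makeGraph`; the ``registry``, ``collections``, ``run``, and
    ``userQuery`` values are assumed to be supplied by the caller)::

        scaffolding = _PipelineScaffolding(pipeline, registry=registry)
        scaffolding.fillDataIds(registry, collections, userQuery)
        scaffolding.fillDatasetRefs(registry, collections, run, skipExisting=True)
        scaffolding.fillQuanta(registry, collections, skipExisting=True)
        graph = scaffolding.makeQuantumGraph()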
    """
    def __init__(self, pipeline, *, registry):
        self.tasks = []
        # Aggregate and categorize the DatasetTypes in the Pipeline.
        datasetTypes = PipelineDatasetTypes.fromPipeline(pipeline, registry=registry)
        # Construct dictionaries that map those DatasetTypes to structures
        # that will (later) hold additional information about them.
        for attr in ("initInputs", "initIntermediates", "initOutputs",
                     "inputs", "intermediates", "outputs", "prerequisites"):
            setattr(self, attr, _DatasetScaffoldingDict.fromDatasetTypes(getattr(datasetTypes, attr),
                                                                         universe=registry.dimensions))
        # Aggregate all dimensions for all non-init, non-prerequisite
        # DatasetTypes. These are the ones we'll include in the big join query.
        self.dimensions = self.inputs.dimensions.union(self.intermediates.dimensions,
                                                       self.outputs.dimensions)
        # Construct scaffolding nodes for each Task, and add backreferences
        # to the Task from each DatasetScaffolding node.
        # Note that there's only one scaffolding node for each DatasetType,
        # shared by _PipelineScaffolding and all _TaskScaffoldings that
        # reference it.
        if isinstance(pipeline, Pipeline):
            pipeline = pipeline.toExpandedPipeline()
        self.tasks = [_TaskScaffolding(taskDef=taskDef, parent=self, datasetTypes=taskDatasetTypes)
                      for taskDef, taskDatasetTypes in zip(pipeline,
                                                           datasetTypes.byTask.values())]

    tasks: List[_TaskScaffolding]
    """Scaffolding data structures for each task in the pipeline
    (`list` of `_TaskScaffolding`).
    """

    initInputs: _DatasetScaffoldingDict
    """Datasets consumed but not produced when constructing the tasks in this
    pipeline (`_DatasetScaffoldingDict`).
    """

    initIntermediates: _DatasetScaffoldingDict
    """Datasets that are both consumed and produced when constructing the tasks
    in this pipeline (`_DatasetScaffoldingDict`).
    """

    initOutputs: _DatasetScaffoldingDict
    """Datasets produced but not consumed when constructing the tasks in this
    pipeline (`_DatasetScaffoldingDict`).
    """

    inputs: _DatasetScaffoldingDict
    """Datasets that are consumed but not produced when running this pipeline
    (`_DatasetScaffoldingDict`).
    """

    intermediates: _DatasetScaffoldingDict
    """Datasets that are both produced and consumed when running this pipeline
    (`_DatasetScaffoldingDict`).
    """

    outputs: _DatasetScaffoldingDict
    """Datasets produced but not consumed when running this pipeline
    (`_DatasetScaffoldingDict`).
    """

    prerequisites: _DatasetScaffoldingDict
    """Datasets that are consumed when running this pipeline and looked up
    per-Quantum when generating the graph (`_DatasetScaffoldingDict`).
    """

    dimensions: DimensionGraph
    """All dimensions used by any regular input, intermediate, or output
    (not prerequisite) dataset; the set of dimensions used in the "Big Join
    Query" (`DimensionGraph`).

    This is required to be a superset of all task quantum dimensions.
    """

    def fillDataIds(self, registry, collections, userQuery):
        """Query for the data IDs that connect nodes in the `QuantumGraph`.

        This method populates `_TaskScaffolding.dataIds` and
        `_DatasetScaffolding.dataIds` (except for those in `prerequisites`).

        Parameters
        ----------
        registry : `lsst.daf.butler.Registry`
            Registry for the data repository; used for all data ID queries.
        collections : `lsst.daf.butler.CollectionSearch`
            Object representing the collections to search for input datasets.
        userQuery : `str`, optional
            User-provided expression to limit the data IDs processed.
        """
        # Initialization datasets always have empty data IDs.
        emptyDataId = ExpandedDataCoordinate(registry.dimensions.empty, (), records={})
        for scaffolding in itertools.chain(self.initInputs.values(),
                                           self.initIntermediates.values(),
                                           self.initOutputs.values()):
            scaffolding.dataIds.add(emptyDataId)
        # Run one big query for the data IDs for task dimensions and regular
        # inputs and outputs. We limit the query to only dimensions that are
        # associated with the input dataset types, but don't (yet) try to
        # obtain the dataset_ids for those inputs.
        resultIter = registry.queryDimensions(
            self.dimensions,
            datasets=list(self.inputs),
            collections=collections,
            where=userQuery,
        )
        # Iterate over query results and populate the data IDs in each
        # _TaskScaffolding and _DatasetScaffolding, extracting the subsets of
        # the common data ID from the query corresponding to the dimensions of
        # each. By using sets, we remove duplicates caused by query rows in
        # which the dimensions that change are not relevant for that task or
        # dataset type. For example, if the Big Join Query involves the
        # dimensions (instrument, visit, detector, skymap, tract, patch), we
        # extract "calexp" data IDs from the instrument, visit, and detector
        # values only, and rely on `set.add` to avoid duplications due to
        # result rows in which only skymap, tract, and patch are varying. The
        # Big Join Query is defined such that only visit+detector and
        # tract+patch combinations that represent spatial overlaps are
        # included in the results.
        for commonDataId in resultIter:
            for taskScaffolding in self.tasks:
                taskScaffolding.dataIds.add(commonDataId.subset(taskScaffolding.dimensions))
            for datasetType, scaffolding in itertools.chain(self.inputs.items(),
                                                            self.intermediates.items(),
                                                            self.outputs.items()):
                scaffolding.dataIds.add(commonDataId.subset(scaffolding.dimensions))

    def fillDatasetRefs(self, registry, collections, run, *, skipExisting=True):
        """Perform follow-up queries for each dataset data ID produced in
        `fillDataIds`.

        This method populates `_DatasetScaffolding.refs` (except for those in
        `prerequisites`).

        Parameters
        ----------
        registry : `lsst.daf.butler.Registry`
            Registry for the data repository; used for all data ID queries.
        collections : `lsst.daf.butler.CollectionSearch`
            Object representing the collections to search for input datasets.
        run : `str`, optional
            Name of the `~lsst.daf.butler.CollectionType.RUN` collection for
            output datasets, if it already exists.
        skipExisting : `bool`, optional
            If `True` (default), a Quantum is not created if all its outputs
            already exist in ``run``. Ignored if ``run`` is `None`.

        Raises
        ------
        OutputExistsError
            Raised if an output dataset already exists in the output run
            and ``skipExisting`` is `False`. The case where some but not all
            of a quantum's outputs are present and ``skipExisting`` is `True`
            cannot be identified at this stage, and is handled by `fillQuanta`
            instead.
        """
        # Look up input and initInput datasets in the input collection(s).
        for datasetType, scaffolding in itertools.chain(self.initInputs.items(), self.inputs.items()):
            for dataId in scaffolding.dataIds:
                refs = list(
                    registry.queryDatasets(
                        datasetType,
                        collections=collections,
                        dataId=dataId,
                        deduplicate=True,
                        expand=True,
                    )
                )
                assert len(refs) == 1, "BJQ guarantees exactly one input for each data ID."
                scaffolding.refs.extend(refs)
        # Look up [init] intermediate and output datasets in the output
        # collection, if there is one; existing outputs either raise (when
        # skipExisting is False) or allow the quantum to be skipped later.
        for datasetType, scaffolding in itertools.chain(self.initIntermediates.items(),
                                                        self.initOutputs.items(),
                                                        self.intermediates.items(),
                                                        self.outputs.items()):
            for dataId in scaffolding.dataIds:
                # TODO: we could easily support per-DatasetType skipExisting
                # (it might make sense to put them in originInfo), and I could
                # imagine that being useful - it's probably required in order
                # to support writing initOutputs before QuantumGraph
                # generation.
                if run is not None:
                    ref = registry.findDataset(datasetType=datasetType, dataId=dataId, collections=run)
                else:
                    ref = None
                if ref is None:
                    ref = DatasetRef(datasetType, dataId)
                elif not skipExisting:
                    raise OutputExistsError(f"Output dataset {datasetType.name} already exists in "
                                            f"output RUN collection '{run}' with data ID {dataId}.")
                scaffolding.refs.append(ref)
        # Prerequisite dataset lookups are deferred until fillQuanta.

    def fillQuanta(self, registry, collections, *, skipExisting=True):
        """Define quanta for each task by splitting up the datasets associated
        with each task data ID.

        This method populates `_TaskScaffolding.quanta`.

        Parameters
        ----------
        registry : `lsst.daf.butler.Registry`
            Registry for the data repository; used for all data ID queries.
        collections : `lsst.daf.butler.CollectionSearch`
            Object representing the collections to search for input datasets.
        skipExisting : `bool`, optional
            If `True` (default), a Quantum is not created if all its outputs
            already exist.
        """
        for task in self.tasks:
            for quantumDataId in task.dataIds:
                # Identify the (regular) inputs that correspond to the Quantum
                # with this data ID. These are those whose data IDs have the
                # same values for all dimensions they have in common.
                # We do this with data IDs expanded to include implied
                # dimensions, which is why _DatasetScaffolding.dimensions is
                # expanded even though DatasetType.dimensions is not.
                inputs = NamedKeyDict()
                for datasetType, scaffolding in task.inputs.items():
                    inputs[datasetType] = [ref for ref, dataId in zip(scaffolding.refs, scaffolding.dataIds)
                                           if quantumDataId.matches(dataId)]

                _LOG.debug("%s dataId %s has inputs: %s",
                           task.taskDef.taskName, quantumDataId, list(inputs.names))

                # Same for outputs.
                outputs = NamedKeyDict()
                allOutputsPresent = True
                for datasetType, scaffolding in task.outputs.items():
                    outputs[datasetType] = []
                    for ref, dataId in zip(scaffolding.refs, scaffolding.dataIds):
                        if quantumDataId.matches(dataId):
                            if ref.id is None:
                                allOutputsPresent = False
                            else:
                                assert skipExisting, "Existing outputs should have already been identified."
                                if not allOutputsPresent:
                                    raise OutputExistsError(f"Output {datasetType.name} with data ID "
                                                            f"{dataId} already exists, but other outputs "
                                                            f"for task with label {task.taskDef.label} "
                                                            f"and data ID {quantumDataId} do not.")
                            outputs[datasetType].append(ref)
                if allOutputsPresent and skipExisting:
                    continue

                _LOG.debug("%s dataID %s has outputs: %s",
                           task.taskDef.taskName, quantumDataId, list(outputs.names))

                # Look up prerequisite datasets in the input collection(s).
                # These may have dimensions that extend beyond those we
                # queried for originally, because we want to permit those data
                # ID values to differ across quanta and dataset types.
                # For example, the same quantum may have a flat and bias with
                # a different calibration_label, or a refcat with a skypix
                # value that overlaps the quantum's data ID's region, but not
                # the user expression used for the initial query.
                connections = task.taskDef.connections
                for con_name in connections.prerequisiteInputs:
                    con = getattr(connections, con_name)
                    # Find the prerequisite DatasetType registered under the
                    # same name as this connection.
                    for datasetType in task.prerequisites:
                        if datasetType.name == con.name:
                            break
                    if con.lookupFunction is not None:
                        # Use the connection's custom lookup function instead
                        # of a standard registry query.
                        refs = list(con.lookupFunction(datasetType, registry,
                                                       quantumDataId, collections))
                    else:
                        refs = list(
                            registry.queryDatasets(
                                datasetType,
                                collections=collections,
                                dataId=quantumDataId,
                                deduplicate=True,
                                expand=True,
                            )
                        )
                    inputs[datasetType] = refs

                _LOG.debug("%s dataID %s has inputs+prereqs: %s",
                           task.taskDef.taskName, quantumDataId, list(inputs.names))

                task.addQuantum(
                    Quantum(
                        taskName=task.taskDef.taskName,
                        taskClass=task.taskDef.taskClass,
                        dataId=quantumDataId,
                        initInputs=task.initInputs.unpackRefs(),
                        predictedInputs=inputs,
                        outputs=outputs,
                    )
                )

    def makeQuantumGraph(self):
        """Create a `QuantumGraph` from the quanta already present in
        the scaffolding data structure.

        Returns
        -------
        graph : `QuantumGraph`
            The complete `QuantumGraph`.
        """
        graph = QuantumGraph(task.makeQuantumGraphTaskNodes() for task in self.tasks)
        graph.initInputs = self.initInputs.unpackRefs()
        graph.initOutputs = self.initOutputs.unpackRefs()
        graph.initIntermediates = self.initIntermediates.unpackRefs()
        return graph


# ------------------------
# Exported definitions --
# ------------------------


class GraphBuilderError(Exception):
    """Base class for exceptions generated by the graph builder.
    """
    pass


class OutputExistsError(GraphBuilderError):
    """Exception generated when output datasets already exist.
    """
    pass


class PrerequisiteMissingError(GraphBuilderError):
    """Exception generated when a prerequisite dataset does not exist.
    """
    pass


class GraphBuilder(object):
    """GraphBuilder class is responsible for building a task execution graph
    from a Pipeline.

    Parameters
    ----------
    registry : `~lsst.daf.butler.Registry`
        Registry for the data repository.
    skipExisting : `bool`, optional
        If `True` (default), a Quantum is not created if all its outputs
        already exist.
    """

    def __init__(self, registry, skipExisting=True):
        self.registry = registry
        self.dimensions = registry.dimensions
        self.skipExisting = skipExisting

    def makeGraph(self, pipeline, collections, run, userQuery):
        """Create an execution graph for a pipeline.

        Parameters
        ----------
        pipeline : `Pipeline`
            Pipeline definition, task names/classes and their configs.
        collections : `lsst.daf.butler.CollectionSearch`
            Object representing the collections to search for input datasets.
        run : `str`, optional
            Name of the `~lsst.daf.butler.CollectionType.RUN` collection for
            output datasets, if it already exists.
        userQuery : `str`
            String which defines a user-provided selection for the registry;
            should be empty or `None` if there are no restrictions on the data
            selection.

        Returns
        -------
        graph : `QuantumGraph`
            The constructed execution graph.

        Raises
        ------
        UserExpressionError
            Raised when the user expression cannot be parsed.
        OutputExistsError
            Raised when output datasets already exist.
        Exception
            Other exception types may be raised by underlying registry
            classes.
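
        Examples
        --------
        A minimal usage sketch; the registry and collection search are
        assumed to come from an existing butler, and the run name and query
        string below are placeholders::

            builder = GraphBuilder(registry, skipExisting=True)
            graph = builder.makeGraph(
                pipeline,
                collections=collections,
                run="u/example/run",
                userQuery="visit = 1228",
            )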
        """
        scaffolding = _PipelineScaffolding(pipeline, registry=self.registry)
        scaffolding.fillDataIds(self.registry, collections, userQuery)
        scaffolding.fillDatasetRefs(self.registry, collections, run, skipExisting=self.skipExisting)
        scaffolding.fillQuanta(self.registry, collections, skipExisting=self.skipExisting)
        return scaffolding.makeQuantumGraph()