Coverage for python/lsst/pipe/base/graphBuilder.py : 24%

# This file is part of pipe_base.
#
# Developed for the LSST Data Management System.
# This product includes software developed by the LSST Project
# (http://www.lsst.org).
# See the COPYRIGHT file at the top-level directory of this distribution
# for details of code ownership.
#
# This program is free software: you can redistribute it and/or modify
# it under the terms of the GNU General Public License as published by
# the Free Software Foundation, either version 3 of the License, or
# (at your option) any later version.
#
# This program is distributed in the hope that it will be useful,
# but WITHOUT ANY WARRANTY; without even the implied warranty of
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
# GNU General Public License for more details.
#
# You should have received a copy of the GNU General Public License
# along with this program.  If not, see <http://www.gnu.org/licenses/>.
from __future__ import annotations

"""Module defining GraphBuilder class and related methods.
"""

__all__ = ['GraphBuilder']

# -------------------------------
# Imports of standard modules --
# -------------------------------
import itertools
from collections import ChainMap
from dataclasses import dataclass
from typing import Dict, Iterable, Iterator, List
import logging

# -----------------------------
# Imports for other modules --
# -----------------------------
from .connections import iterConnections
from .pipeline import PipelineDatasetTypes, TaskDatasetTypes, TaskDef, Pipeline
from .graph import QuantumGraph, QuantumGraphTaskNodes
from lsst.daf.butler import (
    DataCoordinate,
    DatasetRef,
    DatasetType,
    DimensionGraph,
    DimensionUniverse,
    ExpandedDataCoordinate,
    NamedKeyDict,
    Quantum,
)

# ----------------------------------
# Local non-exported definitions --
# ----------------------------------

_LOG = logging.getLogger(__name__.partition(".")[2])


class _DatasetDict(NamedKeyDict[DatasetType, Dict[DataCoordinate, DatasetRef]]):
    """A custom dictionary that maps `DatasetType` to a nested dictionary of
    the known `DatasetRef` instances of that type.

    Parameters
    ----------
    args
        Positional arguments are forwarded to the `dict` constructor.
    universe : `DimensionUniverse`
        Universe of all possible dimensions.
    """
    def __init__(self, *args, universe: DimensionGraph):
        super().__init__(*args)
        self.universe = universe

    @classmethod
    def fromDatasetTypes(cls, datasetTypes: Iterable[DatasetType], *,
                         universe: DimensionUniverse) -> _DatasetDict:
        """Construct a dictionary from a flat iterable of `DatasetType` keys.

        Parameters
        ----------
        datasetTypes : `iterable` of `DatasetType`
            DatasetTypes to use as keys for the dict.  Values will be empty
            dictionaries.
        universe : `DimensionUniverse`
            Universe of all possible dimensions.

        Returns
        -------
        dictionary : `_DatasetDict`
            A new `_DatasetDict` instance.
        """
        return cls({datasetType: {} for datasetType in datasetTypes}, universe=universe)

    @classmethod
    def fromSubset(cls, datasetTypes: Iterable[DatasetType], first: _DatasetDict, *rest: _DatasetDict
                   ) -> _DatasetDict:
        """Return a new dictionary by extracting items corresponding to the
        given keys from one or more existing dictionaries.

        Parameters
        ----------
        datasetTypes : `iterable` of `DatasetType`
            DatasetTypes to use as keys for the dict.  Values will be obtained
            by lookups against ``first`` and ``rest``.
        first : `_DatasetDict`
            Another dictionary from which to extract values.
        rest
            Additional dictionaries from which to extract values.

        Returns
        -------
        dictionary : `_DatasetDict`
            A new dictionary instance.
        """
        combined = ChainMap(first, *rest)
        return cls({datasetType: combined[datasetType] for datasetType in datasetTypes},
                   universe=first.universe)
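
    # Illustrative note (not part of the original module): ``fromSubset``
    # relies on `collections.ChainMap` looking keys up in the given mappings
    # in order, so when a dataset type appears in more than one parent
    # dictionary the value from the first mapping passed wins.  A minimal
    # stdlib-only sketch of that behavior, with placeholder dataset names:
    #
    #     from collections import ChainMap
    #     first, rest = {"bias": {"a": 1}}, {"bias": {"b": 2}, "flat": {}}
    #     combined = ChainMap(first, rest)
    #     assert combined["bias"] == {"a": 1}   # taken from ``first``
    #     assert combined["flat"] == {}         # falls through to ``rest``
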
    @property
    def dimensions(self) -> DimensionGraph:
        """The union of all dimensions used by all dataset types in this
        dictionary, including implied dependencies (`DimensionGraph`).
        """
        base = self.universe.empty
        if len(self) == 0:
            return base
        return base.union(*[datasetType.dimensions for datasetType in self.keys()])

    def unpackSingleRefs(self) -> NamedKeyDict[DatasetType, DatasetRef]:
        """Unpack nested single-element `DatasetRef` dicts into a new
        mapping with `DatasetType` keys and `DatasetRef` values.

        This method assumes that each nested dictionary contains exactly one
        item, as is the case for all "init" datasets.

        Returns
        -------
        dictionary : `NamedKeyDict`
            Dictionary mapping `DatasetType` to `DatasetRef`, with both
            `DatasetType` instances and string names usable as keys.
        """
        def getOne(refs: Dict[DataCoordinate, DatasetRef]) -> DatasetRef:
            ref, = refs.values()
            return ref
        return NamedKeyDict({datasetType: getOne(refs) for datasetType, refs in self.items()})

    def unpackMultiRefs(self) -> NamedKeyDict[DatasetType, List[DatasetRef]]:
        """Unpack nested multi-element `DatasetRef` dicts into a new
        mapping with `DatasetType` keys and `list` of `DatasetRef` values.

        Returns
        -------
        dictionary : `NamedKeyDict`
            Dictionary mapping `DatasetType` to a `list` of `DatasetRef`, with
            both `DatasetType` instances and string names usable as keys.
        """
        return NamedKeyDict({datasetType: list(refs.values()) for datasetType, refs in self.items()})

    def extract(self, datasetType: DatasetType, dataIds: Iterable[DataCoordinate]
                ) -> Iterator[DatasetRef]:
        """Iterate over the contained `DatasetRef` instances that match the
        given `DatasetType` and data IDs.

        Parameters
        ----------
        datasetType : `DatasetType`
            Dataset type to match.
        dataIds : `Iterable` [ `DataCoordinate` ]
            Data IDs to match.

        Returns
        -------
        refs : `Iterator` [ `DatasetRef` ]
            DatasetRef instances for which ``ref.datasetType == datasetType``
            and ``ref.dataId`` is in ``dataIds``.
        """
        refs = self[datasetType]
        return (refs[dataId] for dataId in dataIds)

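
# Illustrative note (not part of the original module): the nested structure
# that `_DatasetDict` manages, and what the two ``unpack*`` helpers return,
# can be pictured with plain dicts.  The dataset type names used here are
# hypothetical placeholders.
#
#     d = {
#         "calexp": {dataId1: ref1, dataId2: ref2},  # regular dataset: many data IDs
#         "myTask_config": {emptyDataId: initRef},   # "init" dataset: exactly one entry
#     }
#     # unpackSingleRefs() -> {"myTask_config": initRef, ...}   (one ref per type)
#     # unpackMultiRefs()  -> {"calexp": [ref1, ref2], ...}     (list of refs per type)

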
class _QuantumScaffolding:
    """Helper class aggregating information about a `Quantum`, used when
    constructing a `QuantumGraph`.

    See `_PipelineScaffolding` for a top-down description of the full
    scaffolding data structure.

    Parameters
    ----------
    task : _TaskScaffolding
        Back-reference to the helper object for the `PipelineTask` this
        quantum represents an execution of.
    dataId : `DataCoordinate`
        Data ID for this quantum.
    """
    def __init__(self, task: _TaskScaffolding, dataId: DataCoordinate):
        self.task = task
        self.dataId = dataId
        self.inputs = _DatasetDict.fromDatasetTypes(task.inputs.keys(), universe=dataId.universe)
        self.outputs = _DatasetDict.fromDatasetTypes(task.outputs.keys(), universe=dataId.universe)
        self.prerequisites = _DatasetDict.fromDatasetTypes(task.prerequisites.keys(),
                                                           universe=dataId.universe)

    __slots__ = ("task", "dataId", "inputs", "outputs", "prerequisites")

    def __repr__(self):
        return f"_QuantumScaffolding(taskDef={self.task.taskDef}, dataId={self.dataId}, ...)"

    task: _TaskScaffolding
    """Back-reference to the helper object for the `PipelineTask` this quantum
    represents an execution of.
    """

    dataId: DataCoordinate
    """Data ID for this quantum.
    """

    inputs: _DatasetDict
    """Nested dictionary containing `DatasetRef` inputs to this quantum.

    This is initialized to map each `DatasetType` to an empty dictionary at
    construction.  Those nested dictionaries are populated (with data IDs as
    keys) with unresolved `DatasetRef` instances in
    `_PipelineScaffolding.connectDataIds`.
    """

    outputs: _DatasetDict
    """Nested dictionary containing `DatasetRef` outputs of this quantum.
    """

    prerequisites: _DatasetDict
    """Nested dictionary containing `DatasetRef` prerequisite inputs to this
    quantum.
    """

    def makeQuantum(self) -> Quantum:
        """Transform the scaffolding object into a true `Quantum` instance.

        Returns
        -------
        quantum : `Quantum`
            An actual `Quantum` instance.
        """
        allInputs = self.inputs.unpackMultiRefs()
        allInputs.update(self.prerequisites.unpackMultiRefs())
        # Give the task's Connections class an opportunity to remove some
        # inputs, or complain if they are unacceptable.
        config = self.task.taskDef.config
        connections = config.connections.ConnectionsClass(config=config)
        # This will raise if one of the check conditions is not met, which is
        # the intended behavior.
        allInputs = connections.adjustQuantum(allInputs)
        return Quantum(
            taskName=self.task.taskDef.taskName,
            taskClass=self.task.taskDef.taskClass,
            dataId=self.dataId,
            initInputs=self.task.initInputs.unpackSingleRefs(),
            predictedInputs=allInputs,
            outputs=self.outputs.unpackMultiRefs(),
        )

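
# Illustrative note (not part of the original module): ``adjustQuantum`` above
# is a hook on the task's connections class; this module calls it with the
# `NamedKeyDict` of input `DatasetRef` lists built in ``makeQuantum``.  A
# rough sketch of an override, using a hypothetical connections class:
#
#     from lsst.pipe.base import PipelineTaskConnections
#
#     class MyTaskConnections(PipelineTaskConnections,
#                             dimensions=("instrument", "visit", "detector")):
#         def adjustQuantum(self, datasetRefMap):
#             # e.g. drop an unusable optional input, or raise to reject
#             # the quantum entirely.
#             return datasetRefMap

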
@dataclass
class _TaskScaffolding:
    """Helper class aggregating information about a `PipelineTask`, used when
    constructing a `QuantumGraph`.

    See `_PipelineScaffolding` for a top-down description of the full
    scaffolding data structure.

    Parameters
    ----------
    taskDef : `TaskDef`
        Data structure that identifies the task class and its config.
    parent : `_PipelineScaffolding`
        The parent data structure that will hold the instance being
        constructed.
    datasetTypes : `TaskDatasetTypes`
        Data structure that categorizes the dataset types used by this task.
    """
    def __init__(self, taskDef: TaskDef, parent: _PipelineScaffolding, datasetTypes: TaskDatasetTypes):
        universe = parent.dimensions.universe
        self.taskDef = taskDef
        self.dimensions = DimensionGraph(universe, names=taskDef.connections.dimensions)
        assert self.dimensions.issubset(parent.dimensions)
        # Initialize _DatasetDicts as subsets of the one or two
        # corresponding dicts in the parent _PipelineScaffolding.
        self.initInputs = _DatasetDict.fromSubset(datasetTypes.initInputs, parent.initInputs,
                                                  parent.initIntermediates)
        self.initOutputs = _DatasetDict.fromSubset(datasetTypes.initOutputs, parent.initIntermediates,
                                                   parent.initOutputs)
        self.inputs = _DatasetDict.fromSubset(datasetTypes.inputs, parent.inputs, parent.intermediates)
        self.outputs = _DatasetDict.fromSubset(datasetTypes.outputs, parent.intermediates, parent.outputs)
        self.prerequisites = _DatasetDict.fromSubset(datasetTypes.prerequisites, parent.prerequisites)
        self.dataIds = set()
        self.quanta = {}

    def __repr__(self):
        # Default dataclass-injected __repr__ gets caught in an infinite loop
        # because of back-references.
        return f"_TaskScaffolding(taskDef={self.taskDef}, ...)"

    taskDef: TaskDef
    """Data structure that identifies the task class and its config
    (`TaskDef`).
    """

    dimensions: DimensionGraph
    """The dimensions of a single `Quantum` of this task (`DimensionGraph`).
    """

    initInputs: _DatasetDict
    """Dictionary containing information about datasets used to construct this
    task (`_DatasetDict`).
    """

    initOutputs: _DatasetDict
    """Dictionary containing information about datasets produced as a
    side-effect of constructing this task (`_DatasetDict`).
    """

    inputs: _DatasetDict
    """Dictionary containing information about datasets used as regular,
    graph-constraining inputs to this task (`_DatasetDict`).
    """

    outputs: _DatasetDict
    """Dictionary containing information about datasets produced by this task
    (`_DatasetDict`).
    """

    prerequisites: _DatasetDict
    """Dictionary containing information about input datasets that must be
    present in the repository before any Pipeline containing this task is run
    (`_DatasetDict`).
    """

    quanta: Dict[DataCoordinate, _QuantumScaffolding]
    """Dictionary mapping data ID to a scaffolding object for the Quantum of
    this task with that data ID.
    """

    def makeQuantumGraphTaskNodes(self) -> QuantumGraphTaskNodes:
        """Create a `QuantumGraphTaskNodes` instance from the information in
        ``self``.

        Returns
        -------
        nodes : `QuantumGraphTaskNodes`
            The `QuantumGraph` elements corresponding to this task.
        """
        return QuantumGraphTaskNodes(
            taskDef=self.taskDef,
            quanta=[q.makeQuantum() for q in self.quanta.values()],
            initInputs=self.initInputs.unpackSingleRefs(),
            initOutputs=self.initOutputs.unpackSingleRefs(),
        )


@dataclass
class _PipelineScaffolding:
    """A helper data structure that organizes the information involved in
    constructing a `QuantumGraph` for a `Pipeline`.

    Parameters
    ----------
    pipeline : `Pipeline`
        Sequence of tasks from which a graph is to be constructed.  Must
        have nested task classes already imported.
    registry : `lsst.daf.butler.Registry`
        Registry for the data repository; used to categorize dataset types
        and to obtain the universe of all possible dimensions.

    Notes
    -----
    The scaffolding data structure contains nested data structures for both
    tasks (`_TaskScaffolding`) and datasets (`_DatasetDict`).  The dataset
    data structures are shared between the pipeline-level structure (which
    aggregates all datasets and categorizes them from the perspective of the
    complete pipeline) and the individual tasks that use them as inputs and
    outputs.

    `QuantumGraph` construction proceeds in four steps, with each
    corresponding to a different `_PipelineScaffolding` method:

    1. When `_PipelineScaffolding` is constructed, we extract and categorize
       the DatasetTypes used by the pipeline (delegating to
       `PipelineDatasetTypes.fromPipeline`), then use these to construct the
       nested `_TaskScaffolding` and `_DatasetDict` objects.

    2. In `connectDataIds`, we construct and run the "Big Join Query", which
       returns related tuples of all dimensions used to identify any regular
       input, output, and intermediate datasets (not prerequisites).  We then
       iterate over these tuples of related dimensions, identifying the
       subsets that correspond to distinct data IDs for each task and dataset
       type, and then creating `_QuantumScaffolding` objects.

    3. In `resolveDatasetRefs`, we run follow-up queries against all of the
       dataset data IDs previously identified, transforming unresolved
       DatasetRefs into resolved DatasetRefs where appropriate.  We then look
       up prerequisite datasets for all quanta.

    4. In `makeQuantumGraph`, we construct a `QuantumGraph` from the lists of
       per-task `_QuantumScaffolding` objects (see the commented sketch just
       below this docstring).
    """
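    # Illustrative sketch (not part of the original module): the four steps
    # above, driven exactly as `GraphBuilder.makeGraph` at the bottom of this
    # file drives them.  ``registry``, ``collections``, ``run``, and
    # ``userQuery`` are assumed to be supplied by the caller.
    #
    #     scaffolding = _PipelineScaffolding(pipeline, registry=registry)   # step 1
    #     scaffolding.connectDataIds(registry, collections, userQuery)      # step 2
    #     scaffolding.resolveDatasetRefs(registry, collections, run)        # step 3
    #     graph = scaffolding.makeQuantumGraph()                            # step 4
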
    def __init__(self, pipeline, *, registry):
        _LOG.debug("Initializing data structures for QuantumGraph generation.")
        self.tasks = []
        # Aggregate and categorize the DatasetTypes in the Pipeline.
        datasetTypes = PipelineDatasetTypes.fromPipeline(pipeline, registry=registry)
        # Construct dictionaries that map those DatasetTypes to structures
        # that will (later) hold additional information about them.
        for attr in ("initInputs", "initIntermediates", "initOutputs",
                     "inputs", "intermediates", "outputs", "prerequisites"):
            setattr(self, attr, _DatasetDict.fromDatasetTypes(getattr(datasetTypes, attr),
                                                              universe=registry.dimensions))
        # Aggregate all dimensions for all non-init, non-prerequisite
        # DatasetTypes.  These are the ones we'll include in the big join
        # query.
        self.dimensions = self.inputs.dimensions.union(self.intermediates.dimensions,
                                                       self.outputs.dimensions)
        # Construct scaffolding nodes for each Task.  Note that the nested
        # dictionary for each DatasetType is shared by _PipelineScaffolding
        # and all _TaskScaffoldings that reference it.
        if isinstance(pipeline, Pipeline):
            pipeline = pipeline.toExpandedPipeline()
        self.tasks = [_TaskScaffolding(taskDef=taskDef, parent=self, datasetTypes=taskDatasetTypes)
                      for taskDef, taskDatasetTypes in zip(pipeline,
                                                           datasetTypes.byTask.values())]

    def __repr__(self):
        # Default dataclass-injected __repr__ gets caught in an infinite loop
        # because of back-references.
        return f"_PipelineScaffolding(tasks={self.tasks}, ...)"

    tasks: List[_TaskScaffolding]
    """Scaffolding data structures for each task in the pipeline
    (`list` of `_TaskScaffolding`).
    """

    initInputs: _DatasetDict
    """Datasets consumed but not produced when constructing the tasks in this
    pipeline (`_DatasetDict`).
    """

    initIntermediates: _DatasetDict
    """Datasets that are both consumed and produced when constructing the
    tasks in this pipeline (`_DatasetDict`).
    """

    initOutputs: _DatasetDict
    """Datasets produced but not consumed when constructing the tasks in this
    pipeline (`_DatasetDict`).
    """

    inputs: _DatasetDict
    """Datasets that are consumed but not produced when running this pipeline
    (`_DatasetDict`).
    """

    intermediates: _DatasetDict
    """Datasets that are both produced and consumed when running this pipeline
    (`_DatasetDict`).
    """

    outputs: _DatasetDict
    """Datasets produced but not consumed when running this pipeline
    (`_DatasetDict`).
    """

    prerequisites: _DatasetDict
    """Datasets that are consumed when running this pipeline and looked up
    per-Quantum when generating the graph (`_DatasetDict`).
    """

    dimensions: DimensionGraph
    """All dimensions used by any regular input, intermediate, or output
    (not prerequisite) dataset; the set of dimensions used in the "Big Join
    Query" (`DimensionGraph`).

    This is required to be a superset of all task quantum dimensions.
    """

    def connectDataIds(self, registry, collections, userQuery):
        """Query for the data IDs that connect nodes in the `QuantumGraph`.

        This method populates `_TaskScaffolding.quanta` and the data ID keys
        of the nested `_DatasetDict` dictionaries (except for those in
        `prerequisites`).

        Parameters
        ----------
        registry : `lsst.daf.butler.Registry`
            Registry for the data repository; used for all data ID queries.
        collections : `lsst.daf.butler.CollectionSearch`
            Object representing the collections to search for input datasets.
        userQuery : `str`, optional
            User-provided expression to limit the data IDs processed.
        """
        _LOG.debug("Building query for data IDs.")
        # Initialization datasets always have empty data IDs.
        emptyDataId = ExpandedDataCoordinate(registry.dimensions.empty, (), records={})
        for datasetType, refs in itertools.chain(self.initInputs.items(),
                                                 self.initIntermediates.items(),
                                                 self.initOutputs.items()):
            refs[emptyDataId] = DatasetRef(datasetType, emptyDataId)
        # Run one big query for the data IDs for task dimensions and regular
        # inputs and outputs.  We limit the query to only dimensions that are
        # associated with the input dataset types, but don't (yet) try to
        # obtain the dataset_ids for those inputs.
        _LOG.debug("Submitting data ID query and processing results.")
        resultIter = registry.queryDimensions(
            self.dimensions,
            datasets=list(self.inputs),
            collections=collections,
            where=userQuery,
        )
        # Iterate over query results, populating data IDs for datasets and
        # quanta and then connecting them to each other.
        for n, commonDataId in enumerate(resultIter):
            # Create DatasetRefs for all DatasetTypes from this result row,
            # noting that we might have created some already.
            # We remember both those that already existed and those that we
            # create now.
            refsForRow = {}
            for datasetType, refs in itertools.chain(self.inputs.items(), self.intermediates.items(),
                                                     self.outputs.items()):
                datasetDataId = commonDataId.subset(datasetType.dimensions)
                ref = refs.get(datasetDataId)
                if ref is None:
                    ref = DatasetRef(datasetType, datasetDataId)
                    refs[datasetDataId] = ref
                refsForRow[datasetType.name] = ref
            # Create _QuantumScaffolding objects for all tasks from this
            # result row, noting that we might have created some already.
            for task in self.tasks:
                quantumDataId = commonDataId.subset(task.dimensions)
                quantum = task.quanta.get(quantumDataId)
                if quantum is None:
                    quantum = _QuantumScaffolding(task=task, dataId=quantumDataId)
                    task.quanta[quantumDataId] = quantum
                # Whether this is a new quantum or an existing one, we can
                # now associate the DatasetRefs for this row with it.  The
                # fact that a Quantum data ID and a dataset data ID both came
                # from the same result row is what tells us they should be
                # associated.
                # Many of these associations will be duplicates (because
                # another query row that differed from this one only in
                # irrelevant dimensions already added them), but since the
                # nested dictionaries are keyed on data ID, the duplicates
                # are harmless.
                for datasetType in task.inputs:
                    ref = refsForRow[datasetType.name]
                    quantum.inputs[datasetType.name][ref.dataId] = ref
                for datasetType in task.outputs:
                    ref = refsForRow[datasetType.name]
                    quantum.outputs[datasetType.name][ref.dataId] = ref
        _LOG.debug("Finished processing %d rows from data ID query.", n)

    def resolveDatasetRefs(self, registry, collections, run, *, skipExisting=True):
        """Perform follow-up queries for each dataset data ID produced in
        `connectDataIds`.

        This method replaces the unresolved `DatasetRef` instances created by
        `connectDataIds` with resolved ones where appropriate, and looks up
        prerequisite inputs for each quantum.

        Parameters
        ----------
        registry : `lsst.daf.butler.Registry`
            Registry for the data repository; used for all data ID queries.
        collections : `lsst.daf.butler.CollectionSearch`
            Object representing the collections to search for input datasets.
        run : `str`, optional
            Name of the `~lsst.daf.butler.CollectionType.RUN` collection for
            output datasets, if it already exists.
        skipExisting : `bool`, optional
            If `True` (default), a Quantum is not created if all its outputs
            already exist in ``run``.  Ignored if ``run`` is `None`.

        Raises
        ------
        OutputExistsError
            Raised if an output dataset already exists in the output run
            and ``skipExisting`` is `False`, or if only some (but not all) of
            a quantum's outputs already exist in the output run and
            ``skipExisting`` is `True`.
        """
        # Look up [init] intermediate and output datasets in the output
        # collection, if there is an output collection.
        if run is not None:
            for datasetType, refs in itertools.chain(self.initIntermediates.items(),
                                                     self.initOutputs.items(),
                                                     self.intermediates.items(),
                                                     self.outputs.items()):
                _LOG.debug("Resolving %d datasets for intermediate and/or output dataset %s.",
                           len(refs), datasetType.name)
                for dataId, unresolvedRef in refs.items():
                    # TODO: we could easily support per-DatasetType
                    # skipExisting, and I could imagine that being useful -
                    # it's probably required in order to support writing
                    # initOutputs before QuantumGraph generation.
                    ref = registry.findDataset(datasetType=datasetType, dataId=dataId, collections=run)
                    if ref is not None:
                        if skipExisting:
                            refs[dataId] = ref
                        else:
                            raise OutputExistsError(f"Output dataset {datasetType.name} already exists in "
                                                    f"output RUN collection '{run}' with data ID {dataId}.")
        # Look up input and initInput datasets in the input collection(s).
        for datasetType, refs in itertools.chain(self.initInputs.items(), self.inputs.items()):
            _LOG.debug("Resolving %d datasets for input dataset %s.", len(refs), datasetType.name)
            for dataId in refs:
                refs[dataId] = registry.findDataset(datasetType, dataId=dataId, collections=collections)
            if any(ref is None for ref in refs.values()):
                raise RuntimeError(
                    f"One or more datasets of type '{datasetType.name}' were "
                    f"present in a previous query, but could not be found now. "
                    f"This is either a logic bug in QuantumGraph generation, "
                    f"or the input collections have been modified since "
                    f"QuantumGraph generation began."
                )
        # Copy the resolved DatasetRefs to the _QuantumScaffolding objects,
        # replacing the unresolved refs there, and then look up prerequisites.
        for task in self.tasks:
            _LOG.debug(
                "Applying resolutions and finding prerequisites for %d quanta of task with label '%s'.",
                len(task.quanta),
                task.taskDef.label
            )
            lookupFunctions = {
                c.name: c.lookupFunction
                for c in iterConnections(task.taskDef.connections, "prerequisiteInputs")
                if c.lookupFunction is not None
            }
            dataIdsToSkip = []
            for quantum in task.quanta.values():
                # Process output datasets only if there is a run to look for
                # outputs in and skipExisting is True.  Note that if
                # skipExisting is False, any output datasets that already
                # exist would have already caused an exception to be raised.
                # We never update the DatasetRefs in the quantum because
                # those should never be resolved.
                if run is not None and skipExisting:
                    resolvedRefs = []
                    unresolvedRefs = []
                    for datasetType, originalRefs in quantum.outputs.items():
                        for ref in task.outputs.extract(datasetType, originalRefs.keys()):
                            if ref.id is not None:
                                resolvedRefs.append(ref)
                            else:
                                unresolvedRefs.append(ref)
                    if resolvedRefs:
                        if unresolvedRefs:
                            raise OutputExistsError(
                                f"Quantum {quantum.dataId} of task with label "
                                f"'{task.taskDef.label}' has some outputs that exist ({resolvedRefs}) "
                                f"and others that don't ({unresolvedRefs})."
                            )
                        else:
                            # All outputs are already present; skip this
                            # quantum and continue to the next.
                            dataIdsToSkip.append(quantum.dataId)
                            continue
                # Update the input DatasetRefs to the resolved ones we
                # already searched for.
                for datasetType, refs in quantum.inputs.items():
                    for ref in task.inputs.extract(datasetType, refs.keys()):
                        refs[ref.dataId] = ref
                # Look up prerequisite datasets in the input collection(s).
                # These may have dimensions that extend beyond those we
                # queried for originally, because we want to permit those
                # data ID values to differ across quanta and dataset types.
                # For example, the same quantum may have a flat and bias with
                # a different calibration_label, or a refcat with a skypix
                # value that overlaps the quantum's data ID's region, but not
                # the user expression used for the initial query.
                for datasetType in task.prerequisites:
                    lookupFunction = lookupFunctions.get(datasetType.name)
                    if lookupFunction is not None:
                        refs = list(
                            lookupFunction(datasetType, registry, quantum.dataId, collections)
                        )
                    else:
                        refs = list(
                            registry.queryDatasets(
                                datasetType,
                                collections=collections,
                                dataId=quantum.dataId,
                                deduplicate=True,
                                expand=True,
                            )
                        )
                    quantum.prerequisites[datasetType].update({ref.dataId: ref for ref in refs})
            # Actually remove any quanta that we decided to skip above.
            if dataIdsToSkip:
                _LOG.debug("Pruning %d quanta for task with label '%s' because all of their outputs exist.",
                           len(dataIdsToSkip), task.taskDef.label)
                for dataId in dataIdsToSkip:
                    del task.quanta[dataId]

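    # Illustrative note (not part of the original module): a prerequisite
    # input's ``lookupFunction`` is called above as
    # ``lookupFunction(datasetType, registry, quantum.dataId, collections)``
    # and must return an iterable of `DatasetRef`.  A rough sketch of one,
    # with hypothetical names, that simply defers to the registry:
    #
    #     def lookupMyRefcat(datasetType, registry, quantumDataId, collections):
    #         return registry.queryDatasets(datasetType, collections=collections,
    #                                       dataId=quantumDataId, deduplicate=True,
    #                                       expand=True)
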
    def makeQuantumGraph(self):
        """Create a `QuantumGraph` from the quanta already present in
        the scaffolding data structure.

        Returns
        -------
        graph : `QuantumGraph`
            The full `QuantumGraph`.
        """
        graph = QuantumGraph(task.makeQuantumGraphTaskNodes() for task in self.tasks)
        graph.initInputs = self.initInputs.unpackSingleRefs()
        graph.initOutputs = self.initOutputs.unpackSingleRefs()
        graph.initIntermediates = self.initIntermediates.unpackSingleRefs()
        return graph


# ------------------------
# Exported definitions --
# ------------------------


class GraphBuilderError(Exception):
    """Base class for exceptions generated by graph builder.
    """
    pass


class OutputExistsError(GraphBuilderError):
    """Exception generated when output datasets already exist.
    """
    pass


class PrerequisiteMissingError(GraphBuilderError):
    """Exception generated when a prerequisite dataset does not exist.
    """
    pass


class GraphBuilder(object):
    """GraphBuilder class is responsible for building a task execution graph
    from a Pipeline.

    Parameters
    ----------
    registry : `~lsst.daf.butler.Registry`
        Registry for the data repository; used for all data ID and dataset
        queries.
    skipExisting : `bool`, optional
        If `True` (default), a Quantum is not created if all its outputs
        already exist.
    """

    def __init__(self, registry, skipExisting=True):
        self.registry = registry
        self.dimensions = registry.dimensions
        self.skipExisting = skipExisting

    def makeGraph(self, pipeline, collections, run, userQuery):
        """Create execution graph for a pipeline.

        Parameters
        ----------
        pipeline : `Pipeline`
            Pipeline definition, task names/classes and their configs.
        collections : `lsst.daf.butler.CollectionSearch`
            Object representing the collections to search for input datasets.
        run : `str`, optional
            Name of the `~lsst.daf.butler.CollectionType.RUN` collection for
            output datasets, if it already exists.
        userQuery : `str`
            String that defines a user-provided selection expression for the
            registry; should be empty or `None` if there are no restrictions
            on data selection.

        Returns
        -------
        graph : `QuantumGraph`
            The constructed execution graph.

        Raises
        ------
        UserExpressionError
            Raised when the user expression cannot be parsed.
        OutputExistsError
            Raised when output datasets already exist.
        Exception
            Other exception types may be raised by underlying registry
            classes.
        """
        scaffolding = _PipelineScaffolding(pipeline, registry=self.registry)
        scaffolding.connectDataIds(self.registry, collections, userQuery)
        scaffolding.resolveDatasetRefs(self.registry, collections, run, skipExisting=self.skipExisting)
        return scaffolding.makeQuantumGraph()

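
# Illustrative usage sketch (not part of the original module), assuming a
# Butler data repository already exists at ``REPO``, that ``pipeline`` is a
# `Pipeline` whose task classes are importable, and that ``collections`` is a
# `~lsst.daf.butler.CollectionSearch` built elsewhere; the run name and the
# query expression below are hypothetical placeholders.
#
#     from lsst.daf.butler import Butler
#
#     butler = Butler(REPO)
#     builder = GraphBuilder(butler.registry, skipExisting=True)
#     graph = builder.makeGraph(pipeline, collections, run="u/someone/demo",
#                               userQuery="visit = 903334 AND detector = 22")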