Coverage for python/lsst/pipe/base/graphBuilder.py : 23%

# This file is part of pipe_base.
#
# Developed for the LSST Data Management System.
# This product includes software developed by the LSST Project
# (http://www.lsst.org).
# See the COPYRIGHT file at the top-level directory of this distribution
# for details of code ownership.
#
# This program is free software: you can redistribute it and/or modify
# it under the terms of the GNU General Public License as published by
# the Free Software Foundation, either version 3 of the License, or
# (at your option) any later version.
#
# This program is distributed in the hope that it will be useful,
# but WITHOUT ANY WARRANTY; without even the implied warranty of
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
# GNU General Public License for more details.
#
# You should have received a copy of the GNU General Public License
# along with this program. If not, see <http://www.gnu.org/licenses/>.
from __future__ import annotations

"""Module defining GraphBuilder class and related methods.
"""

__all__ = ['GraphBuilder']

# -------------------------------
# Imports of standard modules --
# -------------------------------
import itertools
from collections import ChainMap
from dataclasses import dataclass
from typing import Dict, Iterable, Iterator, List
import logging

# -----------------------------
# Imports for other modules --
# -----------------------------
from .connections import iterConnections
from .pipeline import PipelineDatasetTypes, TaskDatasetTypes, TaskDef, Pipeline
from .graph import QuantumGraph, QuantumGraphTaskNodes
from lsst.daf.butler import (
    DataCoordinate,
    DatasetRef,
    DatasetType,
    DimensionGraph,
    DimensionUniverse,
    NamedKeyDict,
    Quantum,
)

# ----------------------------------
# Local non-exported definitions --
# ----------------------------------

_LOG = logging.getLogger(__name__.partition(".")[2])


class _DatasetDict(NamedKeyDict[DatasetType, Dict[DataCoordinate, DatasetRef]]):
    """A custom dictionary that maps `DatasetType` to a nested dictionary of
    the known `DatasetRef` instances of that type.

    Parameters
    ----------
    args
        Positional arguments are forwarded to the `dict` constructor.
    universe : `DimensionUniverse`
        Universe of all possible dimensions.
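
    Notes
    -----
    Conceptually, refs are nested first by dataset type and then by data ID.
    A rough sketch of the shape (the dataset type and data ID values below
    are illustrative placeholders, not real butler objects)::

        {
            DatasetType("calexp", ...): {
                DataCoordinate({"visit": 42, "detector": 1}): DatasetRef(...),
                DataCoordinate({"visit": 42, "detector": 2}): DatasetRef(...),
            },
        }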
    """
    def __init__(self, *args, universe: DimensionGraph):
        super().__init__(*args)
        self.universe = universe

    @classmethod
    def fromDatasetTypes(cls, datasetTypes: Iterable[DatasetType], *,
                         universe: DimensionUniverse) -> _DatasetDict:
        """Construct a dictionary from a flat iterable of `DatasetType` keys.

        Parameters
        ----------
        datasetTypes : `iterable` of `DatasetType`
            DatasetTypes to use as keys for the dict. Values will be empty
            dictionaries.
        universe : `DimensionUniverse`
            Universe of all possible dimensions.

        Returns
        -------
        dictionary : `_DatasetDict`
            A new `_DatasetDict` instance.
        """
        return cls({datasetType: {} for datasetType in datasetTypes}, universe=universe)

    @classmethod
    def fromSubset(cls, datasetTypes: Iterable[DatasetType], first: _DatasetDict, *rest: _DatasetDict
                   ) -> _DatasetDict:
        """Return a new dictionary by extracting items corresponding to the
        given keys from one or more existing dictionaries.

        Parameters
        ----------
        datasetTypes : `iterable` of `DatasetType`
            DatasetTypes to use as keys for the dict. Values will be obtained
            by lookups against ``first`` and ``rest``.
        first : `_DatasetDict`
            Another dictionary from which to extract values.
        rest
            Additional dictionaries from which to extract values.

        Returns
        -------
        dictionary : `_DatasetDict`
            A new dictionary instance.
        """
        combined = ChainMap(first, *rest)
        return cls({datasetType: combined[datasetType] for datasetType in datasetTypes},
                   universe=first.universe)

    @property
    def dimensions(self) -> DimensionGraph:
        """The union of all dimensions used by all dataset types in this
        dictionary, including implied dependencies (`DimensionGraph`).
        """
        base = self.universe.empty
        if len(self) == 0:
            return base
        return base.union(*[datasetType.dimensions for datasetType in self.keys()])

    def unpackSingleRefs(self) -> NamedKeyDict[DatasetType, DatasetRef]:
        """Unpack nested single-element `DatasetRef` dicts into a new
        mapping with `DatasetType` keys and `DatasetRef` values.

        This method assumes that each nest contains exactly one item, as is
        the case for all "init" datasets.

        Returns
        -------
        dictionary : `NamedKeyDict`
            Dictionary mapping `DatasetType` to `DatasetRef`, with both
            `DatasetType` instances and string names usable as keys.
        """
        def getOne(refs: Dict[DataCoordinate, DatasetRef]) -> DatasetRef:
            ref, = refs.values()
            return ref
        return NamedKeyDict({datasetType: getOne(refs) for datasetType, refs in self.items()})

    def unpackMultiRefs(self) -> NamedKeyDict[DatasetType, DatasetRef]:
        """Unpack nested multi-element `DatasetRef` dicts into a new
        mapping with `DatasetType` keys and `list` of `DatasetRef` values.

        Returns
        -------
        dictionary : `NamedKeyDict`
            Dictionary mapping `DatasetType` to a `list` of `DatasetRef`, with
            both `DatasetType` instances and string names usable as keys.
        """
        return NamedKeyDict({datasetType: list(refs.values()) for datasetType, refs in self.items()})

    def extract(self, datasetType: DatasetType, dataIds: Iterable[DataCoordinate]
                ) -> Iterator[DatasetRef]:
        """Iterate over the contained `DatasetRef` instances that match the
        given `DatasetType` and data IDs.

        Parameters
        ----------
        datasetType : `DatasetType`
            Dataset type to match.
        dataIds : `Iterable` [ `DataCoordinate` ]
            Data IDs to match.

        Returns
        -------
        refs : `Iterator` [ `DatasetRef` ]
            DatasetRef instances for which ``ref.datasetType == datasetType``
            and ``ref.dataId`` is in ``dataIds``.
        """
        refs = self[datasetType]
        return (refs[dataId] for dataId in dataIds)


class _QuantumScaffolding:
    """Helper class aggregating information about a `Quantum`, used when
    constructing a `QuantumGraph`.

    See `_PipelineScaffolding` for a top-down description of the full
    scaffolding data structure.

    Parameters
    ----------
    task : _TaskScaffolding
        Back-reference to the helper object for the `PipelineTask` this quantum
        represents an execution of.
    dataId : `DataCoordinate`
        Data ID for this quantum.
    """
    def __init__(self, task: _TaskScaffolding, dataId: DataCoordinate):
        self.task = task
        self.dataId = dataId
        self.inputs = _DatasetDict.fromDatasetTypes(task.inputs.keys(), universe=dataId.universe)
        self.outputs = _DatasetDict.fromDatasetTypes(task.outputs.keys(), universe=dataId.universe)
        self.prerequisites = _DatasetDict.fromDatasetTypes(task.prerequisites.keys(),
                                                           universe=dataId.universe)

    __slots__ = ("task", "dataId", "inputs", "outputs", "prerequisites")

    def __repr__(self):
        return f"_QuantumScaffolding(taskDef={self.task.taskDef}, dataId={self.dataId}, ...)"

    task: _TaskScaffolding
    """Back-reference to the helper object for the `PipelineTask` this quantum
    represents an execution of.
    """

    dataId: DataCoordinate
    """Data ID for this quantum.
    """

    inputs: _DatasetDict
    """Nested dictionary containing `DatasetRef` inputs to this quantum.

    This is initialized to map each `DatasetType` to an empty dictionary at
    construction. Those nested dictionaries are populated (with data IDs as
    keys) with unresolved `DatasetRef` instances in
    `_PipelineScaffolding.connectDataIds`.
    """

    outputs: _DatasetDict
    """Nested dictionary containing `DatasetRef` outputs of this quantum.
    """

    prerequisites: _DatasetDict
    """Nested dictionary containing `DatasetRef` prerequisite inputs to this
    quantum.
    """

    def makeQuantum(self) -> Quantum:
        """Transform the scaffolding object into a true `Quantum` instance.

        Returns
        -------
        quantum : `Quantum`
            An actual `Quantum` instance.
        """
        allInputs = self.inputs.unpackMultiRefs()
        allInputs.update(self.prerequisites.unpackMultiRefs())
        # Give the task's Connections class an opportunity to remove some
        # inputs, or complain if they are unacceptable.
        config = self.task.taskDef.config
        connections = config.connections.ConnectionsClass(config=config)
        # This will raise if one of the check conditions is not met, which is
        # the intended behavior.
        allInputs = connections.adjustQuantum(allInputs)
        return Quantum(
            taskName=self.task.taskDef.taskName,
            taskClass=self.task.taskDef.taskClass,
            dataId=self.dataId,
            initInputs=self.task.initInputs.unpackSingleRefs(),
            predictedInputs=allInputs,
            outputs=self.outputs.unpackMultiRefs(),
        )


@dataclass
class _TaskScaffolding:
    """Helper class aggregating information about a `PipelineTask`, used when
    constructing a `QuantumGraph`.

    See `_PipelineScaffolding` for a top-down description of the full
    scaffolding data structure.

    Parameters
    ----------
    taskDef : `TaskDef`
        Data structure that identifies the task class and its config.
    parent : `_PipelineScaffolding`
        The parent data structure that will hold the instance being
        constructed.
    datasetTypes : `TaskDatasetTypes`
        Data structure that categorizes the dataset types used by this task.
    """
    def __init__(self, taskDef: TaskDef, parent: _PipelineScaffolding, datasetTypes: TaskDatasetTypes):
        universe = parent.dimensions.universe
        self.taskDef = taskDef
        self.dimensions = DimensionGraph(universe, names=taskDef.connections.dimensions)
        assert self.dimensions.issubset(parent.dimensions)
        # Initialize _DatasetDicts as subsets of the one or two
        # corresponding dicts in the parent _PipelineScaffolding.
        self.initInputs = _DatasetDict.fromSubset(datasetTypes.initInputs, parent.initInputs,
                                                  parent.initIntermediates)
        self.initOutputs = _DatasetDict.fromSubset(datasetTypes.initOutputs, parent.initIntermediates,
                                                   parent.initOutputs)
        self.inputs = _DatasetDict.fromSubset(datasetTypes.inputs, parent.inputs, parent.intermediates)
        self.outputs = _DatasetDict.fromSubset(datasetTypes.outputs, parent.intermediates, parent.outputs)
        self.prerequisites = _DatasetDict.fromSubset(datasetTypes.prerequisites, parent.prerequisites)
        self.dataIds = set()
        self.quanta = {}

    def __repr__(self):
        # Default dataclass-injected __repr__ gets caught in an infinite loop
        # because of back-references.
        return f"_TaskScaffolding(taskDef={self.taskDef}, ...)"

    taskDef: TaskDef
    """Data structure that identifies the task class and its config
    (`TaskDef`).
    """

    dimensions: DimensionGraph
    """The dimensions of a single `Quantum` of this task (`DimensionGraph`).
    """

    initInputs: _DatasetDict
    """Dictionary containing information about datasets used to construct this
    task (`_DatasetDict`).
    """

    initOutputs: _DatasetDict
    """Dictionary containing information about datasets produced as a
    side-effect of constructing this task (`_DatasetDict`).
    """

    inputs: _DatasetDict
    """Dictionary containing information about datasets used as regular,
    graph-constraining inputs to this task (`_DatasetDict`).
    """

    outputs: _DatasetDict
    """Dictionary containing information about datasets produced by this task
    (`_DatasetDict`).
    """

    prerequisites: _DatasetDict
    """Dictionary containing information about input datasets that must be
    present in the repository before any Pipeline containing this task is run
    (`_DatasetDict`).
    """

    quanta: Dict[DataCoordinate, _QuantumScaffolding]
    """Dictionary mapping data ID to a scaffolding object for the Quantum of
    this task with that data ID.
    """

    def makeQuantumGraphTaskNodes(self) -> QuantumGraphTaskNodes:
        """Create a `QuantumGraphTaskNodes` instance from the information in
        ``self``.

        Returns
        -------
        nodes : `QuantumGraphTaskNodes`
            The `QuantumGraph` elements corresponding to this task.
        """
        return QuantumGraphTaskNodes(
            taskDef=self.taskDef,
            quanta=[q.makeQuantum() for q in self.quanta.values()],
            initInputs=self.initInputs.unpackSingleRefs(),
            initOutputs=self.initOutputs.unpackSingleRefs(),
        )


@dataclass
class _PipelineScaffolding:
    """A helper data structure that organizes the information involved in
    constructing a `QuantumGraph` for a `Pipeline`.

    Parameters
    ----------
    pipeline : `Pipeline`
        Sequence of tasks from which a graph is to be constructed. Must
        have nested task classes already imported.
    universe : `DimensionUniverse`
        Universe of all possible dimensions.

    Notes
    -----
    The scaffolding data structure contains nested data structures for both
    tasks (`_TaskScaffolding`) and datasets (`_DatasetDict`). The dataset
    data structures are shared between the pipeline-level structure (which
    aggregates all datasets and categorizes them from the perspective of the
    complete pipeline) and the individual tasks that use them as inputs and
    outputs.

    `QuantumGraph` construction proceeds in four steps, with each corresponding
    to a different `_PipelineScaffolding` method:

    1. When `_PipelineScaffolding` is constructed, we extract and categorize
       the DatasetTypes used by the pipeline (delegating to
       `PipelineDatasetTypes.fromPipeline`), then use these to construct the
       nested `_TaskScaffolding` and `_DatasetDict` objects.

    2. In `connectDataIds`, we construct and run the "Big Join Query", which
       returns related tuples of all dimensions used to identify any regular
       input, output, and intermediate datasets (not prerequisites). We then
       iterate over these tuples of related dimensions, identifying the
       subsets that correspond to distinct data IDs for each task and dataset
       type, and then create `_QuantumScaffolding` objects.

    3. In `resolveDatasetRefs`, we run follow-up queries against all of the
       dataset data IDs previously identified, transforming unresolved
       DatasetRefs into resolved DatasetRefs where appropriate. We then look
       up prerequisite datasets for all quanta.

    4. In `makeQuantumGraph`, we construct a `QuantumGraph` from the lists of
       per-task `_QuantumScaffolding` objects.
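
    Condensed, this is the sequence of calls that `GraphBuilder.makeGraph`
    drives (a sketch of the steps above, not an additional API)::

        scaffolding = _PipelineScaffolding(pipeline, registry=registry)
        scaffolding.connectDataIds(registry, collections, userQuery)
        scaffolding.resolveDatasetRefs(registry, collections, run)
        graph = scaffolding.makeQuantumGraph()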
    """
    def __init__(self, pipeline, *, registry):
        _LOG.debug("Initializing data structures for QuantumGraph generation.")
        self.tasks = []
        # Aggregate and categorize the DatasetTypes in the Pipeline.
        datasetTypes = PipelineDatasetTypes.fromPipeline(pipeline, registry=registry)
        # Construct dictionaries that map those DatasetTypes to structures
        # that will (later) hold additional information about them.
        for attr in ("initInputs", "initIntermediates", "initOutputs",
                     "inputs", "intermediates", "outputs", "prerequisites"):
            setattr(self, attr, _DatasetDict.fromDatasetTypes(getattr(datasetTypes, attr),
                                                              universe=registry.dimensions))
        # Aggregate all dimensions for all non-init, non-prerequisite
        # DatasetTypes. These are the ones we'll include in the big join query.
        self.dimensions = self.inputs.dimensions.union(self.intermediates.dimensions,
                                                       self.outputs.dimensions)
        # Construct scaffolding nodes for each Task, and add backreferences
        # to the Task from each DatasetScaffolding node.
        # Note that there's only one scaffolding node for each DatasetType,
        # shared by _PipelineScaffolding and all _TaskScaffoldings that
        # reference it.
        if isinstance(pipeline, Pipeline):
            pipeline = pipeline.toExpandedPipeline()
        self.tasks = [_TaskScaffolding(taskDef=taskDef, parent=self, datasetTypes=taskDatasetTypes)
                      for taskDef, taskDatasetTypes in zip(pipeline,
                                                           datasetTypes.byTask.values())]

    def __repr__(self):
        # Default dataclass-injected __repr__ gets caught in an infinite loop
        # because of back-references.
        return f"_PipelineScaffolding(tasks={self.tasks}, ...)"

    tasks: List[_TaskScaffolding]
    """Scaffolding data structures for each task in the pipeline
    (`list` of `_TaskScaffolding`).
    """

    initInputs: _DatasetDict
    """Datasets consumed but not produced when constructing the tasks in this
    pipeline (`_DatasetDict`).
    """

    initIntermediates: _DatasetDict
    """Datasets that are both consumed and produced when constructing the tasks
    in this pipeline (`_DatasetDict`).
    """

    initOutputs: _DatasetDict
    """Datasets produced but not consumed when constructing the tasks in this
    pipeline (`_DatasetDict`).
    """

    inputs: _DatasetDict
    """Datasets that are consumed but not produced when running this pipeline
    (`_DatasetDict`).
    """

    intermediates: _DatasetDict
    """Datasets that are both produced and consumed when running this pipeline
    (`_DatasetDict`).
    """

    outputs: _DatasetDict
    """Datasets produced but not consumed when running this pipeline
    (`_DatasetDict`).
    """

    prerequisites: _DatasetDict
    """Datasets that are consumed when running this pipeline and looked up
    per-Quantum when generating the graph (`_DatasetDict`).
    """

    dimensions: DimensionGraph
    """All dimensions used by any regular input, intermediate, or output
    (not prerequisite) dataset; the set of dimensions used in the "Big Join
    Query" (`DimensionGraph`).

    This is required to be a superset of all task quantum dimensions.
    """

    def connectDataIds(self, registry, collections, userQuery):
        """Query for the data IDs that connect nodes in the `QuantumGraph`.

        This method populates `_TaskScaffolding.dataIds` and the nested
        data ID dictionaries of each `_DatasetDict` (except those in
        `prerequisites`).

        Parameters
        ----------
        registry : `lsst.daf.butler.Registry`
            Registry for the data repository; used for all data ID queries.
        collections : `lsst.daf.butler.CollectionSearch`
            Object representing the collections to search for input datasets.
        userQuery : `str`, optional
            User-provided expression to limit the data IDs processed.
        """
        _LOG.debug("Building query for data IDs.")
        # Initialization datasets always have empty data IDs.
        emptyDataId = DataCoordinate.makeEmpty(registry.dimensions)
        for datasetType, refs in itertools.chain(self.initInputs.items(),
                                                 self.initIntermediates.items(),
                                                 self.initOutputs.items()):
            refs[emptyDataId] = DatasetRef(datasetType, emptyDataId)
        # Run one big query for the data IDs for task dimensions and regular
        # inputs and outputs. We limit the query to only dimensions that are
        # associated with the input dataset types, but don't (yet) try to
        # obtain the dataset_ids for those inputs.
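        # This is the "Big Join Query" described in the _PipelineScaffolding
        # class docstring.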
        _LOG.debug("Submitting data ID query and processing results.")
        resultIter = registry.queryDimensions(
            self.dimensions,
            datasets=list(self.inputs),
            collections=collections,
            where=userQuery,
        )
        # Iterate over query results, populating data IDs for datasets and
        # quanta and then connecting them to each other.
        n = -1  # If we had no results
        for n, commonDataId in enumerate(resultIter):
            # Create DatasetRefs for all DatasetTypes from this result row,
            # noting that we might have created some already.
            # We remember both those that already existed and those that we
            # create now.
            refsForRow = {}
            for datasetType, refs in itertools.chain(self.inputs.items(), self.intermediates.items(),
                                                     self.outputs.items()):
                datasetDataId = commonDataId.subset(datasetType.dimensions)
                ref = refs.get(datasetDataId)
                if ref is None:
                    ref = DatasetRef(datasetType, datasetDataId)
                    refs[datasetDataId] = ref
                refsForRow[datasetType.name] = ref
            # Create _QuantumScaffolding objects for all tasks from this result
            # row, noting that we might have created some already.
            for task in self.tasks:
                quantumDataId = commonDataId.subset(task.dimensions)
                quantum = task.quanta.get(quantumDataId)
                if quantum is None:
                    quantum = _QuantumScaffolding(task=task, dataId=quantumDataId)
                    task.quanta[quantumDataId] = quantum
                # Whether this is a new quantum or an existing one, we can now
                # associate the DatasetRefs for this row with it. The fact
                # that a Quantum data ID and a dataset data ID both came from
                # the same result row is what tells us they should be
                # associated.
                # Many of these associations will be duplicates (because
                # another query row that differed from this one only in
                # irrelevant dimensions already added them), and keying the
                # nested dicts by data ID makes those duplicates harmless.
                for datasetType in task.inputs:
                    ref = refsForRow[datasetType.name]
                    quantum.inputs[datasetType.name][ref.dataId] = ref
                for datasetType in task.outputs:
                    ref = refsForRow[datasetType.name]
                    quantum.outputs[datasetType.name][ref.dataId] = ref
        if n >= 0:
            _LOG.debug("Finished processing %d rows from data ID query.", n+1)
        else:
            _LOG.debug("Received no rows from data ID query.")

    def resolveDatasetRefs(self, registry, collections, run, *, skipExisting=True):
        """Perform follow-up queries for each dataset data ID produced in
        `connectDataIds`.

        This method populates the nested `DatasetRef` dictionaries of each
        `_DatasetDict` with resolved references (except those in
        `prerequisites`).

        Parameters
        ----------
        registry : `lsst.daf.butler.Registry`
            Registry for the data repository; used for all data ID queries.
        collections : `lsst.daf.butler.CollectionSearch`
            Object representing the collections to search for input datasets.
        run : `str`, optional
            Name of the `~lsst.daf.butler.CollectionType.RUN` collection for
            output datasets, if it already exists.
        skipExisting : `bool`, optional
            If `True` (default), a Quantum is not created if all its outputs
            already exist in ``run``. Ignored if ``run`` is `None`.

        Raises
        ------
        OutputExistsError
            Raised if an output dataset already exists in the output run
            and ``skipExisting`` is `False`. The case where some but not all
            of a quantum's outputs are present and ``skipExisting`` is `True`
            cannot be identified at this stage, and is handled farther down
            in this method instead.
        """
        # Look up [init] intermediate and output datasets in the output
        # collection, if there is an output collection.
        if run is not None:
            for datasetType, refs in itertools.chain(self.initIntermediates.items(),
                                                     self.initOutputs.items(),
                                                     self.intermediates.items(),
                                                     self.outputs.items()):
                _LOG.debug("Resolving %d datasets for intermediate and/or output dataset %s.",
                           len(refs), datasetType.name)
                for dataId, unresolvedRef in refs.items():
                    # TODO: we could easily support per-DatasetType
                    # skipExisting and I could imagine that being useful - it's
                    # probably required in order to support writing initOutputs
                    # before QuantumGraph generation.
                    ref = registry.findDataset(datasetType=datasetType, dataId=dataId, collections=run)
                    if ref is not None:
                        if skipExisting:
                            refs[dataId] = ref
                        else:
                            raise OutputExistsError(f"Output dataset {datasetType.name} already exists in "
                                                    f"output RUN collection '{run}' with data ID {dataId}.")
        # Look up input and initInput datasets in the input collection(s).
        for datasetType, refs in itertools.chain(self.initInputs.items(), self.inputs.items()):
            _LOG.debug("Resolving %d datasets for input dataset %s.", len(refs), datasetType.name)
            for dataId in refs:
                refs[dataId] = registry.findDataset(datasetType, dataId=dataId, collections=collections)
            if any(ref is None for ref in refs.values()):
                raise RuntimeError(
                    f"One or more datasets of type '{datasetType.name}' were "
                    f"present in a previous query, but could not be found now. "
                    f"This is either a logic bug in QuantumGraph generation, "
                    f"or the input collections have been modified since "
                    f"QuantumGraph generation began."
                )
        # Copy the resolved DatasetRefs to the _QuantumScaffolding objects,
        # replacing the unresolved refs there, and then look up prerequisites.
        for task in self.tasks:
            _LOG.debug(
                "Applying resolutions and finding prerequisites for %d quanta of task with label '%s'.",
                len(task.quanta),
                task.taskDef.label
            )
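            # Gather any custom lookup functions declared by this task's
            # prerequisite-input connections; when present, these are used
            # below instead of the default registry.queryDatasets() call.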
            lookupFunctions = {
                c.name: c.lookupFunction
                for c in iterConnections(task.taskDef.connections, "prerequisiteInputs")
                if c.lookupFunction is not None
            }
            dataIdsToSkip = []
            for quantum in task.quanta.values():
                # Process output datasets only if there is a run to look for
                # outputs in and skipExisting is True. Note that if
                # skipExisting is False, any output datasets that already exist
                # would have already caused an exception to be raised.
                # We never update the DatasetRefs in the quantum because those
                # should never be resolved.
                if run is not None and skipExisting:
                    resolvedRefs = []
                    unresolvedRefs = []
                    for datasetType, originalRefs in quantum.outputs.items():
                        for ref in task.outputs.extract(datasetType, originalRefs.keys()):
                            if ref.id is not None:
                                resolvedRefs.append(ref)
                            else:
                                unresolvedRefs.append(ref)
                    if resolvedRefs:
                        if unresolvedRefs:
                            raise OutputExistsError(
                                f"Quantum {quantum.dataId} of task with label "
                                f"'{quantum.task.taskDef.label}' has some outputs that exist "
                                f"({resolvedRefs}) and others that don't ({unresolvedRefs})."
                            )
                        else:
                            # All outputs are already present; skip this
                            # quantum and continue to the next.
                            dataIdsToSkip.append(quantum.dataId)
                            continue
                # Update the input DatasetRefs to the resolved ones we already
                # searched for.
                for datasetType, refs in quantum.inputs.items():
                    for ref in task.inputs.extract(datasetType, refs.keys()):
                        refs[ref.dataId] = ref
                # Look up prerequisite datasets in the input collection(s).
                # These may have dimensions that extend beyond those we queried
                # for originally, because we want to permit those data ID
                # values to differ across quanta and dataset types.
                # For example, the same quantum may have a flat and bias with
                # a different calibration_label, or a refcat with a skypix
                # value that overlaps the quantum's data ID's region, but not
                # the user expression used for the initial query.
                for datasetType in task.prerequisites:
                    lookupFunction = lookupFunctions.get(datasetType.name)
                    if lookupFunction is not None:
                        refs = list(
                            lookupFunction(datasetType, registry, quantum.dataId, collections)
                        )
                    else:
                        refs = list(
                            registry.queryDatasets(
                                datasetType,
                                collections=collections,
                                dataId=quantum.dataId,
                                deduplicate=True,
                                expand=True,
                            )
                        )
                    quantum.prerequisites[datasetType].update({ref.dataId: ref for ref in refs})
            # Actually remove any quanta that we decided to skip above.
            if dataIdsToSkip:
                _LOG.debug("Pruning %d quanta for task with label '%s' because all of their outputs exist.",
                           len(dataIdsToSkip), task.taskDef.label)
                for dataId in dataIdsToSkip:
                    del task.quanta[dataId]

    def makeQuantumGraph(self):
        """Create a `QuantumGraph` from the quanta already present in
        the scaffolding data structure.

        Returns
        -------
        graph : `QuantumGraph`
            The full `QuantumGraph`.
        """
        graph = QuantumGraph(task.makeQuantumGraphTaskNodes() for task in self.tasks)
        graph.initInputs = self.initInputs.unpackSingleRefs()
        graph.initOutputs = self.initOutputs.unpackSingleRefs()
        graph.initIntermediates = self.initIntermediates.unpackSingleRefs()
        return graph


# ------------------------
# Exported definitions --
# ------------------------


class GraphBuilderError(Exception):
    """Base class for exceptions generated by graph builder.
    """
    pass


class OutputExistsError(GraphBuilderError):
    """Exception generated when output datasets already exist.
    """
    pass


class PrerequisiteMissingError(GraphBuilderError):
    """Exception generated when a prerequisite dataset does not exist.
    """
    pass


class GraphBuilder(object):
    """GraphBuilder class is responsible for building a task execution graph
    from a Pipeline.

    Parameters
    ----------
    registry : `~lsst.daf.butler.Registry`
        Data butler registry instance.
    skipExisting : `bool`, optional
        If `True` (default), a Quantum is not created if all its outputs
        already exist.
    """

    def __init__(self, registry, skipExisting=True):
        self.registry = registry
        self.dimensions = registry.dimensions
        self.skipExisting = skipExisting

    def makeGraph(self, pipeline, collections, run, userQuery):
        """Create execution graph for a pipeline.

        Parameters
        ----------
        pipeline : `Pipeline`
            Pipeline definition, task names/classes and their configs.
        collections : `lsst.daf.butler.CollectionSearch`
            Object representing the collections to search for input datasets.
        run : `str`, optional
            Name of the `~lsst.daf.butler.CollectionType.RUN` collection for
            output datasets, if it already exists.
        userQuery : `str`
            String which defines user-defined selection for registry; should
            be empty or `None` if there are no restrictions on data selection.

        Returns
        -------
        graph : `QuantumGraph`
            The constructed `QuantumGraph`.

        Raises
        ------
        UserExpressionError
            Raised when user expression cannot be parsed.
        OutputExistsError
            Raised when output datasets already exist.
        Exception
            Other exception types may be raised by underlying registry
            classes.
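
        Examples
        --------
        A minimal usage sketch; the repository path, pipeline file, collection
        names, and query string below are illustrative assumptions, and the
        exact way the ``pipeline`` and ``collections`` objects are built may
        differ between butler/pipe_base versions::

            from lsst.daf.butler import Butler, CollectionSearch
            from lsst.pipe.base import GraphBuilder, Pipeline

            butler = Butler("/path/to/repo")  # hypothetical data repository
            pipeline = Pipeline.fromFile("my_pipeline.yaml")  # hypothetical file
            builder = GraphBuilder(butler.registry, skipExisting=True)
            graph = builder.makeGraph(
                pipeline,
                collections=CollectionSearch.fromExpression("my/inputs"),
                run="u/someone/my-run",
                userQuery="visit = 12345",
            )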
        """
        scaffolding = _PipelineScaffolding(pipeline, registry=self.registry)
        scaffolding.connectDataIds(self.registry, collections, userQuery)
        scaffolding.resolveDatasetRefs(self.registry, collections, run, skipExisting=self.skipExisting)
        return scaffolding.makeQuantumGraph()