Coverage for python/lsst/pipe/base/graphBuilder.py: 19%
1# This file is part of pipe_base.
2#
3# Developed for the LSST Data Management System.
4# This product includes software developed by the LSST Project
5# (http://www.lsst.org).
6# See the COPYRIGHT file at the top-level directory of this distribution
7# for details of code ownership.
8#
9# This program is free software: you can redistribute it and/or modify
10# it under the terms of the GNU General Public License as published by
11# the Free Software Foundation, either version 3 of the License, or
12# (at your option) any later version.
13#
14# This program is distributed in the hope that it will be useful,
15# but WITHOUT ANY WARRANTY; without even the implied warranty of
16# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
17# GNU General Public License for more details.
18#
19# You should have received a copy of the GNU General Public License
20# along with this program. If not, see <http://www.gnu.org/licenses/>.
21from __future__ import annotations
23"""Module defining GraphBuilder class and related methods.
24"""
26__all__ = ["GraphBuilder"]
28# -------------------------------
29# Imports of standard modules --
30# -------------------------------
31import itertools
32import logging
33from collections import ChainMap
34from contextlib import contextmanager
35from dataclasses import dataclass
36from typing import Any, Dict, Iterable, Iterator, List, Mapping, Optional, Set
38from lsst.daf.butler import (
39 CollectionSearch,
40 CollectionType,
41 DataCoordinate,
42 DatasetRef,
43 DatasetType,
44 DimensionGraph,
45 DimensionUniverse,
46 NamedKeyDict,
47 Quantum,
48)
49from lsst.utils import doImport
51from ._datasetQueryConstraints import DatasetQueryConstraintVariant
52from ._status import NoWorkFound
54# -----------------------------
55# Imports for other modules --
56# -----------------------------
57from .connections import AdjustQuantumHelper, iterConnections
58from .graph import QuantumGraph
59from .pipeline import Pipeline, PipelineDatasetTypes, TaskDatasetTypes, TaskDef
61# ----------------------------------
62# Local non-exported definitions --
63# ----------------------------------
65_LOG = logging.getLogger(__name__)
68class _DatasetDict(NamedKeyDict[DatasetType, Dict[DataCoordinate, DatasetRef]]):
69 """A custom dictionary that maps `DatasetType` to a nested dictionary of
70 the known `DatasetRef` instances of that type.
72 Parameters
73 ----------
74 args
75 Positional arguments are forwarded to the `dict` constructor.
76 universe : `DimensionUniverse`
77 Universe of all possible dimensions.
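
    Notes
    -----
    A minimal usage sketch; ``dataset_type``, ``data_id``, and ``universe``
    are purely illustrative placeholders for objects the caller already has::

        datasets = _DatasetDict.fromDatasetTypes([dataset_type], universe=universe)
        datasets[dataset_type][data_id] = DatasetRef(dataset_type, data_id)
        (ref,) = datasets.extract(dataset_type, [data_id])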
78 """
80 def __init__(self, *args, universe: DimensionGraph):
81 super().__init__(*args)
82 self.universe = universe
84 @classmethod
85 def fromDatasetTypes(
86 cls, datasetTypes: Iterable[DatasetType], *, universe: DimensionUniverse
87 ) -> _DatasetDict:
88 """Construct a dictionary from a flat iterable of `DatasetType` keys.
90 Parameters
91 ----------
92 datasetTypes : `iterable` of `DatasetType`
93 DatasetTypes to use as keys for the dict. Values will be empty
94 dictionaries.
95 universe : `DimensionUniverse`
96 Universe of all possible dimensions.
98 Returns
99 -------
100 dictionary : `_DatasetDict`
101 A new `_DatasetDict` instance.
102 """
103 return cls({datasetType: {} for datasetType in datasetTypes}, universe=universe)
105 @classmethod
106 def fromSubset(
107 cls, datasetTypes: Iterable[DatasetType], first: _DatasetDict, *rest: _DatasetDict
108 ) -> _DatasetDict:
109 """Return a new dictionary by extracting items corresponding to the
110 given keys from one or more existing dictionaries.
112 Parameters
113 ----------
114 datasetTypes : `iterable` of `DatasetType`
115 DatasetTypes to use as keys for the dict. Values will be obtained
116 by lookups against ``first`` and ``rest``.
117 first : `_DatasetDict`
118 Another dictionary from which to extract values.
119 rest
120 Additional dictionaries from which to extract values.
122 Returns
123 -------
124 dictionary : `_DatasetDict`
125 A new dictionary instance.
126 """
127 combined = ChainMap(first, *rest)
128 return cls(
129 {datasetType: combined[datasetType] for datasetType in datasetTypes}, universe=first.universe
130 )
132 @property
133 def dimensions(self) -> DimensionGraph:
134 """The union of all dimensions used by all dataset types in this
135 dictionary, including implied dependencies (`DimensionGraph`).
136 """
137 base = self.universe.empty
138 if len(self) == 0:
139 return base
140 return base.union(*[datasetType.dimensions for datasetType in self.keys()])
142 def unpackSingleRefs(self) -> NamedKeyDict[DatasetType, DatasetRef]:
143 """Unpack nested single-element `DatasetRef` dicts into a new
144 mapping with `DatasetType` keys and `DatasetRef` values.
146        This method assumes that each nested dictionary contains exactly one
147        item, as is the case for all "init" datasets.
149 Returns
150 -------
151 dictionary : `NamedKeyDict`
152 Dictionary mapping `DatasetType` to `DatasetRef`, with both
153 `DatasetType` instances and string names usable as keys.
154 """
156 def getOne(refs: Dict[DataCoordinate, DatasetRef]) -> DatasetRef:
157 (ref,) = refs.values()
158 return ref
160 return NamedKeyDict({datasetType: getOne(refs) for datasetType, refs in self.items()})
162 def unpackMultiRefs(self) -> NamedKeyDict[DatasetType, List[DatasetRef]]:
163        """Unpack nested multi-element `DatasetRef` dicts into a new
164        mapping with `DatasetType` keys and `list` of `DatasetRef` values.
166 Returns
167 -------
168 dictionary : `NamedKeyDict`
169 Dictionary mapping `DatasetType` to `list` of `DatasetRef`, with
170 both `DatasetType` instances and string names usable as keys.
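
        A purely illustrative sketch, assuming ``datasets`` is a populated
        `_DatasetDict`::

            for datasetType, refs in datasets.unpackMultiRefs().items():
                print(datasetType.name, len(refs))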
171 """
172 return NamedKeyDict({datasetType: list(refs.values()) for datasetType, refs in self.items()})
174 def extract(self, datasetType: DatasetType, dataIds: Iterable[DataCoordinate]) -> Iterator[DatasetRef]:
175 """Iterate over the contained `DatasetRef` instances that match the
176 given `DatasetType` and data IDs.
178 Parameters
179 ----------
180 datasetType : `DatasetType`
181 Dataset type to match.
182 dataIds : `Iterable` [ `DataCoordinate` ]
183 Data IDs to match.
185 Returns
186 -------
187 refs : `Iterator` [ `DatasetRef` ]
188 DatasetRef instances for which ``ref.datasetType == datasetType``
189 and ``ref.dataId`` is in ``dataIds``.
190 """
191 refs = self[datasetType]
192 return (refs[dataId] for dataId in dataIds)
195class _QuantumScaffolding:
196 """Helper class aggregating information about a `Quantum`, used when
197 constructing a `QuantumGraph`.
199 See `_PipelineScaffolding` for a top-down description of the full
200 scaffolding data structure.
202 Parameters
203 ----------
204 task : _TaskScaffolding
205 Back-reference to the helper object for the `PipelineTask` this quantum
206 represents an execution of.
207 dataId : `DataCoordinate`
208 Data ID for this quantum.
209 """
211 def __init__(self, task: _TaskScaffolding, dataId: DataCoordinate):
212 self.task = task
213 self.dataId = dataId
214 self.inputs = _DatasetDict.fromDatasetTypes(task.inputs.keys(), universe=dataId.universe)
215 self.outputs = _DatasetDict.fromDatasetTypes(task.outputs.keys(), universe=dataId.universe)
216 self.prerequisites = _DatasetDict.fromDatasetTypes(
217 task.prerequisites.keys(), universe=dataId.universe
218 )
220 __slots__ = ("task", "dataId", "inputs", "outputs", "prerequisites")
222 def __repr__(self):
223 return f"_QuantumScaffolding(taskDef={self.task.taskDef}, dataId={self.dataId}, ...)"
225 task: _TaskScaffolding
226 """Back-reference to the helper object for the `PipelineTask` this quantum
227 represents an execution of.
228 """
230 dataId: DataCoordinate
231 """Data ID for this quantum.
232 """
234 inputs: _DatasetDict
235 """Nested dictionary containing `DatasetRef` inputs to this quantum.
237 This is initialized to map each `DatasetType` to an empty dictionary at
238 construction. Those nested dictionaries are populated (with data IDs as
239 keys) with unresolved `DatasetRef` instances in
240 `_PipelineScaffolding.connectDataIds`.
241 """
243 outputs: _DatasetDict
244    """Nested dictionary containing `DatasetRef` outputs of this quantum.
245 """
247 prerequisites: _DatasetDict
248 """Nested dictionary containing `DatasetRef` prerequisite inputs to this
249 quantum.
250 """
252 def makeQuantum(self) -> Quantum:
253 """Transform the scaffolding object into a true `Quantum` instance.
255 Returns
256 -------
257 quantum : `Quantum`
258 An actual `Quantum` instance.
259 """
260 allInputs = self.inputs.unpackMultiRefs()
261 allInputs.update(self.prerequisites.unpackMultiRefs())
262 # Give the task's Connections class an opportunity to remove some
263 # inputs, or complain if they are unacceptable.
264 # This will raise if one of the check conditions is not met, which is
265 # the intended behavior.
266        # If it raises NoWorkFound, there is a bug in the QG algorithm
267 # or the adjustQuantum is incorrectly trying to make a prerequisite
268 # input behave like a regular input; adjustQuantum should only raise
269 # NoWorkFound if a regular input is missing, and it shouldn't be
270 # possible for us to have generated ``self`` if that's true.
271 helper = AdjustQuantumHelper(inputs=allInputs, outputs=self.outputs.unpackMultiRefs())
272 helper.adjust_in_place(self.task.taskDef.connections, self.task.taskDef.label, self.dataId)
273 return Quantum(
274 taskName=self.task.taskDef.taskName,
275 taskClass=self.task.taskDef.taskClass,
276 dataId=self.dataId,
277 initInputs=self.task.initInputs.unpackSingleRefs(),
278 inputs=helper.inputs,
279 outputs=helper.outputs,
280 )
283@dataclass
284class _TaskScaffolding:
285 """Helper class aggregating information about a `PipelineTask`, used when
286 constructing a `QuantumGraph`.
288 See `_PipelineScaffolding` for a top-down description of the full
289 scaffolding data structure.
291 Parameters
292 ----------
293 taskDef : `TaskDef`
294 Data structure that identifies the task class and its config.
295 parent : `_PipelineScaffolding`
296 The parent data structure that will hold the instance being
297 constructed.
298 datasetTypes : `TaskDatasetTypes`
299 Data structure that categorizes the dataset types used by this task.
300 """
302 def __init__(self, taskDef: TaskDef, parent: _PipelineScaffolding, datasetTypes: TaskDatasetTypes):
303 universe = parent.dimensions.universe
304 self.taskDef = taskDef
305 self.dimensions = DimensionGraph(universe, names=taskDef.connections.dimensions)
306 assert self.dimensions.issubset(parent.dimensions)
307 # Initialize _DatasetDicts as subsets of the one or two
308 # corresponding dicts in the parent _PipelineScaffolding.
309 self.initInputs = _DatasetDict.fromSubset(
310 datasetTypes.initInputs, parent.initInputs, parent.initIntermediates
311 )
312 self.initOutputs = _DatasetDict.fromSubset(
313 datasetTypes.initOutputs, parent.initIntermediates, parent.initOutputs
314 )
315 self.inputs = _DatasetDict.fromSubset(datasetTypes.inputs, parent.inputs, parent.intermediates)
316 self.outputs = _DatasetDict.fromSubset(datasetTypes.outputs, parent.intermediates, parent.outputs)
317 self.prerequisites = _DatasetDict.fromSubset(datasetTypes.prerequisites, parent.prerequisites)
318 self.dataIds = set()
319 self.quanta = {}
321 def __repr__(self):
322 # Default dataclass-injected __repr__ gets caught in an infinite loop
323 # because of back-references.
324 return f"_TaskScaffolding(taskDef={self.taskDef}, ...)"
326 taskDef: TaskDef
327 """Data structure that identifies the task class and its config
328 (`TaskDef`).
329 """
331 dimensions: DimensionGraph
332 """The dimensions of a single `Quantum` of this task (`DimensionGraph`).
333 """
335 initInputs: _DatasetDict
336 """Dictionary containing information about datasets used to construct this
337 task (`_DatasetDict`).
338 """
340 initOutputs: _DatasetDict
341 """Dictionary containing information about datasets produced as a
342 side-effect of constructing this task (`_DatasetDict`).
343 """
345 inputs: _DatasetDict
346 """Dictionary containing information about datasets used as regular,
347 graph-constraining inputs to this task (`_DatasetDict`).
348 """
350 outputs: _DatasetDict
351 """Dictionary containing information about datasets produced by this task
352 (`_DatasetDict`).
353 """
355 prerequisites: _DatasetDict
356 """Dictionary containing information about input datasets that must be
357 present in the repository before any Pipeline containing this task is run
358 (`_DatasetDict`).
359 """
361 quanta: Dict[DataCoordinate, _QuantumScaffolding]
362 """Dictionary mapping data ID to a scaffolding object for the Quantum of
363 this task with that data ID.
364 """
366 def makeQuantumSet(self, unresolvedRefs: Optional[Set[DatasetRef]] = None) -> Set[Quantum]:
367        """Create a `set` of `Quantum` from the information in ``self``.

        Parameters
        ----------
        unresolvedRefs : `set` [ `DatasetRef` ], optional
            Input dataset references known to be unresolved; quanta whose
            inputs intersect this set are kept so that graph pruning can
            start from them later.

369        Returns
370        -------
371        nodes : `set` of `Quantum`
372 The `Quantum` elements corresponding to this task.
373 """
374 if unresolvedRefs is None:
375 unresolvedRefs = set()
376 outputs = set()
377 for q in self.quanta.values():
378 try:
379 tmpQuanta = q.makeQuantum()
380 outputs.add(tmpQuanta)
381 except (NoWorkFound, FileNotFoundError) as exc:
382 refs = itertools.chain.from_iterable(self.inputs.unpackMultiRefs().values())
383 if unresolvedRefs.intersection(refs):
384                    # This means it is a node that is known to be pruned
385                    # later and should be left in even though some follow-up
386                    # queries fail.  This allows the pruning to start from this
387                    # quantum with known issues, and to prune other nodes it
388                    # touches.
389 inputs = q.inputs.unpackMultiRefs()
390 inputs.update(q.prerequisites.unpackMultiRefs())
391 tmpQuantum = Quantum(
392 taskName=q.task.taskDef.taskName,
393 taskClass=q.task.taskDef.taskClass,
394 dataId=q.dataId,
395 initInputs=q.task.initInputs.unpackSingleRefs(),
396 inputs=inputs,
397 outputs=q.outputs.unpackMultiRefs(),
398 )
399 outputs.add(tmpQuantum)
400 else:
401 raise exc
402 return outputs
405@dataclass
406class _PipelineScaffolding:
407 """A helper data structure that organizes the information involved in
408 constructing a `QuantumGraph` for a `Pipeline`.
410 Parameters
411 ----------
412 pipeline : `Pipeline` or `Iterable` [ `TaskDef` ]
413 Sequence of tasks from which a graph is to be constructed. Must
414 have nested task classes already imported.
415 universe : `DimensionUniverse`
416 Universe of all possible dimensions.
418 Notes
419 -----
420 The scaffolding data structure contains nested data structures for both
421 tasks (`_TaskScaffolding`) and datasets (`_DatasetDict`). The dataset
422 data structures are shared between the pipeline-level structure (which
423 aggregates all datasets and categorizes them from the perspective of the
424 complete pipeline) and the individual tasks that use them as inputs and
425 outputs.
427 `QuantumGraph` construction proceeds in four steps, with each corresponding
428 to a different `_PipelineScaffolding` method:
430 1. When `_PipelineScaffolding` is constructed, we extract and categorize
431 the DatasetTypes used by the pipeline (delegating to
432 `PipelineDatasetTypes.fromPipeline`), then use these to construct the
433 nested `_TaskScaffolding` and `_DatasetDict` objects.
435 2. In `connectDataIds`, we construct and run the "Big Join Query", which
436 returns related tuples of all dimensions used to identify any regular
437 input, output, and intermediate datasets (not prerequisites). We then
438 iterate over these tuples of related dimensions, identifying the subsets
439 that correspond to distinct data IDs for each task and dataset type,
440 and then create `_QuantumScaffolding` objects.
442 3. In `resolveDatasetRefs`, we run follow-up queries against all of the
443 dataset data IDs previously identified, transforming unresolved
444 DatasetRefs into resolved DatasetRefs where appropriate. We then look
445 up prerequisite datasets for all quanta.
447 4. In `makeQuantumGraph`, we construct a `QuantumGraph` from the lists of
448 per-task `_QuantumScaffolding` objects.
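
    A minimal sketch of how these steps are typically driven, mirroring
    `GraphBuilder.makeGraph`; ``registry``, ``collections``, ``run``,
    ``userQuery``, and ``externalDataId`` are assumed to be supplied by the
    caller::

        scaffolding = _PipelineScaffolding(pipeline, registry=registry)
        with scaffolding.connectDataIds(
            registry, collections, userQuery, externalDataId
        ) as commonDataIds:
            scaffolding.resolveDatasetRefs(registry, collections, run, commonDataIds)
            qgraph = scaffolding.makeQuantumGraph()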
449 """
451 def __init__(self, pipeline, *, registry):
452 _LOG.debug("Initializing data structures for QuantumGraph generation.")
453 self.tasks = []
454 # Aggregate and categorize the DatasetTypes in the Pipeline.
455 datasetTypes = PipelineDatasetTypes.fromPipeline(pipeline, registry=registry)
456 # Construct dictionaries that map those DatasetTypes to structures
457        # that will (later) hold additional information about them.
458 for attr in (
459 "initInputs",
460 "initIntermediates",
461 "initOutputs",
462 "inputs",
463 "intermediates",
464 "outputs",
465 "prerequisites",
466 ):
467 setattr(
468 self,
469 attr,
470 _DatasetDict.fromDatasetTypes(getattr(datasetTypes, attr), universe=registry.dimensions),
471 )
472 # Aggregate all dimensions for all non-init, non-prerequisite
473 # DatasetTypes. These are the ones we'll include in the big join
474 # query.
475 self.dimensions = self.inputs.dimensions.union(self.intermediates.dimensions, self.outputs.dimensions)
476 # Construct scaffolding nodes for each Task, and add backreferences
477 # to the Task from each DatasetScaffolding node.
478 # Note that there's only one scaffolding node for each DatasetType,
479 # shared by _PipelineScaffolding and all _TaskScaffoldings that
480 # reference it.
481 if isinstance(pipeline, Pipeline):
482 pipeline = pipeline.toExpandedPipeline()
483 self.tasks = [
484 _TaskScaffolding(taskDef=taskDef, parent=self, datasetTypes=taskDatasetTypes)
485 for taskDef, taskDatasetTypes in zip(pipeline, datasetTypes.byTask.values())
486 ]
488 def __repr__(self):
489 # Default dataclass-injected __repr__ gets caught in an infinite loop
490 # because of back-references.
491 return f"_PipelineScaffolding(tasks={self.tasks}, ...)"
493 tasks: List[_TaskScaffolding]
494 """Scaffolding data structures for each task in the pipeline
495 (`list` of `_TaskScaffolding`).
496 """
498 initInputs: _DatasetDict
499 """Datasets consumed but not produced when constructing the tasks in this
500 pipeline (`_DatasetDict`).
501 """
503 initIntermediates: _DatasetDict
504 """Datasets that are both consumed and produced when constructing the tasks
505 in this pipeline (`_DatasetDict`).
506 """
508 initOutputs: _DatasetDict
509 """Datasets produced but not consumed when constructing the tasks in this
510 pipeline (`_DatasetDict`).
511 """
513 inputs: _DatasetDict
514 """Datasets that are consumed but not produced when running this pipeline
515 (`_DatasetDict`).
516 """
518 intermediates: _DatasetDict
519 """Datasets that are both produced and consumed when running this pipeline
520 (`_DatasetDict`).
521 """
523 outputs: _DatasetDict
524    """Datasets produced but not consumed when running this pipeline
525 (`_DatasetDict`).
526 """
528 prerequisites: _DatasetDict
529 """Datasets that are consumed when running this pipeline and looked up
530 per-Quantum when generating the graph (`_DatasetDict`).
531 """
533 dimensions: DimensionGraph
534 """All dimensions used by any regular input, intermediate, or output
535    (not prerequisite) dataset; the set of dimensions used in the "Big Join
536 Query" (`DimensionGraph`).
538 This is required to be a superset of all task quantum dimensions.
539 """
541 @contextmanager
542 def connectDataIds(
543 self,
544 registry,
545 collections,
546 userQuery,
547 externalDataId,
548 datasetQueryConstraint: DatasetQueryConstraintVariant = DatasetQueryConstraintVariant.ALL,
549 ):
550 """Query for the data IDs that connect nodes in the `QuantumGraph`.
552 This method populates `_TaskScaffolding.dataIds` and
553 `_DatasetScaffolding.dataIds` (except for those in `prerequisites`).
555 Parameters
556 ----------
557 registry : `lsst.daf.butler.Registry`
558 Registry for the data repository; used for all data ID queries.
559 collections
560 Expressions representing the collections to search for input
561 datasets. May be any of the types accepted by
562 `lsst.daf.butler.CollectionSearch.fromExpression`.
563 userQuery : `str` or `None`
564 User-provided expression to limit the data IDs processed.
565 externalDataId : `DataCoordinate`
566 Externally-provided data ID that should be used to restrict the
567 results, just as if these constraints had been included via ``AND``
568 in ``userQuery``. This includes (at least) any instrument named
569 in the pipeline definition.
570 datasetQueryConstraint : `DatasetQueryConstraintVariant`, optional
571            The query constraint variant that should be used to constrain the
572            query based on dataset existence, defaults to
573 `DatasetQueryConstraintVariant.ALL`.
575 Returns
576 -------
577 commonDataIds : \
578 `lsst.daf.butler.registry.queries.DataCoordinateQueryResults`
579 An interface to a database temporary table containing all data IDs
580 that will appear in this `QuantumGraph`. Returned inside a
581 context manager, which will drop the temporary table at the end of
582 the `with` block in which this method is called.
583 """
584 _LOG.debug("Building query for data IDs.")
585 # Initialization datasets always have empty data IDs.
586 emptyDataId = DataCoordinate.makeEmpty(registry.dimensions)
587 for datasetType, refs in itertools.chain(
588 self.initInputs.items(), self.initIntermediates.items(), self.initOutputs.items()
589 ):
590 refs[emptyDataId] = DatasetRef(datasetType, emptyDataId)
591 # Run one big query for the data IDs for task dimensions and regular
592 # inputs and outputs. We limit the query to only dimensions that are
593 # associated with the input dataset types, but don't (yet) try to
594 # obtain the dataset_ids for those inputs.
595 _LOG.debug("Submitting data ID query and materializing results.")
596 queryArgs = {"dimensions": self.dimensions, "where": userQuery, "dataId": externalDataId}
597 if datasetQueryConstraint == DatasetQueryConstraintVariant.ALL:
598 _LOG.debug("Constraining graph query using all datasets in pipeline.")
599 queryArgs["datasets"] = list(self.inputs)
600 queryArgs["collections"] = collections
601 elif datasetQueryConstraint == DatasetQueryConstraintVariant.OFF:
602 _LOG.debug("Not using dataset existence to constrain query.")
603 elif datasetQueryConstraint == DatasetQueryConstraintVariant.LIST:
604 constraint = set(datasetQueryConstraint)
605 inputs = {k.name: k for k in self.inputs.keys()}
606 if remainder := constraint.difference(inputs.keys()):
607 raise ValueError(
608 f"{remainder} dataset type(s) specified as a graph constraint, but"
609 f" do not appear as an input to the specified pipeline: {inputs.keys()}"
610 )
611 _LOG.debug(f"Constraining graph query using {constraint}")
612 queryArgs["datasets"] = [typ for name, typ in inputs.items() if name in constraint]
613 queryArgs["collections"] = collections
614 else:
615 raise ValueError(
616 f"Unable to handle type {datasetQueryConstraint} given as datasetQueryConstraint."
617 )
619 with registry.queryDataIds(**queryArgs).materialize() as commonDataIds:
620 _LOG.debug("Expanding data IDs.")
621 commonDataIds = commonDataIds.expanded()
622 _LOG.debug("Iterating over query results to associate quanta with datasets.")
623 # Iterate over query results, populating data IDs for datasets and
624 # quanta and then connecting them to each other.
625 n = -1
626 for n, commonDataId in enumerate(commonDataIds):
627 # Create DatasetRefs for all DatasetTypes from this result row,
628 # noting that we might have created some already.
629 # We remember both those that already existed and those that we
630 # create now.
631 refsForRow = {}
632 dataIdCacheForRow: Mapping[DimensionGraph, DataCoordinate] = {}
633 for datasetType, refs in itertools.chain(
634 self.inputs.items(), self.intermediates.items(), self.outputs.items()
635 ):
636 if not (datasetDataId := dataIdCacheForRow.get(datasetType.dimensions)):
637 datasetDataId = commonDataId.subset(datasetType.dimensions)
638 dataIdCacheForRow[datasetType.dimensions] = datasetDataId
639 ref = refs.get(datasetDataId)
640 if ref is None:
641 ref = DatasetRef(datasetType, datasetDataId)
642 refs[datasetDataId] = ref
643 refsForRow[datasetType.name] = ref
644 # Create _QuantumScaffolding objects for all tasks from this
645 # result row, noting that we might have created some already.
646 for task in self.tasks:
647 quantumDataId = commonDataId.subset(task.dimensions)
648 quantum = task.quanta.get(quantumDataId)
649 if quantum is None:
650 quantum = _QuantumScaffolding(task=task, dataId=quantumDataId)
651 task.quanta[quantumDataId] = quantum
652 # Whether this is a new quantum or an existing one, we can
653 # now associate the DatasetRefs for this row with it. The
654 # fact that a Quantum data ID and a dataset data ID both
655 # came from the same result row is what tells us they
656 # should be associated.
657                        # Many of these associations will be duplicates (because
658                        # another query row that differed from this one only in
659                        # irrelevant dimensions already added them), and we use
660                        # sets to skip them.
661 for datasetType in task.inputs:
662 ref = refsForRow[datasetType.name]
663 quantum.inputs[datasetType.name][ref.dataId] = ref
664 for datasetType in task.outputs:
665 ref = refsForRow[datasetType.name]
666 quantum.outputs[datasetType.name][ref.dataId] = ref
667 if n < 0:
668 emptiness_explained = False
669 for message in commonDataIds.explain_no_results():
670 _LOG.warning(message)
671 emptiness_explained = True
672 if not emptiness_explained:
673 _LOG.warning(
674 "To reproduce this query for debugging purposes, run "
675 "Registry.queryDataIds with these arguments:"
676 )
677 # We could just repr() the queryArgs dict to get something
678 # the user could make sense of, but it's friendlier to
679 # put these args in an easier-to-construct equivalent form
680 # so they can read it more easily and copy and paste into
681 # a Python terminal.
682 _LOG.warning(" dimensions=%s,", list(queryArgs["dimensions"].names))
683 _LOG.warning(" dataId=%s,", queryArgs["dataId"].byName())
684 if queryArgs["where"]:
685 _LOG.warning(" where=%s,", repr(queryArgs["where"]))
686 if "datasets" in queryArgs:
687 _LOG.warning(" datasets=%s,", [t.name for t in queryArgs["datasets"]])
688 if "collections" in queryArgs:
689 _LOG.warning(" collections=%s,", list(queryArgs["collections"]))
690 _LOG.debug("Finished processing %d rows from data ID query.", n)
691 yield commonDataIds
693 def resolveDatasetRefs(
694 self,
695 registry,
696 collections,
697 run,
698 commonDataIds,
699 *,
700 skipExistingIn=None,
701 clobberOutputs=True,
702 constrainedByAllDatasets: bool = True,
703 ):
704        """Perform follow-up queries for each dataset data ID produced in
705        `connectDataIds`.
707 This method populates `_DatasetScaffolding.refs` (except for those in
708 `prerequisites`).
710 Parameters
711 ----------
712 registry : `lsst.daf.butler.Registry`
713 Registry for the data repository; used for all data ID queries.
714 collections
715 Expressions representing the collections to search for input
716 datasets. May be any of the types accepted by
717 `lsst.daf.butler.CollectionSearch.fromExpression`.
718 run : `str`, optional
719 Name of the `~lsst.daf.butler.CollectionType.RUN` collection for
720 output datasets, if it already exists.
721 commonDataIds : \
722 `lsst.daf.butler.registry.queries.DataCoordinateQueryResults`
723 Result of a previous call to `connectDataIds`.
724 skipExistingIn
725 Expressions representing the collections to search for existing
726 output datasets that should be skipped. May be any of the types
727 accepted by `lsst.daf.butler.CollectionSearch.fromExpression`.
728 `None` or empty string/sequence disables skipping.
729 clobberOutputs : `bool`, optional
730            If `True` (default), allow quanta to be created even if outputs exist;
731            this requires the same behavior to be enabled when
732 executing. If ``skipExistingIn`` is not `None`, completed quanta
733 (those with metadata, or all outputs if there is no metadata
734 dataset configured) will be skipped rather than clobbered.
735 constrainedByAllDatasets : `bool`, optional
736            Indicates whether ``commonDataIds`` was generated with a constraint
737            on all dataset types.
739 Raises
740 ------
741 OutputExistsError
742 Raised if an output dataset already exists in the output run
743            and ``skipExistingIn`` does not include the output run, or if only
744 some outputs are present and ``clobberOutputs`` is `False`.
745 """
746 skipCollections: Optional[CollectionSearch] = None
747 skipExistingInRun = False
748 if skipExistingIn:
749 skipCollections = CollectionSearch.fromExpression(skipExistingIn)
750 if run:
751                # As an optimization, check the explicit list of names first.
752 skipExistingInRun = run in skipCollections.explicitNames()
753 if not skipExistingInRun:
754                    # Need to flatten the expression and check again.
755 skipExistingInRun = run in registry.queryCollections(
756 skipExistingIn,
757 collectionTypes=CollectionType.RUN,
758 )
760 # Look up [init] intermediate and output datasets in the output
761 # collection, if there is an output collection.
762 if run is not None or skipCollections is not None:
763 for datasetType, refs in itertools.chain(
764 self.initIntermediates.items(),
765 self.initOutputs.items(),
766 self.intermediates.items(),
767 self.outputs.items(),
768 ):
769 _LOG.debug(
770 "Resolving %d datasets for intermediate and/or output dataset %s.",
771 len(refs),
772 datasetType.name,
773 )
774 isInit = datasetType in self.initIntermediates or datasetType in self.initOutputs
775 subset = commonDataIds.subset(datasetType.dimensions, unique=True)
777 # look at RUN collection first
778 if run is not None:
779 resolvedRefQueryResults = subset.findDatasets(
780 datasetType, collections=run, findFirst=True
781 )
782 for resolvedRef in resolvedRefQueryResults:
783 # TODO: we could easily support per-DatasetType
784 # skipExisting and I could imagine that being useful -
785 # it's probably required in order to support writing
786 # initOutputs before QuantumGraph generation.
787 assert resolvedRef.dataId in refs
788 if not (skipExistingInRun or isInit or clobberOutputs):
789 raise OutputExistsError(
790 f"Output dataset {datasetType.name} already exists in "
791 f"output RUN collection '{run}' with data ID"
792 f" {resolvedRef.dataId}."
793 )
795                # Also check skipExistingIn; the case where the RUN collection
796                # is part of it is handled above.
797 if skipCollections is not None:
798 resolvedRefQueryResults = subset.findDatasets(
799 datasetType, collections=skipCollections, findFirst=True
800 )
801 for resolvedRef in resolvedRefQueryResults:
802 assert resolvedRef.dataId in refs
803 refs[resolvedRef.dataId] = resolvedRef
805 # Look up input and initInput datasets in the input collection(s).
806        # Container to accumulate unfound refs, in case the common data IDs were
807        # not constrained on dataset type existence.
808 self.unfoundRefs = set()
809 for datasetType, refs in itertools.chain(self.initInputs.items(), self.inputs.items()):
810 _LOG.debug("Resolving %d datasets for input dataset %s.", len(refs), datasetType.name)
811 resolvedRefQueryResults = commonDataIds.subset(datasetType.dimensions, unique=True).findDatasets(
812 datasetType, collections=collections, findFirst=True
813 )
814 dataIdsNotFoundYet = set(refs.keys())
815 for resolvedRef in resolvedRefQueryResults:
816 dataIdsNotFoundYet.discard(resolvedRef.dataId)
817 refs[resolvedRef.dataId] = resolvedRef
818 if dataIdsNotFoundYet:
819 if constrainedByAllDatasets:
820 raise RuntimeError(
821 f"{len(dataIdsNotFoundYet)} dataset(s) of type "
822 f"'{datasetType.name}' was/were present in a previous "
823                    f"query, but could not be found now. "
824 f"This is either a logic bug in QuantumGraph generation "
825 f"or the input collections have been modified since "
826 f"QuantumGraph generation began."
827 )
828 else:
829                # If the common dataIds were not constrained using all the
830                # input dataset types, it is possible that some data IDs
831                # found don't correspond to existing datasets and they
832                # will remain unresolved.  Mark these for later pruning from
833                # the quantum graph.
834 for k in dataIdsNotFoundYet:
835 self.unfoundRefs.add(refs[k])
837 # Copy the resolved DatasetRefs to the _QuantumScaffolding objects,
838 # replacing the unresolved refs there, and then look up prerequisites.
839 for task in self.tasks:
840 _LOG.debug(
841 "Applying resolutions and finding prerequisites for %d quanta of task with label '%s'.",
842 len(task.quanta),
843 task.taskDef.label,
844 )
845 lookupFunctions = {
846 c.name: c.lookupFunction
847 for c in iterConnections(task.taskDef.connections, "prerequisiteInputs")
848 if c.lookupFunction is not None
849 }
850 dataIdsFailed = []
851 dataIdsSucceeded = []
852 for quantum in task.quanta.values():
853                # Process output datasets only if skipExistingIn is not None
854 # or there is a run to look for outputs in and clobberOutputs
855 # is True. Note that if skipExistingIn is None, any output
856 # datasets that already exist would have already caused an
857 # exception to be raised. We never update the DatasetRefs in
858 # the quantum because those should never be resolved.
859 if skipCollections is not None or (run is not None and clobberOutputs):
860 resolvedRefs = []
861 unresolvedRefs = []
862 haveMetadata = False
863 for datasetType, originalRefs in quantum.outputs.items():
864 for ref in task.outputs.extract(datasetType, originalRefs.keys()):
865 if ref.id is not None:
866 resolvedRefs.append(ref)
867 if datasetType.name == task.taskDef.metadataDatasetName:
868 haveMetadata = True
869 else:
870 unresolvedRefs.append(ref)
871 if resolvedRefs:
872 if haveMetadata or not unresolvedRefs:
873 dataIdsSucceeded.append(quantum.dataId)
874 if skipCollections is not None:
875 continue
876 else:
877 dataIdsFailed.append(quantum.dataId)
878 if not clobberOutputs:
879 raise OutputExistsError(
880 f"Quantum {quantum.dataId} of task with label "
881 f"'{quantum.task.taskDef.label}' has some outputs that exist "
882 f"({resolvedRefs}) "
883 f"and others that don't ({unresolvedRefs}), with no metadata output, "
884 "and clobbering outputs was not enabled."
885 )
886 # Update the input DatasetRefs to the resolved ones we already
887 # searched for.
888 for datasetType, refs in quantum.inputs.items():
889 for ref in task.inputs.extract(datasetType, refs.keys()):
890 refs[ref.dataId] = ref
891 # Look up prerequisite datasets in the input collection(s).
892 # These may have dimensions that extend beyond those we queried
893 # for originally, because we want to permit those data ID
894 # values to differ across quanta and dataset types.
895 for datasetType in task.prerequisites:
896 lookupFunction = lookupFunctions.get(datasetType.name)
897 if lookupFunction is not None:
898 # PipelineTask has provided its own function to do the
899 # lookup. This always takes precedence.
900 refs = list(lookupFunction(datasetType, registry, quantum.dataId, collections))
901 elif (
902 datasetType.isCalibration()
903 and datasetType.dimensions <= quantum.dataId.graph
904 and quantum.dataId.graph.temporal
905 ):
906 # This is a master calibration lookup, which we have to
907 # handle specially because the query system can't do a
908 # temporal join on a non-dimension-based timespan yet.
909 timespan = quantum.dataId.timespan
910 try:
911 refs = [
912 registry.findDataset(
913 datasetType, quantum.dataId, collections=collections, timespan=timespan
914 )
915 ]
916 except KeyError:
917 # This dataset type is not present in the registry,
918 # which just means there are no datasets here.
919 refs = []
920 else:
921 # Most general case.
922 refs = list(
923 registry.queryDatasets(
924 datasetType, collections=collections, dataId=quantum.dataId, findFirst=True
925 ).expanded()
926 )
927 quantum.prerequisites[datasetType].update(
928 {ref.dataId: ref for ref in refs if ref is not None}
929 )
930 # Actually remove any quanta that we decided to skip above.
931 if dataIdsSucceeded:
932 if skipCollections is not None:
933 _LOG.debug(
934                        "Pruning %d successful quanta for task with label '%s' because all of their "
935 "outputs exist or metadata was written successfully.",
936 len(dataIdsSucceeded),
937 task.taskDef.label,
938 )
939 for dataId in dataIdsSucceeded:
940 del task.quanta[dataId]
941 elif clobberOutputs:
942 _LOG.info(
943 "Found %d successful quanta for task with label '%s' "
944 "that will need to be clobbered during execution.",
945 len(dataIdsSucceeded),
946 task.taskDef.label,
947 )
948 else:
949 raise AssertionError("OutputExistsError should have already been raised.")
950 if dataIdsFailed:
951 if clobberOutputs:
952 _LOG.info(
953 "Found %d failed/incomplete quanta for task with label '%s' "
954 "that will need to be clobbered during execution.",
955 len(dataIdsFailed),
956 task.taskDef.label,
957 )
958 else:
959 raise AssertionError("OutputExistsError should have already been raised.")
961 def makeQuantumGraph(self, metadata: Optional[Mapping[str, Any]] = None):
962 """Create a `QuantumGraph` from the quanta already present in
963 the scaffolding data structure.
965 Parameters
966        ----------
967 metadata : Optional Mapping of `str` to primitives
968            Optional extra data to carry with the graph.  Entries in this
969            mapping should be serializable to JSON.
972 Returns
973 -------
974 graph : `QuantumGraph`
975 The full `QuantumGraph`.
976 """
977 graphInput: Dict[TaskDef, Set[Quantum]] = {}
978 for task in self.tasks:
979 qset = task.makeQuantumSet(unresolvedRefs=self.unfoundRefs)
980 graphInput[task.taskDef] = qset
982 graph = QuantumGraph(graphInput, metadata=metadata, pruneRefs=self.unfoundRefs)
983 return graph
986# ------------------------
987# Exported definitions --
988# ------------------------
991class GraphBuilderError(Exception):
992 """Base class for exceptions generated by graph builder."""
994 pass
997class OutputExistsError(GraphBuilderError):
998 """Exception generated when output datasets already exist."""
1000 pass
1003class PrerequisiteMissingError(GraphBuilderError):
1004 """Exception generated when a prerequisite dataset does not exist."""
1006 pass
1009class GraphBuilder(object):
1010    """GraphBuilder class is responsible for building a task execution graph
1011    from a Pipeline.
1013 Parameters
1014 ----------
1015 registry : `~lsst.daf.butler.Registry`
1016        Registry for the data repository.
1017 skipExistingIn
1018 Expressions representing the collections to search for existing
1019 output datasets that should be skipped. May be any of the types
1020 accepted by `lsst.daf.butler.CollectionSearch.fromExpression`.
1021 clobberOutputs : `bool`, optional
1022        If `True` (default), allow quanta to be created even if partial outputs
1023        exist; this requires the same behavior to be enabled when
1024        executing.
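
    A minimal usage sketch; ``butler`` is assumed to be an existing
    `~lsst.daf.butler.Butler`, and ``pipeline``, ``collections``, ``run``,
    and ``query`` to be supplied by the caller::

        builder = GraphBuilder(butler.registry, clobberOutputs=True)
        qgraph = builder.makeGraph(pipeline, collections, run, query)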
1025 """
1027 def __init__(self, registry, skipExistingIn=None, clobberOutputs=True):
1028 self.registry = registry
1029 self.dimensions = registry.dimensions
1030 self.skipExistingIn = skipExistingIn
1031 self.clobberOutputs = clobberOutputs
1033 def makeGraph(
1034 self,
1035 pipeline,
1036 collections,
1037 run,
1038 userQuery,
1039 datasetQueryConstraint: DatasetQueryConstraintVariant = DatasetQueryConstraintVariant.ALL,
1040 metadata: Optional[Mapping[str, Any]] = None,
1041 ):
1042 """Create execution graph for a pipeline.
1044 Parameters
1045 ----------
1046 pipeline : `Pipeline` or `Iterable` [ `TaskDef` ]
1047 Pipeline definition, task names/classes and their configs.
1048 collections
1049 Expressions representing the collections to search for input
1050 datasets. May be any of the types accepted by
1051 `lsst.daf.butler.CollectionSearch.fromExpression`.
1052 run : `str`, optional
1053 Name of the `~lsst.daf.butler.CollectionType.RUN` collection for
1054 output datasets, if it already exists.
1055 userQuery : `str`
1056            String that defines the user-provided data selection for the registry;
1057            should be empty or `None` if there are no restrictions on data selection.
1058 datasetQueryConstraint : `DatasetQueryConstraintVariant`, optional
1059            The query constraint variant that should be used to constrain the
1060            query based on dataset existence, defaults to
1061 `DatasetQueryConstraintVariant.ALL`.
1062 metadata : Optional Mapping of `str` to primitives
1063            Optional extra data to carry with the graph.  Entries in this
1064            mapping should be serializable to JSON.
1067 Returns
1068 -------
1069        graph : `QuantumGraph`
            The constructed `QuantumGraph`.
1071 Raises
1072 ------
1073 UserExpressionError
1074 Raised when user expression cannot be parsed.
1075 OutputExistsError
1076 Raised when output datasets already exist.
1077 Exception
1078 Other exceptions types may be raised by underlying registry
1079 classes.
1080 """
1081 scaffolding = _PipelineScaffolding(pipeline, registry=self.registry)
1082 if not collections and (scaffolding.initInputs or scaffolding.inputs or scaffolding.prerequisites):
1083 raise ValueError("Pipeline requires input datasets but no input collections provided.")
1084 instrument = None
1085 if isinstance(pipeline, Pipeline):
1086 instrument = pipeline.getInstrument()
1087 if isinstance(instrument, str):
1088 instrument = doImport(instrument)
1089 pipeline = list(pipeline.toExpandedPipeline())
1090 if instrument is not None:
1091 dataId = DataCoordinate.standardize(
1092 instrument=instrument.getName(), universe=self.registry.dimensions
1093 )
1094 else:
1095 dataId = DataCoordinate.makeEmpty(self.registry.dimensions)
1096 with scaffolding.connectDataIds(
1097 self.registry, collections, userQuery, dataId, datasetQueryConstraint
1098 ) as commonDataIds:
1099 condition = datasetQueryConstraint == DatasetQueryConstraintVariant.ALL
1100 scaffolding.resolveDatasetRefs(
1101 self.registry,
1102 collections,
1103 run,
1104 commonDataIds,
1105 skipExistingIn=self.skipExistingIn,
1106 clobberOutputs=self.clobberOutputs,
1107 constrainedByAllDatasets=condition,
1108 )
1109 return scaffolding.makeQuantumGraph(metadata=metadata)