Coverage for python/lsst/pipe/base/graphBuilder.py: 16% (464 statements)
1# This file is part of pipe_base.
2#
3# Developed for the LSST Data Management System.
4# This product includes software developed by the LSST Project
5# (http://www.lsst.org).
6# See the COPYRIGHT file at the top-level directory of this distribution
7# for details of code ownership.
8#
9# This program is free software: you can redistribute it and/or modify
10# it under the terms of the GNU General Public License as published by
11# the Free Software Foundation, either version 3 of the License, or
12# (at your option) any later version.
13#
14# This program is distributed in the hope that it will be useful,
15# but WITHOUT ANY WARRANTY; without even the implied warranty of
16# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
17# GNU General Public License for more details.
18#
19# You should have received a copy of the GNU General Public License
20# along with this program. If not, see <http://www.gnu.org/licenses/>.
21from __future__ import annotations
23"""Module defining GraphBuilder class and related methods.
24"""
26__all__ = ["GraphBuilder"]
28# -------------------------------
29# Imports of standard modules --
30# -------------------------------
31import itertools
32import logging
33from collections import ChainMap
34from contextlib import contextmanager
35from dataclasses import dataclass
36from typing import Any, Collection, Dict, Iterable, Iterator, List, Mapping, Optional, Set, Tuple, Union
38from lsst.daf.butler import (
39 CollectionType,
40 DataCoordinate,
41 DatasetIdGenEnum,
42 DatasetRef,
43 DatasetType,
44 Datastore,
45 DatastoreRecordData,
46 DimensionGraph,
47 DimensionUniverse,
48 NamedKeyDict,
49 NamedValueSet,
50 Quantum,
51 Registry,
52)
53from lsst.daf.butler.registry import MissingDatasetTypeError
54from lsst.daf.butler.registry.queries import DataCoordinateQueryResults
55from lsst.daf.butler.registry.wildcards import CollectionWildcard
56from lsst.utils import doImportType
58from ._datasetQueryConstraints import DatasetQueryConstraintVariant
59from ._status import NoWorkFound
61# -----------------------------
62# Imports for other modules --
63# -----------------------------
64from .connections import AdjustQuantumHelper, iterConnections
65from .graph import QuantumGraph
66from .pipeline import Pipeline, PipelineDatasetTypes, TaskDatasetTypes, TaskDef
68# ----------------------------------
69# Local non-exported definitions --
70# ----------------------------------
72_LOG = logging.getLogger(__name__)
75class _DatasetDict(NamedKeyDict[DatasetType, Dict[DataCoordinate, DatasetRef]]):
76 """A custom dictionary that maps `DatasetType` to a nested dictionary of
77 the known `DatasetRef` instances of that type.
79 Parameters
80 ----------
81 args
82 Positional arguments are forwarded to the `dict` constructor.
83 universe : `DimensionUniverse`
84 Universe of all possible dimensions.
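
    Notes
    -----
    Conceptually the nested structure is (an illustrative sketch, not taken
    from a real repository)::

        {dataset_type: {data_id: dataset_ref, ...}, ...}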
85 """
87 def __init__(self, *args: Any, universe: DimensionUniverse):
88 super().__init__(*args)
89 self.universe = universe
91 @classmethod
92 def fromDatasetTypes(
93 cls, datasetTypes: Iterable[DatasetType], *, universe: DimensionUniverse
94 ) -> _DatasetDict:
95 """Construct a dictionary from a flat iterable of `DatasetType` keys.
97 Parameters
98 ----------
99 datasetTypes : `iterable` of `DatasetType`
100 DatasetTypes to use as keys for the dict. Values will be empty
101 dictionaries.
102 universe : `DimensionUniverse`
103 Universe of all possible dimensions.
105 Returns
106 -------
107 dictionary : `_DatasetDict`
108 A new `_DatasetDict` instance.
109 """
110 return cls({datasetType: {} for datasetType in datasetTypes}, universe=universe)
112 @classmethod
113 def fromSubset(
114 cls, datasetTypes: Collection[DatasetType], first: _DatasetDict, *rest: _DatasetDict
115 ) -> _DatasetDict:
116 """Return a new dictionary by extracting items corresponding to the
117 given keys from one or more existing dictionaries.
119 Parameters
120 ----------
121 datasetTypes : `iterable` of `DatasetType`
122 DatasetTypes to use as keys for the dict. Values will be obtained
123 by lookups against ``first`` and ``rest``.
124 first : `_DatasetDict`
125 Another dictionary from which to extract values.
126 rest
127 Additional dictionaries from which to extract values.
129 Returns
130 -------
131 dictionary : `_DatasetDict`
132 A new dictionary instance.
133 """
134 combined = ChainMap(first, *rest)
136 # Dataset types known to match immediately can be processed
137 # without checks.
138 matches = combined.keys() & set(datasetTypes)
139 _dict = {k: combined[k] for k in matches}
141 if len(_dict) < len(datasetTypes):
142 # Work out which ones are missing.
143 missing_datasetTypes = set(datasetTypes) - _dict.keys()
145 # Get the known names for comparison.
146 combined_by_name = {k.name: k for k in combined}
148 missing = set()
149 incompatible = {}
150 for datasetType in missing_datasetTypes:
151 # The dataset type is not found. It may not be listed
152 # at all, or it may be present with the same name
153 # but a different definition.
154 if datasetType.name in combined_by_name:
155 # This implies some inconsistency in definitions
156 # for connections. If there is support for storage
157 # class conversion we can let it slide.
158 # At this point we do not know
159 # where the inconsistency is, but trust that
160 # downstream code will be more explicit about input
161 # vs output incompatibilities.
162 existing = combined_by_name[datasetType.name]
163 if existing.is_compatible_with(datasetType) or datasetType.is_compatible_with(existing):
164 _LOG.warning(
165 "Dataset type mismatch (%s != %s) but continuing since they are compatible",
166 datasetType,
167 existing,
168 )
169 _dict[datasetType] = combined[existing]
170 else:
171 incompatible[datasetType] = existing
172 else:
173 missing.add(datasetType)
175 if missing or incompatible:
176 reasons = []
177 if missing:
178 reasons.append(
179 "DatasetTypes {'.'.join(missing)} not present in list of known types: "
180 + ", ".join(d.name for d in combined)
181 )
182 if incompatible:
183 for x, y in incompatible.items():
184 reasons.append(f"{x} incompatible with {y}")
185 raise KeyError("Errors matching dataset types: " + " & ".join(reasons))
187 return cls(_dict, universe=first.universe)
189 @property
190 def dimensions(self) -> DimensionGraph:
191 """The union of all dimensions used by all dataset types in this
192 dictionary, including implied dependencies (`DimensionGraph`).
193 """
194 base = self.universe.empty
195 if len(self) == 0:
196 return base
197 return base.union(*[datasetType.dimensions for datasetType in self.keys()])
199 def unpackSingleRefs(self) -> NamedKeyDict[DatasetType, DatasetRef]:
200 """Unpack nested single-element `DatasetRef` dicts into a new
201 mapping with `DatasetType` keys and `DatasetRef` values.
203 This method assumes that each nested dictionary contains exactly one
204 item, as is the case for all "init" datasets.
206 Returns
207 -------
208 dictionary : `NamedKeyDict`
209 Dictionary mapping `DatasetType` to `DatasetRef`, with both
210 `DatasetType` instances and string names usable as keys.
211 """
213 def getOne(refs: Dict[DataCoordinate, DatasetRef]) -> DatasetRef:
214 (ref,) = refs.values()
215 return ref
217 return NamedKeyDict({datasetType: getOne(refs) for datasetType, refs in self.items()})
219 def unpackMultiRefs(self) -> NamedKeyDict[DatasetType, List[DatasetRef]]:
220 """Unpack nested multi-element `DatasetRef` dicts into a new
221 mapping with `DatasetType` keys and `list` of `DatasetRef` values.
223 Returns
224 -------
225 dictionary : `NamedKeyDict`
226 Dictionary mapping `DatasetType` to `list` of `DatasetRef`, with
227 both `DatasetType` instances and string names usable as keys.
228 """
229 return NamedKeyDict({datasetType: list(refs.values()) for datasetType, refs in self.items()})
231 def extract(self, datasetType: DatasetType, dataIds: Iterable[DataCoordinate]) -> Iterator[DatasetRef]:
232 """Iterate over the contained `DatasetRef` instances that match the
233 given `DatasetType` and data IDs.
235 Parameters
236 ----------
237 datasetType : `DatasetType`
238 Dataset type to match.
239 dataIds : `Iterable` [ `DataCoordinate` ]
240 Data IDs to match.
242 Returns
243 -------
244 refs : `Iterator` [ `DatasetRef` ]
245 DatasetRef instances for which ``ref.datasetType == datasetType``
246 and ``ref.dataId`` is in ``dataIds``.
247 """
248 refs = self[datasetType]
249 return (refs[dataId] for dataId in dataIds)
252class _QuantumScaffolding:
253 """Helper class aggregating information about a `Quantum`, used when
254 constructing a `QuantumGraph`.
256 See `_PipelineScaffolding` for a top-down description of the full
257 scaffolding data structure.
259 Parameters
260 ----------
261 task : _TaskScaffolding
262 Back-reference to the helper object for the `PipelineTask` this quantum
263 represents an execution of.
264 dataId : `DataCoordinate`
265 Data ID for this quantum.
266 """
268 def __init__(self, task: _TaskScaffolding, dataId: DataCoordinate):
269 self.task = task
270 self.dataId = dataId
271 self.inputs = _DatasetDict.fromDatasetTypes(task.inputs.keys(), universe=dataId.universe)
272 self.outputs = _DatasetDict.fromDatasetTypes(task.outputs.keys(), universe=dataId.universe)
273 self.prerequisites = _DatasetDict.fromDatasetTypes(
274 task.prerequisites.keys(), universe=dataId.universe
275 )
277 __slots__ = ("task", "dataId", "inputs", "outputs", "prerequisites")
279 def __repr__(self) -> str:
280 return f"_QuantumScaffolding(taskDef={self.task.taskDef}, dataId={self.dataId}, ...)"
282 task: _TaskScaffolding
283 """Back-reference to the helper object for the `PipelineTask` this quantum
284 represents an execution of.
285 """
287 dataId: DataCoordinate
288 """Data ID for this quantum.
289 """
291 inputs: _DatasetDict
292 """Nested dictionary containing `DatasetRef` inputs to this quantum.
294 This is initialized to map each `DatasetType` to an empty dictionary at
295 construction. Those nested dictionaries are populated (with data IDs as
296 keys) with unresolved `DatasetRef` instances in
297 `_PipelineScaffolding.connectDataIds`.
298 """
300 outputs: _DatasetDict
301 """Nested dictionary containing `DatasetRef` outputs this quantum.
302 """
304 prerequisites: _DatasetDict
305 """Nested dictionary containing `DatasetRef` prerequisite inputs to this
306 quantum.
307 """
309 def makeQuantum(self, datastore_records: Optional[Mapping[str, DatastoreRecordData]] = None) -> Quantum:
310 """Transform the scaffolding object into a true `Quantum` instance.
312 Parameters
313 ----------
314 datastore_records : `dict` [ `str`, `DatastoreRecordData` ], optional
315 If not `None` then fill datastore records in each generated Quantum
316 using the records from this structure.
318 Returns
319 -------
320 quantum : `Quantum`
321 An actual `Quantum` instance.
322 """
323 allInputs = self.inputs.unpackMultiRefs()
324 allInputs.update(self.prerequisites.unpackMultiRefs())
325 # Give the task's Connections class an opportunity to remove some
326 # inputs, or complain if they are unacceptable.
327 # This will raise if one of the check conditions is not met, which is
328 # the intended behavior.
329 # If it raises NoWorkFound, there is a bug in the QG algorithm
330 # or the adjustQuantum is incorrectly trying to make a prerequisite
331 # input behave like a regular input; adjustQuantum should only raise
332 # NoWorkFound if a regular input is missing, and it shouldn't be
333 # possible for us to have generated ``self`` if that's true.
334 helper = AdjustQuantumHelper(inputs=allInputs, outputs=self.outputs.unpackMultiRefs())
335 helper.adjust_in_place(self.task.taskDef.connections, self.task.taskDef.label, self.dataId)
336 initInputs = self.task.initInputs.unpackSingleRefs()
337 quantum_records: Optional[Mapping[str, DatastoreRecordData]] = None
338 if datastore_records is not None:
339 quantum_records = {}
340 input_refs = list(itertools.chain.from_iterable(helper.inputs.values()))
341 input_refs += list(initInputs.values())
342 input_ids = set(ref.id for ref in input_refs if ref.id is not None)
343 for datastore_name, records in datastore_records.items():
344 matching_records = records.subset(input_ids)
345 if matching_records is not None:
346 quantum_records[datastore_name] = matching_records
347 return Quantum(
348 taskName=self.task.taskDef.taskName,
349 taskClass=self.task.taskDef.taskClass,
350 dataId=self.dataId,
351 initInputs=initInputs,
352 inputs=helper.inputs,
353 outputs=helper.outputs,
354 datastore_records=quantum_records,
355 )
358@dataclass
359class _TaskScaffolding:
360 """Helper class aggregating information about a `PipelineTask`, used when
361 constructing a `QuantumGraph`.
363 See `_PipelineScaffolding` for a top-down description of the full
364 scaffolding data structure.
366 Parameters
367 ----------
368 taskDef : `TaskDef`
369 Data structure that identifies the task class and its config.
370 parent : `_PipelineScaffolding`
371 The parent data structure that will hold the instance being
372 constructed.
373 datasetTypes : `TaskDatasetTypes`
374 Data structure that categorizes the dataset types used by this task.
375 """
377 def __init__(self, taskDef: TaskDef, parent: _PipelineScaffolding, datasetTypes: TaskDatasetTypes):
378 universe = parent.dimensions.universe
379 self.taskDef = taskDef
380 self.dimensions = DimensionGraph(universe, names=taskDef.connections.dimensions)
381 assert self.dimensions.issubset(parent.dimensions)
382 # Initialize _DatasetDicts as subsets of the one or two
383 # corresponding dicts in the parent _PipelineScaffolding.
384 self.initInputs = _DatasetDict.fromSubset(
385 datasetTypes.initInputs, parent.initInputs, parent.initIntermediates
386 )
387 self.initOutputs = _DatasetDict.fromSubset(
388 datasetTypes.initOutputs, parent.initIntermediates, parent.initOutputs
389 )
390 self.inputs = _DatasetDict.fromSubset(datasetTypes.inputs, parent.inputs, parent.intermediates)
391 self.outputs = _DatasetDict.fromSubset(datasetTypes.outputs, parent.intermediates, parent.outputs)
392 self.prerequisites = _DatasetDict.fromSubset(datasetTypes.prerequisites, parent.prerequisites)
393 self.dataIds: Set[DataCoordinate] = set()
394 self.quanta = {}
396 def __repr__(self) -> str:
397 # Default dataclass-injected __repr__ gets caught in an infinite loop
398 # because of back-references.
399 return f"_TaskScaffolding(taskDef={self.taskDef}, ...)"
401 taskDef: TaskDef
402 """Data structure that identifies the task class and its config
403 (`TaskDef`).
404 """
406 dimensions: DimensionGraph
407 """The dimensions of a single `Quantum` of this task (`DimensionGraph`).
408 """
410 initInputs: _DatasetDict
411 """Dictionary containing information about datasets used to construct this
412 task (`_DatasetDict`).
413 """
415 initOutputs: _DatasetDict
416 """Dictionary containing information about datasets produced as a
417 side-effect of constructing this task (`_DatasetDict`).
418 """
420 inputs: _DatasetDict
421 """Dictionary containing information about datasets used as regular,
422 graph-constraining inputs to this task (`_DatasetDict`).
423 """
425 outputs: _DatasetDict
426 """Dictionary containing information about datasets produced by this task
427 (`_DatasetDict`).
428 """
430 prerequisites: _DatasetDict
431 """Dictionary containing information about input datasets that must be
432 present in the repository before any Pipeline containing this task is run
433 (`_DatasetDict`).
434 """
436 quanta: Dict[DataCoordinate, _QuantumScaffolding]
437 """Dictionary mapping data ID to a scaffolding object for the Quantum of
438 this task with that data ID.
439 """
441 def makeQuantumSet(
442 self,
443 unresolvedRefs: Optional[Set[DatasetRef]] = None,
444 datastore_records: Optional[Mapping[str, DatastoreRecordData]] = None,
445 ) -> Set[Quantum]:
446 """Create a `set` of `Quantum` from the information in ``self``.
448 Parameters
449 ----------
450 unresolvedRefs : `set` [ `DatasetRef` ], optional
451 Input dataset refs that have not been found.
452 datastore_records : `dict` [ `str`, `DatastoreRecordData` ], optional
453 If not `None`, datastore records indexed by datastore name; these
454 are forwarded to `_QuantumScaffolding.makeQuantum`.
455 Returns
456 -------
457 nodes : `set` of `Quantum`
458 The `Quantum` elements corresponding to this task.
459 """
460 if unresolvedRefs is None:
461 unresolvedRefs = set()
462 outputs = set()
463 for q in self.quanta.values():
464 try:
465 tmpQuantum = q.makeQuantum(datastore_records)
466 outputs.add(tmpQuantum)
467 except (NoWorkFound, FileNotFoundError) as exc:
468 refs = itertools.chain.from_iterable(self.inputs.unpackMultiRefs().values())
469 if unresolvedRefs.intersection(refs):
470 # This means it is a node that is known to be pruned
471 # later and should be left in even though some follow-up
472 # queries fail. This allows the pruning to start from this
473 # quantum with known issues, and prune other nodes it
474 # touches.
475 inputs = q.inputs.unpackMultiRefs()
476 inputs.update(q.prerequisites.unpackMultiRefs())
477 tmpQuantum = Quantum(
478 taskName=q.task.taskDef.taskName,
479 taskClass=q.task.taskDef.taskClass,
480 dataId=q.dataId,
481 initInputs=q.task.initInputs.unpackSingleRefs(),
482 inputs=inputs,
483 outputs=q.outputs.unpackMultiRefs(),
484 )
485 outputs.add(tmpQuantum)
486 else:
487 raise exc
488 return outputs
491class _DatasetIdMaker:
492 """Helper class which generates random dataset UUIDs for unresolved
493 datasets.
494 """
496 def __init__(self, registry: Registry, run: str):
497 self.datasetIdFactory = registry.datasetIdFactory
498 self.run = run
499 # Dataset IDs generated so far
500 self.resolved: Dict[Tuple[DatasetType, DataCoordinate], DatasetRef] = {}
502 def resolveRef(self, ref: DatasetRef) -> DatasetRef:
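        """Resolve a single reference, reusing any dataset ID previously
        generated for the same dataset type and data ID.
        """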
503 if ref.id is not None:
504 return ref
505 key = ref.datasetType, ref.dataId
506 if (resolved := self.resolved.get(key)) is None:
507 datasetId = self.datasetIdFactory.makeDatasetId(
508 self.run, ref.datasetType, ref.dataId, DatasetIdGenEnum.UNIQUE
509 )
510 resolved = ref.resolved(datasetId, self.run)
511 self.resolved[key] = resolved
512 return resolved
514 def resolveDict(self, refs: Dict[DataCoordinate, DatasetRef]) -> Dict[DataCoordinate, DatasetRef]:
515 """Resolve all unresolved references in the provided dictionary."""
516 return {dataId: self.resolveRef(ref) for dataId, ref in refs.items()}
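
# A hedged usage sketch for _DatasetIdMaker (the run name and ref variables
# below are illustrative placeholders, not values from this package):
#
#     maker = _DatasetIdMaker(registry, run="u/someone/test-run")
#     resolved = maker.resolveRef(unresolved_ref)  # assigns a UUID in that run
#     again = maker.resolveRef(unresolved_ref)     # cache hit: same resolved ref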
519@dataclass
520class _PipelineScaffolding:
521 """A helper data structure that organizes the information involved in
522 constructing a `QuantumGraph` for a `Pipeline`.
524 Parameters
525 ----------
526 pipeline : `Pipeline` or `Iterable` [ `TaskDef` ]
527 Sequence of tasks from which a graph is to be constructed. Must
528 have nested task classes already imported.
529 universe : `DimensionUniverse`
530 Universe of all possible dimensions.
532 Notes
533 -----
534 The scaffolding data structure contains nested data structures for both
535 tasks (`_TaskScaffolding`) and datasets (`_DatasetDict`). The dataset
536 data structures are shared between the pipeline-level structure (which
537 aggregates all datasets and categorizes them from the perspective of the
538 complete pipeline) and the individual tasks that use them as inputs and
539 outputs.
541 `QuantumGraph` construction proceeds in four steps, with each corresponding
542 to a different `_PipelineScaffolding` method:
544 1. When `_PipelineScaffolding` is constructed, we extract and categorize
545 the DatasetTypes used by the pipeline (delegating to
546 `PipelineDatasetTypes.fromPipeline`), then use these to construct the
547 nested `_TaskScaffolding` and `_DatasetDict` objects.
549 2. In `connectDataIds`, we construct and run the "Big Join Query", which
550 returns related tuples of all dimensions used to identify any regular
551 input, output, and intermediate datasets (not prerequisites). We then
552 iterate over these tuples of related dimensions, identifying the subsets
553 that correspond to distinct data IDs for each task and dataset type,
554 and then create `_QuantumScaffolding` objects.
556 3. In `resolveDatasetRefs`, we run follow-up queries against all of the
557 dataset data IDs previously identified, transforming unresolved
558 DatasetRefs into resolved DatasetRefs where appropriate. We then look
559 up prerequisite datasets for all quanta.
561 4. In `makeQuantumGraph`, we construct a `QuantumGraph` from the lists of
562 per-task `_QuantumScaffolding` objects.
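
    Examples
    --------
    A condensed sketch of how these steps are driven (mirroring
    `GraphBuilder.makeGraph`; ``registry``, ``collections``, ``run``,
    ``userQuery``, and ``dataId`` are assumed to be defined already)::

        scaffolding = _PipelineScaffolding(pipeline, registry=registry)
        with scaffolding.connectDataIds(
            registry, collections, userQuery, dataId
        ) as commonDataIds:
            scaffolding.resolveDatasetRefs(registry, collections, run, commonDataIds)
            graph = scaffolding.makeQuantumGraph()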
563 """
565 def __init__(self, pipeline: Union[Pipeline, Iterable[TaskDef]], *, registry: Registry):
566 _LOG.debug("Initializing data structures for QuantumGraph generation.")
567 self.tasks = []
568 # Aggregate and categorize the DatasetTypes in the Pipeline.
569 datasetTypes = PipelineDatasetTypes.fromPipeline(pipeline, registry=registry)
570 # Construct dictionaries that map those DatasetTypes to structures
571 # that will (later) hold additional information about them.
572 for attr in (
573 "initInputs",
574 "initIntermediates",
575 "initOutputs",
576 "inputs",
577 "intermediates",
578 "outputs",
579 "prerequisites",
580 ):
581 setattr(
582 self,
583 attr,
584 _DatasetDict.fromDatasetTypes(getattr(datasetTypes, attr), universe=registry.dimensions),
585 )
586 self.defaultDatasetQueryConstraints = datasetTypes.queryConstraints
587 # Aggregate all dimensions for all non-init, non-prerequisite
588 # DatasetTypes. These are the ones we'll include in the big join
589 # query.
590 self.dimensions = self.inputs.dimensions.union(self.intermediates.dimensions, self.outputs.dimensions)
591 # Construct scaffolding nodes for each Task, and add backreferences
592 # to the Task from each DatasetScaffolding node.
593 # Note that there's only one scaffolding node for each DatasetType,
594 # shared by _PipelineScaffolding and all _TaskScaffoldings that
595 # reference it.
596 if isinstance(pipeline, Pipeline):
597 pipeline = pipeline.toExpandedPipeline()
598 self.tasks = [
599 _TaskScaffolding(taskDef=taskDef, parent=self, datasetTypes=taskDatasetTypes)
600 for taskDef, taskDatasetTypes in zip(pipeline, datasetTypes.byTask.values())
601 ]
603 def __repr__(self) -> str:
604 # Default dataclass-injected __repr__ gets caught in an infinite loop
605 # because of back-references.
606 return f"_PipelineScaffolding(tasks={self.tasks}, ...)"
608 tasks: List[_TaskScaffolding]
609 """Scaffolding data structures for each task in the pipeline
610 (`list` of `_TaskScaffolding`).
611 """
613 initInputs: _DatasetDict
614 """Datasets consumed but not produced when constructing the tasks in this
615 pipeline (`_DatasetDict`).
616 """
618 initIntermediates: _DatasetDict
619 """Datasets that are both consumed and produced when constructing the tasks
620 in this pipeline (`_DatasetDict`).
621 """
623 initOutputs: _DatasetDict
624 """Datasets produced but not consumed when constructing the tasks in this
625 pipeline (`_DatasetDict`).
626 """
628 inputs: _DatasetDict
629 """Datasets that are consumed but not produced when running this pipeline
630 (`_DatasetDict`).
631 """
633 intermediates: _DatasetDict
634 """Datasets that are both produced and consumed when running this pipeline
635 (`_DatasetDict`).
636 """
638 outputs: _DatasetDict
639 """Datasets produced but not consumed when when running this pipeline
640 (`_DatasetDict`).
641 """
643 prerequisites: _DatasetDict
644 """Datasets that are consumed when running this pipeline and looked up
645 per-Quantum when generating the graph (`_DatasetDict`).
646 """
648 defaultDatasetQueryConstraints: NamedValueSet[DatasetType]
649 """Datasets that should be used as constraints in the initial query,
650 according to tasks (`NamedValueSet`).
651 """
653 dimensions: DimensionGraph
654 """All dimensions used by any regular input, intermediate, or output
655 (not prerequisite) dataset; the set of dimensions used in the "Big Join
656 Query" (`DimensionGraph`).
658 This is required to be a superset of all task quantum dimensions.
659 """
661 @contextmanager
662 def connectDataIds(
663 self,
664 registry: Registry,
665 collections: Any,
666 userQuery: Optional[str],
667 externalDataId: DataCoordinate,
668 datasetQueryConstraint: DatasetQueryConstraintVariant = DatasetQueryConstraintVariant.ALL,
669 bind: Optional[Mapping[str, Any]] = None,
670 ) -> Iterator[DataCoordinateQueryResults]:
671 """Query for the data IDs that connect nodes in the `QuantumGraph`.
673 This method populates `_TaskScaffolding.dataIds` and
674 `_DatasetScaffolding.dataIds` (except for those in `prerequisites`).
676 Parameters
677 ----------
678 registry : `lsst.daf.butler.Registry`
679 Registry for the data repository; used for all data ID queries.
680 collections
681 Expressions representing the collections to search for input
682 datasets. See :ref:`daf_butler_ordered_collection_searches`.
683 userQuery : `str` or `None`
684 User-provided expression to limit the data IDs processed.
685 externalDataId : `DataCoordinate`
686 Externally-provided data ID that should be used to restrict the
687 results, just as if these constraints had been included via ``AND``
688 in ``userQuery``. This includes (at least) any instrument named
689 in the pipeline definition.
690 datasetQueryConstraint : `DatasetQueryConstraintVariant`, optional
691 The query constraint variant that should be used to constrain the
692 query based on dataset existence, defaults to
693 `DatasetQueryConstraintVariant.ALL`.
694 bind : `Mapping`, optional
695 Mapping containing literal values that should be injected into the
696 ``userQuery`` expression, keyed by the identifiers they replace.
698 Returns
699 -------
700 commonDataIds : \
701 `lsst.daf.butler.registry.queries.DataCoordinateQueryResults`
702 An interface to a database temporary table containing all data IDs
703 that will appear in this `QuantumGraph`. Returned inside a
704 context manager, which will drop the temporary table at the end of
705 the `with` block in which this method is called.
706 """
707 _LOG.debug("Building query for data IDs.")
708 # Initialization datasets always have empty data IDs.
709 emptyDataId = DataCoordinate.makeEmpty(registry.dimensions)
710 for datasetType, refs in itertools.chain(
711 self.initInputs.items(), self.initIntermediates.items(), self.initOutputs.items()
712 ):
713 refs[emptyDataId] = DatasetRef(datasetType, emptyDataId)
714 # Run one big query for the data IDs for task dimensions and regular
715 # inputs and outputs. We limit the query to only dimensions that are
716 # associated with the input dataset types, but don't (yet) try to
717 # obtain the dataset_ids for those inputs.
718 _LOG.debug(
719 "Submitting data ID query over dimensions %s and materializing results.",
720 list(self.dimensions.names),
721 )
722 queryArgs: Dict[str, Any] = {
723 "dimensions": self.dimensions,
724 "where": userQuery,
725 "dataId": externalDataId,
726 "bind": bind,
727 }
728 if datasetQueryConstraint == DatasetQueryConstraintVariant.ALL:
729 _LOG.debug(
730 "Constraining graph query using default of %s.",
731 list(self.defaultDatasetQueryConstraints.names),
732 )
733 queryArgs["datasets"] = list(self.defaultDatasetQueryConstraints)
734 queryArgs["collections"] = collections
735 elif datasetQueryConstraint == DatasetQueryConstraintVariant.OFF:
736 _LOG.debug("Not using dataset existence to constrain query.")
737 elif datasetQueryConstraint == DatasetQueryConstraintVariant.LIST:
738 constraint = set(datasetQueryConstraint)
739 inputs = {k.name: k for k in self.inputs.keys()}
740 if remainder := constraint.difference(inputs.keys()):
741 raise ValueError(
742 f"{remainder} dataset type(s) specified as a graph constraint, but"
743 f" do not appear as an input to the specified pipeline: {inputs.keys()}"
744 )
745 _LOG.debug(f"Constraining graph query using {constraint}")
746 queryArgs["datasets"] = [typ for name, typ in inputs.items() if name in constraint]
747 queryArgs["collections"] = collections
748 else:
749 raise ValueError(
750 f"Unable to handle type {datasetQueryConstraint} given as datasetQueryConstraint."
751 )
753 if "datasets" in queryArgs:
754 for i, dataset_type in enumerate(queryArgs["datasets"]):
755 if dataset_type.isComponent():
756 queryArgs["datasets"][i] = dataset_type.makeCompositeDatasetType()
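        # At this point queryArgs describes the "Big Join Query"; it is
        # roughly equivalent to a direct call such as (dimension names here
        # are illustrative only):
        #
        #     registry.queryDataIds(
        #         ["instrument", "visit", "detector"],
        #         where=userQuery,
        #         dataId=externalDataId,
        #         bind=bind,
        #         datasets=...,      # only when constraining on datasets
        #         collections=...,   # ditto
        #     )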
758 with registry.queryDataIds(**queryArgs).materialize() as commonDataIds:
759 _LOG.debug("Expanding data IDs.")
760 commonDataIds = commonDataIds.expanded()
761 _LOG.debug("Iterating over query results to associate quanta with datasets.")
762 # Iterate over query results, populating data IDs for datasets and
763 # quanta and then connecting them to each other.
764 n = -1
765 for n, commonDataId in enumerate(commonDataIds):
766 # Create DatasetRefs for all DatasetTypes from this result row,
767 # noting that we might have created some already.
768 # We remember both those that already existed and those that we
769 # create now.
770 refsForRow = {}
771 dataIdCacheForRow: Dict[DimensionGraph, DataCoordinate] = {}
772 for datasetType, refs in itertools.chain(
773 self.inputs.items(), self.intermediates.items(), self.outputs.items()
774 ):
775 datasetDataId: Optional[DataCoordinate]
776 if (datasetDataId := dataIdCacheForRow.get(datasetType.dimensions)) is None:
777 datasetDataId = commonDataId.subset(datasetType.dimensions)
778 dataIdCacheForRow[datasetType.dimensions] = datasetDataId
779 ref = refs.get(datasetDataId)
780 if ref is None:
781 ref = DatasetRef(datasetType, datasetDataId)
782 refs[datasetDataId] = ref
783 refsForRow[datasetType.name] = ref
784 # Create _QuantumScaffolding objects for all tasks from this
785 # result row, noting that we might have created some already.
786 for task in self.tasks:
787 quantumDataId = commonDataId.subset(task.dimensions)
788 quantum = task.quanta.get(quantumDataId)
789 if quantum is None:
790 quantum = _QuantumScaffolding(task=task, dataId=quantumDataId)
791 task.quanta[quantumDataId] = quantum
792 # Whether this is a new quantum or an existing one, we can
793 # now associate the DatasetRefs for this row with it. The
794 # fact that a Quantum data ID and a dataset data ID both
795 # came from the same result row is what tells us they
796 # should be associated.
797 # Many of these associations will be duplicates (because
798 # another query row that differed from this one only in
799 # irrelevant dimensions already added them); duplicates
800 # simply overwrite the same dictionary entry.
801 for datasetType in task.inputs:
802 ref = refsForRow[datasetType.name]
803 quantum.inputs[datasetType.name][ref.dataId] = ref
804 for datasetType in task.outputs:
805 ref = refsForRow[datasetType.name]
806 quantum.outputs[datasetType.name][ref.dataId] = ref
807 if n < 0:
808 _LOG.critical("Initial data ID query returned no rows, so QuantumGraph will be empty.")
809 emptiness_explained = False
810 for message in commonDataIds.explain_no_results():
811 _LOG.critical(message)
812 emptiness_explained = True
813 if not emptiness_explained:
814 _LOG.critical(
815 "To reproduce this query for debugging purposes, run "
816 "Registry.queryDataIds with these arguments:"
817 )
818 # We could just repr() the queryArgs dict to get something
819 # the user could make sense of, but it's friendlier to
820 # put these args in an easier-to-construct equivalent form
821 # so they can read it more easily and copy and paste into
822 # a Python terminal.
823 _LOG.critical(" dimensions=%s,", list(queryArgs["dimensions"].names))
824 _LOG.critical(" dataId=%s,", queryArgs["dataId"].byName())
825 if queryArgs["where"]:
826 _LOG.critical(" where=%s,", repr(queryArgs["where"]))
827 if "datasets" in queryArgs:
828 _LOG.critical(" datasets=%s,", [t.name for t in queryArgs["datasets"]])
829 if "collections" in queryArgs:
830 _LOG.critical(" collections=%s,", list(queryArgs["collections"]))
831 _LOG.debug("Finished processing %d rows from data ID query.", n)
832 yield commonDataIds
834 def resolveDatasetRefs(
835 self,
836 registry: Registry,
837 collections: Any,
838 run: Optional[str],
839 commonDataIds: DataCoordinateQueryResults,
840 *,
841 skipExistingIn: Any = None,
842 clobberOutputs: bool = True,
843 constrainedByAllDatasets: bool = True,
844 resolveRefs: bool = False,
845 ) -> None:
846 """Perform follow up queries for each dataset data ID produced in
847 `fillDataIds`.
849 This method populates `_DatasetScaffolding.refs` (except for those in
850 `prerequisites`).
852 Parameters
853 ----------
854 registry : `lsst.daf.butler.Registry`
855 Registry for the data repository; used for all data ID queries.
856 collections
857 Expressions representing the collections to search for input
858 datasets. See :ref:`daf_butler_ordered_collection_searches`.
859 run : `str`, optional
860 Name of the `~lsst.daf.butler.CollectionType.RUN` collection for
861 output datasets, if it already exists.
862 commonDataIds : \
863 `lsst.daf.butler.registry.queries.DataCoordinateQueryResults`
864 Result of a previous call to `connectDataIds`.
865 skipExistingIn
866 Expressions representing the collections to search for existing
867 output datasets that should be skipped. See
868 :ref:`daf_butler_ordered_collection_searches` for allowed types.
869 `None` or empty string/sequence disables skipping.
870 clobberOutputs : `bool`, optional
871 If `True` (default), allow quanta to be created even if outputs
872 exist; this requires the same behavior to be enabled when
873 executing. If ``skipExistingIn`` is not `None`, completed quanta
874 (those with metadata, or all outputs if there is no metadata
875 dataset configured) will be skipped rather than clobbered.
876 constrainedByAllDatasets : `bool`, optional
877 Indicates if the commonDataIds were generated with a constraint on
878 all dataset types.
879 resolveRefs : `bool`, optional
880 If `True` then resolve all input references and generate random
881 dataset IDs for all output and intermediate datasets. A `True` value
882 requires the ``run`` collection to be specified.
884 Raises
885 ------
886 OutputExistsError
887 Raised if an output dataset already exists in the output run
888 and ``skipExistingIn`` does not include output run, or if only
889 some outputs are present and ``clobberOutputs`` is `False`.
890 """
891 skip_collections_wildcard: CollectionWildcard | None = None
892 skipExistingInRun = False
893 if skipExistingIn:
894 skip_collections_wildcard = CollectionWildcard.from_expression(skipExistingIn)
895 if run:
896 # As an optimization, check the explicit list of names first.
897 skipExistingInRun = run in skip_collections_wildcard.strings
898 if not skipExistingInRun:
899 # need to flatten it and check again
900 skipExistingInRun = run in registry.queryCollections(
901 skipExistingIn,
902 collectionTypes=CollectionType.RUN,
903 )
905 idMaker: Optional[_DatasetIdMaker] = None
906 if resolveRefs:
907 assert run is not None, "run cannot be None when resolveRefs is True"
908 idMaker = _DatasetIdMaker(registry, run)
910 resolvedRefQueryResults: Iterable[DatasetRef]
912 # Updating constrainedByAllDatasets here is not ideal, but we have a
913 # few different code paths that each transfer different pieces of
914 # information about what dataset query constraints were applied here,
915 # and none of them has the complete picture until we get here. We're
916 # long overdue for a QG generation rewrite that will make this go away
917 # entirely anyway.
918 constrainedByAllDatasets = (
919 constrainedByAllDatasets and self.defaultDatasetQueryConstraints == self.inputs.keys()
920 )
922 # Look up [init] intermediate and output datasets in the output
923 # collection, if there is an output collection.
924 if run is not None or skip_collections_wildcard is not None:
925 for datasetType, refs in itertools.chain(
926 self.initIntermediates.items(),
927 self.initOutputs.items(),
928 self.intermediates.items(),
929 self.outputs.items(),
930 ):
931 _LOG.debug(
932 "Resolving %d datasets for intermediate and/or output dataset %s.",
933 len(refs),
934 datasetType.name,
935 )
936 isInit = datasetType in self.initIntermediates or datasetType in self.initOutputs
937 subset = commonDataIds.subset(datasetType.dimensions, unique=True)
938 # TODO: this assert incorrectly bans component inputs;
939 # investigate on DM-33027.
940 # assert not datasetType.isComponent(), \
941 # "Output datasets cannot be components."
942 #
943 # Instead we have to handle them manually to avoid a
944 # deprecation warning, but it is at least confusing and
945 # possibly a bug for components to appear here at all.
946 if datasetType.isComponent():
947 parent_dataset_type = datasetType.makeCompositeDatasetType()
948 component = datasetType.component()
949 else:
950 parent_dataset_type = datasetType
951 component = None
953 # look at RUN collection first
954 if run is not None:
955 try:
956 resolvedRefQueryResults = subset.findDatasets(
957 parent_dataset_type, collections=run, findFirst=True
958 )
959 except MissingDatasetTypeError:
960 resolvedRefQueryResults = []
961 for resolvedRef in resolvedRefQueryResults:
962 # TODO: we could easily support per-DatasetType
963 # skipExisting and I could imagine that being useful -
964 # it's probably required in order to support writing
965 # initOutputs before QuantumGraph generation.
966 assert resolvedRef.dataId in refs
967 if not (skipExistingInRun or isInit or clobberOutputs):
968 raise OutputExistsError(
969 f"Output dataset {datasetType.name} already exists in "
970 f"output RUN collection '{run}' with data ID"
971 f" {resolvedRef.dataId}."
972 )
973 # If we are going to resolve all outputs then we have
974 # to remember existing ones to avoid generating new
975 # dataset IDs for them.
976 if resolveRefs:
977 refs[resolvedRef.dataId] = (
978 resolvedRef.makeComponentRef(component)
979 if component is not None
980 else resolvedRef
981 )
983 # And check skipExistingIn too; if the RUN collection is in
984 # it, that case was handled above.
985 if skip_collections_wildcard is not None:
986 try:
987 resolvedRefQueryResults = subset.findDatasets(
988 parent_dataset_type, collections=skip_collections_wildcard, findFirst=True
989 )
990 except MissingDatasetTypeError:
991 resolvedRefQueryResults = []
992 for resolvedRef in resolvedRefQueryResults:
993 assert resolvedRef.dataId in refs
994 refs[resolvedRef.dataId] = (
995 resolvedRef.makeComponentRef(component) if component is not None else resolvedRef
996 )
998 # Look up input and initInput datasets in the input collection(s).
999 # Container to accumulate unfound refs, if the common data IDs were not
1000 # constrained on dataset type existence.
1001 self.unfoundRefs = set()
1002 for datasetType, refs in itertools.chain(self.initInputs.items(), self.inputs.items()):
1003 _LOG.debug("Resolving %d datasets for input dataset %s.", len(refs), datasetType.name)
1004 if datasetType.isComponent():
1005 parent_dataset_type = datasetType.makeCompositeDatasetType()
1006 component = datasetType.component()
1007 else:
1008 parent_dataset_type = datasetType
1009 component = None
1010 try:
1011 resolvedRefQueryResults = commonDataIds.subset(
1012 datasetType.dimensions, unique=True
1013 ).findDatasets(parent_dataset_type, collections=collections, findFirst=True)
1014 except MissingDatasetTypeError:
1015 resolvedRefQueryResults = []
1016 dataIdsNotFoundYet = set(refs.keys())
1017 for resolvedRef in resolvedRefQueryResults:
1018 dataIdsNotFoundYet.discard(resolvedRef.dataId)
1019 refs[resolvedRef.dataId] = (
1020 resolvedRef if component is None else resolvedRef.makeComponentRef(component)
1021 )
1022 if dataIdsNotFoundYet:
1023 if constrainedByAllDatasets:
1024 raise RuntimeError(
1025 f"{len(dataIdsNotFoundYet)} dataset(s) of type "
1026 f"'{datasetType.name}' was/were present in a previous "
1027 f"query, but could not be found now."
1028 f"This is either a logic bug in QuantumGraph generation "
1029 f"or the input collections have been modified since "
1030 f"QuantumGraph generation began."
1031 )
1032 elif not datasetType.dimensions:
1033 raise RuntimeError(
1034 f"Dataset {datasetType.name!r} (with no dimensions) could not be found in "
1035 f"collections {collections}."
1036 )
1037 else:
1038 # If the common data IDs were not constrained using all the
1039 # input dataset types, it is possible that some data IDs
1040 # found don't correspond to existing datasets and they
1041 # will be left unresolved. Mark these for later pruning from
1042 # the quantum graph.
1043 for k in dataIdsNotFoundYet:
1044 self.unfoundRefs.add(refs[k])
1046 # Copy the resolved DatasetRefs to the _QuantumScaffolding objects,
1047 # replacing the unresolved refs there, and then look up prerequisites.
1048 for task in self.tasks:
1049 _LOG.debug(
1050 "Applying resolutions and finding prerequisites for %d quanta of task with label '%s'.",
1051 len(task.quanta),
1052 task.taskDef.label,
1053 )
1054 # The way iterConnections is designed makes it impossible to
1055 # annotate precisely enough to satisfy MyPy here.
1056 lookupFunctions = {
1057 c.name: c.lookupFunction # type: ignore
1058 for c in iterConnections(task.taskDef.connections, "prerequisiteInputs")
1059 if c.lookupFunction is not None # type: ignore
1060 }
1061 dataIdsFailed = []
1062 dataIdsSucceeded = []
1063 for quantum in task.quanta.values():
1064 # Process output datasets only if skipExistingIn is not None
1065 # or there is a run to look for outputs in and clobberOutputs
1066 # is True. Note that if skipExistingIn is None, any output
1067 # datasets that already exist would have already caused an
1068 # exception to be raised. We never update the DatasetRefs in
1069 # the quantum because those should never be resolved.
1070 if skip_collections_wildcard is not None or (run is not None and clobberOutputs):
1071 resolvedRefs = []
1072 unresolvedRefs = []
1073 haveMetadata = False
1074 for datasetType, originalRefs in quantum.outputs.items():
1075 for ref in task.outputs.extract(datasetType, originalRefs.keys()):
1076 if ref.id is not None:
1077 resolvedRefs.append(ref)
1078 if datasetType.name == task.taskDef.metadataDatasetName:
1079 haveMetadata = True
1080 else:
1081 unresolvedRefs.append(ref)
1082 if resolvedRefs:
1083 if haveMetadata or not unresolvedRefs:
1084 dataIdsSucceeded.append(quantum.dataId)
1085 if skip_collections_wildcard is not None:
1086 continue
1087 else:
1088 dataIdsFailed.append(quantum.dataId)
1089 if not clobberOutputs:
1090 raise OutputExistsError(
1091 f"Quantum {quantum.dataId} of task with label "
1092 f"'{quantum.task.taskDef.label}' has some outputs that exist "
1093 f"({resolvedRefs}) "
1094 f"and others that don't ({unresolvedRefs}), with no metadata output, "
1095 "and clobbering outputs was not enabled."
1096 )
1097 # Update the input DatasetRefs to the resolved ones we already
1098 # searched for.
1099 for datasetType, input_refs in quantum.inputs.items():
1100 for ref in task.inputs.extract(datasetType, input_refs.keys()):
1101 input_refs[ref.dataId] = ref
1102 # Look up prerequisite datasets in the input collection(s).
1103 # These may have dimensions that extend beyond those we queried
1104 # for originally, because we want to permit those data ID
1105 # values to differ across quanta and dataset types.
1106 for datasetType in task.prerequisites:
1107 if datasetType.isComponent():
1108 parent_dataset_type = datasetType.makeCompositeDatasetType()
1109 component = datasetType.component()
1110 else:
1111 parent_dataset_type = datasetType
1112 component = None
1113 lookupFunction = lookupFunctions.get(datasetType.name)
1114 if lookupFunction is not None:
1115 # PipelineTask has provided its own function to do the
1116 # lookup. This always takes precedence.
1117 prereq_refs = list(lookupFunction(datasetType, registry, quantum.dataId, collections))
1118 elif (
1119 datasetType.isCalibration()
1120 and datasetType.dimensions <= quantum.dataId.graph
1121 and quantum.dataId.graph.temporal
1122 ):
1123 # This is a master calibration lookup, which we have to
1124 # handle specially because the query system can't do a
1125 # temporal join on a non-dimension-based timespan yet.
1126 timespan = quantum.dataId.timespan
1127 try:
1128 prereq_ref = registry.findDataset(
1129 parent_dataset_type,
1130 quantum.dataId,
1131 collections=collections,
1132 timespan=timespan,
1133 )
1134 if prereq_ref is not None:
1135 if component is not None:
1136 prereq_ref = prereq_ref.makeComponentRef(component)
1137 prereq_refs = [prereq_ref]
1138 else:
1139 prereq_refs = []
1140 except (KeyError, MissingDatasetTypeError):
1141 # This dataset type is not present in the registry,
1142 # which just means there are no datasets here.
1143 prereq_refs = []
1144 else:
1145 # Most general case.
1146 prereq_refs = [
1147 prereq_ref if component is None else prereq_ref.makeComponentRef(component)
1148 for prereq_ref in registry.queryDatasets(
1149 parent_dataset_type,
1150 collections=collections,
1151 dataId=quantum.dataId,
1152 findFirst=True,
1153 ).expanded()
1154 ]
1155 quantum.prerequisites[datasetType].update(
1156 {ref.dataId: ref for ref in prereq_refs if ref is not None}
1157 )
1159 # Resolve all quantum inputs and outputs.
1160 if idMaker:
1161 for datasetDict in (quantum.inputs, quantum.outputs):
1162 for refDict in datasetDict.values():
1163 refDict.update(idMaker.resolveDict(refDict))
1165 # Resolve task initInputs and initOutputs.
1166 if idMaker:
1167 for datasetDict in (task.initInputs, task.initOutputs):
1168 for refDict in datasetDict.values():
1169 refDict.update(idMaker.resolveDict(refDict))
1171 # Actually remove any quanta that we decided to skip above.
1172 if dataIdsSucceeded:
1173 if skip_collections_wildcard is not None:
1174 _LOG.debug(
1175 "Pruning successful %d quanta for task with label '%s' because all of their "
1176 "outputs exist or metadata was written successfully.",
1177 len(dataIdsSucceeded),
1178 task.taskDef.label,
1179 )
1180 for dataId in dataIdsSucceeded:
1181 del task.quanta[dataId]
1182 elif clobberOutputs:
1183 _LOG.info(
1184 "Found %d successful quanta for task with label '%s' "
1185 "that will need to be clobbered during execution.",
1186 len(dataIdsSucceeded),
1187 task.taskDef.label,
1188 )
1189 else:
1190 raise AssertionError("OutputExistsError should have already been raised.")
1191 if dataIdsFailed:
1192 if clobberOutputs:
1193 _LOG.info(
1194 "Found %d failed/incomplete quanta for task with label '%s' "
1195 "that will need to be clobbered during execution.",
1196 len(dataIdsFailed),
1197 task.taskDef.label,
1198 )
1199 else:
1200 raise AssertionError("OutputExistsError should have already been raised.")
1202 def makeQuantumGraph(
1203 self, metadata: Optional[Mapping[str, Any]] = None, datastore: Optional[Datastore] = None
1204 ) -> QuantumGraph:
1205 """Create a `QuantumGraph` from the quanta already present in
1206 the scaffolding data structure.
1208 Parameters
1209 ----------
1210 metadata : Optional Mapping of `str` to primitives
1211 This is an optional parameter of extra data to carry with the
1212 graph. Entries in this mapping should be able to be serialized in
1213 JSON.
1214 datastore : `Datastore`, optional
1215 If not `None` then fill datastore records in each generated
1216 Quantum.
1218 Returns
1219 -------
1220 graph : `QuantumGraph`
1221 The full `QuantumGraph`.
1222 """
1224 def _make_refs(dataset_dict: _DatasetDict) -> Iterable[DatasetRef]:
1225 """Extract all DatasetRefs from the dictionaries"""
1226 for ref_dict in dataset_dict.values():
1227 yield from ref_dict.values()
1229 datastore_records: Optional[Mapping[str, DatastoreRecordData]] = None
1230 if datastore is not None:
1231 datastore_records = datastore.export_records(
1232 itertools.chain(
1233 _make_refs(self.inputs), _make_refs(self.initInputs), _make_refs(self.prerequisites)
1234 )
1235 )
1237 graphInput: Dict[TaskDef, Set[Quantum]] = {}
1238 for task in self.tasks:
1239 qset = task.makeQuantumSet(unresolvedRefs=self.unfoundRefs, datastore_records=datastore_records)
1240 graphInput[task.taskDef] = qset
1242 taskInitInputs = {task.taskDef: task.initInputs.unpackSingleRefs().values() for task in self.tasks}
1243 taskInitOutputs = {task.taskDef: task.initOutputs.unpackSingleRefs().values() for task in self.tasks}
1245 graph = QuantumGraph(
1246 graphInput,
1247 metadata=metadata,
1248 pruneRefs=self.unfoundRefs,
1249 universe=self.dimensions.universe,
1250 initInputs=taskInitInputs,
1251 initOutputs=taskInitOutputs,
1252 )
1253 return graph
1256# ------------------------
1257# Exported definitions --
1258# ------------------------
1261class GraphBuilderError(Exception):
1262 """Base class for exceptions generated by graph builder."""
1264 pass
1267class OutputExistsError(GraphBuilderError):
1268 """Exception generated when output datasets already exist."""
1270 pass
1273class PrerequisiteMissingError(GraphBuilderError):
1274 """Exception generated when a prerequisite dataset does not exist."""
1276 pass
1279class GraphBuilder:
1280 """GraphBuilder class is responsible for building task execution graph from
1281 a Pipeline.
1283 Parameters
1284 ----------
1285 registry : `~lsst.daf.butler.Registry`
1286 Data butler instance.
1287 skipExistingIn
1288 Expressions representing the collections to search for existing
1289 output datasets that should be skipped. See
1290 :ref:`daf_butler_ordered_collection_searches`.
1291 clobberOutputs : `bool`, optional
1292 If `True` (default), allow quanta to be created even if partial
1293 outputs exist; this requires the same behavior to be enabled when
1294 executing.
1295 datastore : `Datastore`, optional
1296 If not `None` then fill datastore records in each generated Quantum.
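
    Examples
    --------
    A minimal end-to-end sketch (the repository path, pipeline file,
    collection names, and query below are placeholders, not values defined
    by this package)::

        from lsst.daf.butler import Butler
        from lsst.pipe.base import GraphBuilder, Pipeline

        butler = Butler("/path/to/repo")
        pipeline = Pipeline.fromFile("pipeline.yaml")
        builder = GraphBuilder(butler.registry)
        qgraph = builder.makeGraph(
            pipeline,
            collections=["my/input/collection"],
            run="u/someone/test-run",
            userQuery="instrument = 'HSC' AND visit = 12345",
        )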
1297 """
1299 def __init__(
1300 self,
1301 registry: Registry,
1302 skipExistingIn: Any = None,
1303 clobberOutputs: bool = True,
1304 datastore: Optional[Datastore] = None,
1305 ):
1306 self.registry = registry
1307 self.dimensions = registry.dimensions
1308 self.skipExistingIn = skipExistingIn
1309 self.clobberOutputs = clobberOutputs
1310 self.datastore = datastore
1312 def makeGraph(
1313 self,
1314 pipeline: Union[Pipeline, Iterable[TaskDef]],
1315 collections: Any,
1316 run: Optional[str],
1317 userQuery: Optional[str],
1318 datasetQueryConstraint: DatasetQueryConstraintVariant = DatasetQueryConstraintVariant.ALL,
1319 metadata: Optional[Mapping[str, Any]] = None,
1320 resolveRefs: bool = False,
1321 bind: Optional[Mapping[str, Any]] = None,
1322 ) -> QuantumGraph:
1323 """Create execution graph for a pipeline.
1325 Parameters
1326 ----------
1327 pipeline : `Pipeline` or `Iterable` [ `TaskDef` ]
1328 Pipeline definition, task names/classes and their configs.
1329 collections
1330 Expressions representing the collections to search for input
1331 datasets. See :ref:`daf_butler_ordered_collection_searches`.
1332 run : `str`, optional
1333 Name of the `~lsst.daf.butler.CollectionType.RUN` collection for
1334 output datasets, if it already exists.
1335 userQuery : `str` or `None`
1336 String that defines a user-provided selection for the registry;
1337 should be empty or `None` if there are no restrictions on data selection.
1338 datasetQueryConstraint : `DatasetQueryConstraintVariant`, optional
1339 The query constraint variant that should be used to constrain the
1340 query based on dataset existence, defaults to
1341 `DatasetQueryConstraintVariant.ALL`.
1342 metadata : Optional Mapping of `str` to primitives
1343 This is an optional parameter of extra data to carry with the
1344 graph. Entries in this mapping should be able to be serialized in
1345 JSON.
1346 resolveRefs : `bool`, optional
1347 If `True` then resolve all input references and generate random
1348 dataset IDs for all output and intermediate datasets. A `True` value
1349 requires the ``run`` collection to be specified.
1350 bind : `Mapping`, optional
1351 Mapping containing literal values that should be injected into the
1352 ``userQuery`` expression, keyed by the identifiers they replace.
1354 Returns
1355 -------
1356 graph : `QuantumGraph`
1357 The constructed `QuantumGraph`.
1358 Raises
1359 ------
1360 UserExpressionError
1361 Raised when user expression cannot be parsed.
1362 OutputExistsError
1363 Raised when output datasets already exist.
1364 Exception
1365 Other exception types may be raised by underlying registry
1366 classes.
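
        Examples
        --------
        The ``bind`` mapping supplies values for identifiers used in
        ``userQuery`` (a sketch; the collection and run names are
        placeholders)::

            qgraph = builder.makeGraph(
                pipeline,
                collections=["my/input/collection"],
                run="u/someone/test-run",
                userQuery="instrument = 'HSC' AND visit = my_visit",
                bind={"my_visit": 12345},
            )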
1367 """
1368 if resolveRefs and run is None:
1369 raise ValueError("`resolveRefs` requires `run` parameter.")
1370 scaffolding = _PipelineScaffolding(pipeline, registry=self.registry)
1371 if not collections and (scaffolding.initInputs or scaffolding.inputs or scaffolding.prerequisites):
1372 raise ValueError("Pipeline requires input datasets but no input collections provided.")
1373 instrument_class: Optional[Any] = None
1374 if isinstance(pipeline, Pipeline):
1375 instrument_class_name = pipeline.getInstrument()
1376 if instrument_class_name is not None:
1377 instrument_class = doImportType(instrument_class_name)
1378 pipeline = list(pipeline.toExpandedPipeline())
1379 if instrument_class is not None:
1380 dataId = DataCoordinate.standardize(
1381 instrument=instrument_class.getName(), universe=self.registry.dimensions
1382 )
1383 else:
1384 dataId = DataCoordinate.makeEmpty(self.registry.dimensions)
1385 with scaffolding.connectDataIds(
1386 self.registry, collections, userQuery, dataId, datasetQueryConstraint, bind
1387 ) as commonDataIds:
1388 condition = datasetQueryConstraint == DatasetQueryConstraintVariant.ALL
1389 scaffolding.resolveDatasetRefs(
1390 self.registry,
1391 collections,
1392 run,
1393 commonDataIds,
1394 skipExistingIn=self.skipExistingIn,
1395 clobberOutputs=self.clobberOutputs,
1396 constrainedByAllDatasets=condition,
1397 resolveRefs=resolveRefs,
1398 )
1399 return scaffolding.makeQuantumGraph(metadata=metadata, datastore=self.datastore)