Coverage for python/lsst/pipe/base/graphBuilder.py: 17%
425 statements
1# This file is part of pipe_base.
2#
3# Developed for the LSST Data Management System.
4# This product includes software developed by the LSST Project
5# (http://www.lsst.org).
6# See the COPYRIGHT file at the top-level directory of this distribution
7# for details of code ownership.
8#
9# This program is free software: you can redistribute it and/or modify
10# it under the terms of the GNU General Public License as published by
11# the Free Software Foundation, either version 3 of the License, or
12# (at your option) any later version.
13#
14# This program is distributed in the hope that it will be useful,
15# but WITHOUT ANY WARRANTY; without even the implied warranty of
16# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
17# GNU General Public License for more details.
18#
19# You should have received a copy of the GNU General Public License
20# along with this program. If not, see <http://www.gnu.org/licenses/>.
21from __future__ import annotations
23"""Module defining GraphBuilder class and related methods.
24"""
26__all__ = ["GraphBuilder"]
28# -------------------------------
29# Imports of standard modules --
30# -------------------------------
31import itertools
32import logging
33from collections import ChainMap
34from contextlib import contextmanager
35from dataclasses import dataclass
36from typing import Any, Collection, Dict, Iterable, Iterator, List, Mapping, Optional, Set, Tuple, Union
38from lsst.daf.butler import (
39 CollectionSearch,
40 CollectionType,
41 DataCoordinate,
42 DatasetIdGenEnum,
43 DatasetRef,
44 DatasetType,
45 Datastore,
46 DatastoreRecordData,
47 DimensionGraph,
48 DimensionUniverse,
49 NamedKeyDict,
50 Quantum,
51 Registry,
52)
53from lsst.daf.butler.registry.queries import DataCoordinateQueryResults
54from lsst.utils import doImportType
56from ._datasetQueryConstraints import DatasetQueryConstraintVariant
57from ._status import NoWorkFound
59# -----------------------------
60# Imports for other modules --
61# -----------------------------
62from .connections import AdjustQuantumHelper, iterConnections
63from .graph import QuantumGraph
64from .pipeline import Pipeline, PipelineDatasetTypes, TaskDatasetTypes, TaskDef
66# ----------------------------------
67# Local non-exported definitions --
68# ----------------------------------
70_LOG = logging.getLogger(__name__)
73class _DatasetDict(NamedKeyDict[DatasetType, Dict[DataCoordinate, DatasetRef]]):
74 """A custom dictionary that maps `DatasetType` to a nested dictionary of
75 the known `DatasetRef` instances of that type.
77 Parameters
78 ----------
79 args
80 Positional arguments are forwarded to the `dict` constructor.
81 universe : `DimensionUniverse`
82 Universe of all possible dimensions.
83 """
85 def __init__(self, *args: Any, universe: DimensionUniverse):
86 super().__init__(*args)
87 self.universe = universe
89 @classmethod
90 def fromDatasetTypes(
91 cls, datasetTypes: Iterable[DatasetType], *, universe: DimensionUniverse
92 ) -> _DatasetDict:
93 """Construct a dictionary from a flat iterable of `DatasetType` keys.
95 Parameters
96 ----------
97 datasetTypes : `iterable` of `DatasetType`
98 DatasetTypes to use as keys for the dict. Values will be empty
99 dictionaries.
100 universe : `DimensionUniverse`
101 Universe of all possible dimensions.
103 Returns
104 -------
105 dictionary : `_DatasetDict`
106 A new `_DatasetDict` instance.
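
Examples
--------
A minimal sketch, assuming ``registry`` is an existing
`~lsst.daf.butler.Registry` (names are illustrative only)::

    dataset_types = list(registry.queryDatasetTypes("raw"))
    refs = _DatasetDict.fromDatasetTypes(dataset_types, universe=registry.dimensions)
    # Each key now maps to an empty {data ID: DatasetRef} dictionary.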
107 """
108 return cls({datasetType: {} for datasetType in datasetTypes}, universe=universe)
110 @classmethod
111 def fromSubset(
112 cls, datasetTypes: Collection[DatasetType], first: _DatasetDict, *rest: _DatasetDict
113 ) -> _DatasetDict:
114 """Return a new dictionary by extracting items corresponding to the
115 given keys from one or more existing dictionaries.
117 Parameters
118 ----------
119 datasetTypes : `iterable` of `DatasetType`
120 DatasetTypes to use as keys for the dict. Values will be obtained
121 by lookups against ``first`` and ``rest``.
122 first : `_DatasetDict`
123 Another dictionary from which to extract values.
124 rest
125 Additional dictionaries from which to extract values.
127 Returns
128 -------
129 dictionary : `_DatasetDict`
130 A new dictionary instance.
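
Examples
--------
Sketch, assuming ``inputs`` and ``intermediates`` are populated
`_DatasetDict` instances and ``task_inputs`` is a collection of
`DatasetType` keys drawn from them (illustrative only)::

    task_dict = _DatasetDict.fromSubset(task_inputs, inputs, intermediates)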
131 """
132 combined = ChainMap(first, *rest)
134 # Dataset types known to match immediately can be processed
135 # without checks.
136 matches = combined.keys() & set(datasetTypes)
137 _dict = {k: combined[k] for k in matches}
139 if len(_dict) < len(datasetTypes):
140 # Work out which ones are missing.
141 missing_datasetTypes = set(datasetTypes) - _dict.keys()
143 # Get the known names for comparison.
144 combined_by_name = {k.name: k for k in combined}
146 missing = set()
147 incompatible = {}
148 for datasetType in missing_datasetTypes:
149 # The dataset type is not found. It may not be listed
150 # or it may be that it is there with the same name
151 # but different definition.
152 if datasetType.name in combined_by_name:
153 # This implies some inconsistency in definitions
154 # for connections. If there is support for storage
155 # class conversion we can let it slide.
156 # At this point we do not know
157 # where the inconsistency is but trust that
158 # downstream code will be more explicit about input
159 # vs output incompatibilities.
160 existing = combined_by_name[datasetType.name]
161 if existing.is_compatible_with(datasetType) or datasetType.is_compatible_with(existing):
162 _LOG.warning(
163 "Dataset type mismatch (%s != %s) but continuing since they are compatible",
164 datasetType,
165 existing,
166 )
167 _dict[datasetType] = combined[existing]
168 else:
169 incompatible[datasetType] = existing
170 else:
171 missing.add(datasetType)
173 if missing or incompatible:
174 reasons = []
175 if missing:
176 reasons.append(
177 "DatasetTypes {'.'.join(missing)} not present in list of known types: "
178 + ", ".join(d.name for d in combined)
179 )
180 if incompatible:
181 for x, y in incompatible.items():
182 reasons.append(f"{x} incompatible with {y}")
183 raise KeyError("Errors matching dataset types: " + " & ".join(reasons))
185 return cls(_dict, universe=first.universe)
187 @property
188 def dimensions(self) -> DimensionGraph:
189 """The union of all dimensions used by all dataset types in this
190 dictionary, including implied dependencies (`DimensionGraph`).
191 """
192 base = self.universe.empty
193 if len(self) == 0:
194 return base
195 return base.union(*[datasetType.dimensions for datasetType in self.keys()])
197 def unpackSingleRefs(self) -> NamedKeyDict[DatasetType, DatasetRef]:
198 """Unpack nested single-element `DatasetRef` dicts into a new
199 mapping with `DatasetType` keys and `DatasetRef` values.
201 This method assumes that each nested dictionary contains exactly one
202 item, as is the case for all "init" datasets.
204 Returns
205 -------
206 dictionary : `NamedKeyDict`
207 Dictionary mapping `DatasetType` to `DatasetRef`, with both
208 `DatasetType` instances and string names usable as keys.
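
Examples
--------
Sketch, with ``init_inputs`` a `_DatasetDict` of init-input datasets,
each holding exactly one ref; the dataset type name is illustrative::

    refs_by_type = init_inputs.unpackSingleRefs()
    ref = refs_by_type["someInitDatasetTypeName"]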
209 """
211 def getOne(refs: Dict[DataCoordinate, DatasetRef]) -> DatasetRef:
212 (ref,) = refs.values()
213 return ref
215 return NamedKeyDict({datasetType: getOne(refs) for datasetType, refs in self.items()})
217 def unpackMultiRefs(self) -> NamedKeyDict[DatasetType, List[DatasetRef]]:
218 """Unpack nested multi-element `DatasetRef` dicts into a new
219 mapping with `DatasetType` keys and `list` of `DatasetRef` values.
221 Returns
222 -------
223 dictionary : `NamedKeyDict`
224 Dictionary mapping `DatasetType` to `list` of `DatasetRef`, with
225 both `DatasetType` instances and string names usable as keys.
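
Examples
--------
Sketch, with ``dd`` a populated `_DatasetDict` (illustrative only)::

    for dataset_type, refs in dd.unpackMultiRefs().items():
        print(dataset_type.name, len(refs))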
226 """
227 return NamedKeyDict({datasetType: list(refs.values()) for datasetType, refs in self.items()})
229 def extract(self, datasetType: DatasetType, dataIds: Iterable[DataCoordinate]) -> Iterator[DatasetRef]:
230 """Iterate over the contained `DatasetRef` instances that match the
231 given `DatasetType` and data IDs.
233 Parameters
234 ----------
235 datasetType : `DatasetType`
236 Dataset type to match.
237 dataIds : `Iterable` [ `DataCoordinate` ]
238 Data IDs to match.
240 Returns
241 -------
242 refs : `Iterator` [ `DatasetRef` ]
243 DatasetRef instances for which ``ref.datasetType == datasetType``
244 and ``ref.dataId`` is in ``dataIds``.
245 """
246 refs = self[datasetType]
247 return (refs[dataId] for dataId in dataIds)
250class _QuantumScaffolding:
251 """Helper class aggregating information about a `Quantum`, used when
252 constructing a `QuantumGraph`.
254 See `_PipelineScaffolding` for a top-down description of the full
255 scaffolding data structure.
257 Parameters
258 ----------
259 task : _TaskScaffolding
260 Back-reference to the helper object for the `PipelineTask` this quantum
261 represents an execution of.
262 dataId : `DataCoordinate`
263 Data ID for this quantum.
264 """
266 def __init__(self, task: _TaskScaffolding, dataId: DataCoordinate):
267 self.task = task
268 self.dataId = dataId
269 self.inputs = _DatasetDict.fromDatasetTypes(task.inputs.keys(), universe=dataId.universe)
270 self.outputs = _DatasetDict.fromDatasetTypes(task.outputs.keys(), universe=dataId.universe)
271 self.prerequisites = _DatasetDict.fromDatasetTypes(
272 task.prerequisites.keys(), universe=dataId.universe
273 )
275 __slots__ = ("task", "dataId", "inputs", "outputs", "prerequisites")
277 def __repr__(self) -> str:
278 return f"_QuantumScaffolding(taskDef={self.task.taskDef}, dataId={self.dataId}, ...)"
280 task: _TaskScaffolding
281 """Back-reference to the helper object for the `PipelineTask` this quantum
282 represents an execution of.
283 """
285 dataId: DataCoordinate
286 """Data ID for this quantum.
287 """
289 inputs: _DatasetDict
290 """Nested dictionary containing `DatasetRef` inputs to this quantum.
292 This is initialized to map each `DatasetType` to an empty dictionary at
293 construction. Those nested dictionaries are populated (with data IDs as
294 keys) with unresolved `DatasetRef` instances in
295 `_PipelineScaffolding.connectDataIds`.
296 """
298 outputs: _DatasetDict
299 """Nested dictionary containing `DatasetRef` outputs this quantum.
300 """
302 prerequisites: _DatasetDict
303 """Nested dictionary containing `DatasetRef` prerequisite inputs to this
304 quantum.
305 """
307 def makeQuantum(self, datastore_records: Optional[Mapping[str, DatastoreRecordData]] = None) -> Quantum:
308 """Transform the scaffolding object into a true `Quantum` instance.
310 Parameters
311 ----------
312 datastore_records : `dict` [ `str`, `DatastoreRecordData` ], optional
313 If not `None` then fill datastore records in each generated Quantum
314 using the records from this structure.
316 Returns
317 -------
318 quantum : `Quantum`
319 An actual `Quantum` instance.
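
Examples
--------
Sketch, with ``qscaffolding`` a fully-populated `_QuantumScaffolding`
(illustrative only)::

    quantum = qscaffolding.makeQuantum()
    n_inputs = sum(len(refs) for refs in quantum.inputs.values())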
320 """
321 allInputs = self.inputs.unpackMultiRefs()
322 allInputs.update(self.prerequisites.unpackMultiRefs())
323 # Give the task's Connections class an opportunity to remove some
324 # inputs, or complain if they are unacceptable.
325 # This will raise if one of the check conditions is not met, which is
326 # the intended behavior.
327 # If it raises NoWorkFound, there is a bug in the QG algorithm
328 # or the adjustQuantum is incorrectly trying to make a prerequisite
329 # input behave like a regular input; adjustQuantum should only raise
330 # NoWorkFound if a regular input is missing, and it shouldn't be
331 # possible for us to have generated ``self`` if that's true.
332 helper = AdjustQuantumHelper(inputs=allInputs, outputs=self.outputs.unpackMultiRefs())
333 helper.adjust_in_place(self.task.taskDef.connections, self.task.taskDef.label, self.dataId)
334 initInputs = self.task.initInputs.unpackSingleRefs()
335 quantum_records: Optional[Mapping[str, DatastoreRecordData]] = None
336 if datastore_records is not None:
337 quantum_records = {}
338 input_refs = list(itertools.chain.from_iterable(helper.inputs.values()))
339 input_refs += list(initInputs.values())
340 input_ids = set(ref.id for ref in input_refs if ref.id is not None)
341 for datastore_name, records in datastore_records.items():
342 matching_records = records.subset(input_ids)
343 if matching_records is not None:
344 quantum_records[datastore_name] = matching_records
345 return Quantum(
346 taskName=self.task.taskDef.taskName,
347 taskClass=self.task.taskDef.taskClass,
348 dataId=self.dataId,
349 initInputs=initInputs,
350 inputs=helper.inputs,
351 outputs=helper.outputs,
352 datastore_records=quantum_records,
353 )
356@dataclass
357class _TaskScaffolding:
358 """Helper class aggregating information about a `PipelineTask`, used when
359 constructing a `QuantumGraph`.
361 See `_PipelineScaffolding` for a top-down description of the full
362 scaffolding data structure.
364 Parameters
365 ----------
366 taskDef : `TaskDef`
367 Data structure that identifies the task class and its config.
368 parent : `_PipelineScaffolding`
369 The parent data structure that will hold the instance being
370 constructed.
371 datasetTypes : `TaskDatasetTypes`
372 Data structure that categorizes the dataset types used by this task.
373 """
375 def __init__(self, taskDef: TaskDef, parent: _PipelineScaffolding, datasetTypes: TaskDatasetTypes):
376 universe = parent.dimensions.universe
377 self.taskDef = taskDef
378 self.dimensions = DimensionGraph(universe, names=taskDef.connections.dimensions)
379 assert self.dimensions.issubset(parent.dimensions)
380 # Initialize _DatasetDicts as subsets of the one or two
381 # corresponding dicts in the parent _PipelineScaffolding.
382 self.initInputs = _DatasetDict.fromSubset(
383 datasetTypes.initInputs, parent.initInputs, parent.initIntermediates
384 )
385 self.initOutputs = _DatasetDict.fromSubset(
386 datasetTypes.initOutputs, parent.initIntermediates, parent.initOutputs
387 )
388 self.inputs = _DatasetDict.fromSubset(datasetTypes.inputs, parent.inputs, parent.intermediates)
389 self.outputs = _DatasetDict.fromSubset(datasetTypes.outputs, parent.intermediates, parent.outputs)
390 self.prerequisites = _DatasetDict.fromSubset(datasetTypes.prerequisites, parent.prerequisites)
391 self.dataIds: Set[DataCoordinate] = set()
392 self.quanta = {}
394 def __repr__(self) -> str:
395 # Default dataclass-injected __repr__ gets caught in an infinite loop
396 # because of back-references.
397 return f"_TaskScaffolding(taskDef={self.taskDef}, ...)"
399 taskDef: TaskDef
400 """Data structure that identifies the task class and its config
401 (`TaskDef`).
402 """
404 dimensions: DimensionGraph
405 """The dimensions of a single `Quantum` of this task (`DimensionGraph`).
406 """
408 initInputs: _DatasetDict
409 """Dictionary containing information about datasets used to construct this
410 task (`_DatasetDict`).
411 """
413 initOutputs: _DatasetDict
414 """Dictionary containing information about datasets produced as a
415 side-effect of constructing this task (`_DatasetDict`).
416 """
418 inputs: _DatasetDict
419 """Dictionary containing information about datasets used as regular,
420 graph-constraining inputs to this task (`_DatasetDict`).
421 """
423 outputs: _DatasetDict
424 """Dictionary containing information about datasets produced by this task
425 (`_DatasetDict`).
426 """
428 prerequisites: _DatasetDict
429 """Dictionary containing information about input datasets that must be
430 present in the repository before any Pipeline containing this task is run
431 (`_DatasetDict`).
432 """
434 quanta: Dict[DataCoordinate, _QuantumScaffolding]
435 """Dictionary mapping data ID to a scaffolding object for the Quantum of
436 this task with that data ID.
437 """
439 def makeQuantumSet(
440 self,
441 unresolvedRefs: Optional[Set[DatasetRef]] = None,
442 datastore_records: Optional[Mapping[str, DatastoreRecordData]] = None,
443 ) -> Set[Quantum]:
444 """Create a `set` of `Quantum` from the information in ``self``.
446 Parameters
447 ----------
448 unresolvedRefs : `set` [ `DatasetRef` ], optional
449 Input dataset refs that have not been found.
450 datastore_records : `dict` [ `str`, `DatastoreRecordData` ], optional
451 If not `None`, use the records in this structure to fill datastore
452 records in each generated `Quantum`.
453 Returns
454 -------
455 nodes : `set` of `Quantum`
456 The `Quantum` elements corresponding to this task.
457 """
458 if unresolvedRefs is None:
459 unresolvedRefs = set()
460 outputs = set()
461 for q in self.quanta.values():
462 try:
463 tmpQuanta = q.makeQuantum(datastore_records)
464 outputs.add(tmpQuanta)
465 except (NoWorkFound, FileNotFoundError) as exc:
466 refs = itertools.chain.from_iterable(self.inputs.unpackMultiRefs().values())
467 if unresolvedRefs.intersection(refs):
468 # This means it is a node that is known to be pruned
469 # later and should be left in even though some follow-up
470 # queries fail. This allows the pruning to start from this
471 # quantum with known issues, and prune other nodes it
472 # touches.
473 inputs = q.inputs.unpackMultiRefs()
474 inputs.update(q.prerequisites.unpackMultiRefs())
475 tmpQuantum = Quantum(
476 taskName=q.task.taskDef.taskName,
477 taskClass=q.task.taskDef.taskClass,
478 dataId=q.dataId,
479 initInputs=q.task.initInputs.unpackSingleRefs(),
480 inputs=inputs,
481 outputs=q.outputs.unpackMultiRefs(),
482 )
483 outputs.add(tmpQuantum)
484 else:
485 raise exc
486 return outputs
489class _DatasetIdMaker:
490 """Helper class which generates random dataset UUIDs for unresolved
491 datasets.
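
Examples
--------
Sketch, assuming ``registry`` exists and ``unresolved_ref`` is a
`DatasetRef` without an ID (both names illustrative)::

    id_maker = _DatasetIdMaker(registry, run="u/example/run")
    resolved = id_maker.resolveRef(unresolved_ref)
    assert resolved.id is not None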
492 """
494 def __init__(self, registry: Registry, run: str):
495 self.datasetIdFactory = registry.datasetIdFactory
496 self.run = run
497 # Dataset IDs generated so far
498 self.resolved: Dict[Tuple[DatasetType, DataCoordinate], DatasetRef] = {}
500 def resolveRef(self, ref: DatasetRef) -> DatasetRef:
501 if ref.id is not None:
502 return ref
503 key = ref.datasetType, ref.dataId
504 if (resolved := self.resolved.get(key)) is None:
505 datasetId = self.datasetIdFactory.makeDatasetId(
506 self.run, ref.datasetType, ref.dataId, DatasetIdGenEnum.UNIQUE
507 )
508 resolved = ref.resolved(datasetId, self.run)
509 self.resolved[key] = resolved
510 return resolved
512 def resolveDict(self, refs: Dict[DataCoordinate, DatasetRef]) -> Dict[DataCoordinate, DatasetRef]:
513 """Resolve all unresolved references in the provided dictionary."""
514 return {dataId: self.resolveRef(ref) for dataId, ref in refs.items()}
517@dataclass
518class _PipelineScaffolding:
519 """A helper data structure that organizes the information involved in
520 constructing a `QuantumGraph` for a `Pipeline`.
522 Parameters
523 ----------
524 pipeline : `Pipeline` or `Iterable` [ `TaskDef` ]
525 Sequence of tasks from which a graph is to be constructed. Must
526 have nested task classes already imported.
527 universe : `DimensionUniverse`
528 Universe of all possible dimensions.
530 Notes
531 -----
532 The scaffolding data structure contains nested data structures for both
533 tasks (`_TaskScaffolding`) and datasets (`_DatasetDict`). The dataset
534 data structures are shared between the pipeline-level structure (which
535 aggregates all datasets and categorizes them from the perspective of the
536 complete pipeline) and the individual tasks that use them as inputs and
537 outputs.
539 `QuantumGraph` construction proceeds in four steps, with each corresponding
540 to a different `_PipelineScaffolding` method:
542 1. When `_PipelineScaffolding` is constructed, we extract and categorize
543 the DatasetTypes used by the pipeline (delegating to
544 `PipelineDatasetTypes.fromPipeline`), then use these to construct the
545 nested `_TaskScaffolding` and `_DatasetDict` objects.
547 2. In `connectDataIds`, we construct and run the "Big Join Query", which
548 returns related tuples of all dimensions used to identify any regular
549 input, output, and intermediate datasets (not prerequisites). We then
550 iterate over these tuples of related dimensions, identifying the subsets
551 that correspond to distinct data IDs for each task and dataset type,
552 and then create `_QuantumScaffolding` objects.
554 3. In `resolveDatasetRefs`, we run follow-up queries against all of the
555 dataset data IDs previously identified, transforming unresolved
556 DatasetRefs into resolved DatasetRefs where appropriate. We then look
557 up prerequisite datasets for all quanta.
559 4. In `makeQuantumGraph`, we construct a `QuantumGraph` from the lists of
560 per-task `_QuantumScaffolding` objects.
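
Examples
--------
A rough sketch of the four steps above, assuming ``pipeline``,
``registry``, ``collections``, ``run``, and ``user_query`` already
exist; this is essentially what `GraphBuilder.makeGraph` does below::

    scaffolding = _PipelineScaffolding(pipeline, registry=registry)
    empty_data_id = DataCoordinate.makeEmpty(registry.dimensions)
    with scaffolding.connectDataIds(
        registry, collections, user_query, empty_data_id
    ) as commonDataIds:
        scaffolding.resolveDatasetRefs(registry, collections, run, commonDataIds)
    qgraph = scaffolding.makeQuantumGraph()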
561 """
563 def __init__(self, pipeline: Union[Pipeline, Iterable[TaskDef]], *, registry: Registry):
564 _LOG.debug("Initializing data structures for QuantumGraph generation.")
565 self.tasks = []
566 # Aggregate and categorize the DatasetTypes in the Pipeline.
567 datasetTypes = PipelineDatasetTypes.fromPipeline(pipeline, registry=registry)
568 # Construct dictionaries that map those DatasetTypes to structures
569 # that will (later) hold additional information about them.
570 for attr in (
571 "initInputs",
572 "initIntermediates",
573 "initOutputs",
574 "inputs",
575 "intermediates",
576 "outputs",
577 "prerequisites",
578 ):
579 setattr(
580 self,
581 attr,
582 _DatasetDict.fromDatasetTypes(getattr(datasetTypes, attr), universe=registry.dimensions),
583 )
584 # Aggregate all dimensions for all non-init, non-prerequisite
585 # DatasetTypes. These are the ones we'll include in the big join
586 # query.
587 self.dimensions = self.inputs.dimensions.union(self.intermediates.dimensions, self.outputs.dimensions)
588 # Construct scaffolding nodes for each Task, and add backreferences
589 # to the Task from each DatasetScaffolding node.
590 # Note that there's only one scaffolding node for each DatasetType,
591 # shared by _PipelineScaffolding and all _TaskScaffoldings that
592 # reference it.
593 if isinstance(pipeline, Pipeline):
594 pipeline = pipeline.toExpandedPipeline()
595 self.tasks = [
596 _TaskScaffolding(taskDef=taskDef, parent=self, datasetTypes=taskDatasetTypes)
597 for taskDef, taskDatasetTypes in zip(pipeline, datasetTypes.byTask.values())
598 ]
600 def __repr__(self) -> str:
601 # Default dataclass-injected __repr__ gets caught in an infinite loop
602 # because of back-references.
603 return f"_PipelineScaffolding(tasks={self.tasks}, ...)"
605 tasks: List[_TaskScaffolding]
606 """Scaffolding data structures for each task in the pipeline
607 (`list` of `_TaskScaffolding`).
608 """
610 initInputs: _DatasetDict
611 """Datasets consumed but not produced when constructing the tasks in this
612 pipeline (`_DatasetDict`).
613 """
615 initIntermediates: _DatasetDict
616 """Datasets that are both consumed and produced when constructing the tasks
617 in this pipeline (`_DatasetDict`).
618 """
620 initOutputs: _DatasetDict
621 """Datasets produced but not consumed when constructing the tasks in this
622 pipeline (`_DatasetDict`).
623 """
625 inputs: _DatasetDict
626 """Datasets that are consumed but not produced when running this pipeline
627 (`_DatasetDict`).
628 """
630 intermediates: _DatasetDict
631 """Datasets that are both produced and consumed when running this pipeline
632 (`_DatasetDict`).
633 """
635 outputs: _DatasetDict
636 """Datasets produced but not consumed when when running this pipeline
637 (`_DatasetDict`).
638 """
640 prerequisites: _DatasetDict
641 """Datasets that are consumed when running this pipeline and looked up
642 per-Quantum when generating the graph (`_DatasetDict`).
643 """
645 dimensions: DimensionGraph
646 """All dimensions used by any regular input, intermediate, or output
647 (not prerequisite) dataset; the set of dimensions used in the "Big Join
648 Query" (`DimensionGraph`).
650 This is required to be a superset of all task quantum dimensions.
651 """
653 @contextmanager
654 def connectDataIds(
655 self,
656 registry: Registry,
657 collections: Any,
658 userQuery: Optional[str],
659 externalDataId: DataCoordinate,
660 datasetQueryConstraint: DatasetQueryConstraintVariant = DatasetQueryConstraintVariant.ALL,
661 ) -> Iterator[DataCoordinateQueryResults]:
662 """Query for the data IDs that connect nodes in the `QuantumGraph`.
664 This method populates `_TaskScaffolding.dataIds` and
665 `_DatasetScaffolding.dataIds` (except for those in `prerequisites`).
667 Parameters
668 ----------
669 registry : `lsst.daf.butler.Registry`
670 Registry for the data repository; used for all data ID queries.
671 collections
672 Expressions representing the collections to search for input
673 datasets. May be any of the types accepted by
674 `lsst.daf.butler.CollectionSearch.fromExpression`.
675 userQuery : `str` or `None`
676 User-provided expression to limit the data IDs processed.
677 externalDataId : `DataCoordinate`
678 Externally-provided data ID that should be used to restrict the
679 results, just as if these constraints had been included via ``AND``
680 in ``userQuery``. This includes (at least) any instrument named
681 in the pipeline definition.
682 datasetQueryConstraint : `DatasetQueryConstraintVariant`, optional
683 The query constraint variant that should be used to constrain the
684 query based on dataset existence; defaults to
685 `DatasetQueryConstraintVariant.ALL`.
687 Returns
688 -------
689 commonDataIds : \
690 `lsst.daf.butler.registry.queries.DataCoordinateQueryResults`
691 An interface to a database temporary table containing all data IDs
692 that will appear in this `QuantumGraph`. Returned inside a
693 context manager, which will drop the temporary table at the end of
694 the `with` block in which this method is called.
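
Examples
--------
Sketch of the intended calling pattern (``scaffolding``, ``registry``,
``collections``, and ``data_id`` are assumed to exist)::

    with scaffolding.connectDataIds(registry, collections, None, data_id) as data_ids:
        n_rows = sum(1 for _ in data_ids)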
695 """
696 _LOG.debug("Building query for data IDs.")
697 # Initialization datasets always have empty data IDs.
698 emptyDataId = DataCoordinate.makeEmpty(registry.dimensions)
699 for datasetType, refs in itertools.chain(
700 self.initInputs.items(), self.initIntermediates.items(), self.initOutputs.items()
701 ):
702 refs[emptyDataId] = DatasetRef(datasetType, emptyDataId)
703 # Run one big query for the data IDs for task dimensions and regular
704 # inputs and outputs. We limit the query to only dimensions that are
705 # associated with the input dataset types, but don't (yet) try to
706 # obtain the dataset_ids for those inputs.
707 _LOG.debug("Submitting data ID query and materializing results.")
708 queryArgs: Dict[str, Any] = {
709 "dimensions": self.dimensions,
710 "where": userQuery,
711 "dataId": externalDataId,
712 }
713 if datasetQueryConstraint == DatasetQueryConstraintVariant.ALL:
714 _LOG.debug("Constraining graph query using all datasets in pipeline.")
715 queryArgs["datasets"] = list(self.inputs)
716 queryArgs["collections"] = collections
717 elif datasetQueryConstraint == DatasetQueryConstraintVariant.OFF:
718 _LOG.debug("Not using dataset existence to constrain query.")
719 elif datasetQueryConstraint == DatasetQueryConstraintVariant.LIST:
720 constraint = set(datasetQueryConstraint)
721 inputs = {k.name: k for k in self.inputs.keys()}
722 if remainder := constraint.difference(inputs.keys()):
723 raise ValueError(
724 f"{remainder} dataset type(s) specified as a graph constraint, but"
725 f" do not appear as an input to the specified pipeline: {inputs.keys()}"
726 )
727 _LOG.debug(f"Constraining graph query using {constraint}")
728 queryArgs["datasets"] = [typ for name, typ in inputs.items() if name in constraint]
729 queryArgs["collections"] = collections
730 else:
731 raise ValueError(
732 f"Unable to handle type {datasetQueryConstraint} given as datasetQueryConstraint."
733 )
735 with registry.queryDataIds(**queryArgs).materialize() as commonDataIds:
736 _LOG.debug("Expanding data IDs.")
737 commonDataIds = commonDataIds.expanded()
738 _LOG.debug("Iterating over query results to associate quanta with datasets.")
739 # Iterate over query results, populating data IDs for datasets and
740 # quanta and then connecting them to each other.
741 n = -1
742 for n, commonDataId in enumerate(commonDataIds):
743 _LOG.debug("Next DataID = %s", commonDataId)
744 # Create DatasetRefs for all DatasetTypes from this result row,
745 # noting that we might have created some already.
746 # We remember both those that already existed and those that we
747 # create now.
748 refsForRow = {}
749 dataIdCacheForRow: Dict[DimensionGraph, DataCoordinate] = {}
750 for datasetType, refs in itertools.chain(
751 self.inputs.items(), self.intermediates.items(), self.outputs.items()
752 ):
753 datasetDataId: Optional[DataCoordinate]
754 if (datasetDataId := dataIdCacheForRow.get(datasetType.dimensions)) is None:
755 datasetDataId = commonDataId.subset(datasetType.dimensions)
756 dataIdCacheForRow[datasetType.dimensions] = datasetDataId
757 ref = refs.get(datasetDataId)
758 if ref is None:
759 ref = DatasetRef(datasetType, datasetDataId)
760 _LOG.debug("Made new ref = %s", ref)
761 refs[datasetDataId] = ref
762 refsForRow[datasetType.name] = ref
763 # Create _QuantumScaffolding objects for all tasks from this
764 # result row, noting that we might have created some already.
765 for task in self.tasks:
766 quantumDataId = commonDataId.subset(task.dimensions)
767 quantum = task.quanta.get(quantumDataId)
768 if quantum is None:
769 quantum = _QuantumScaffolding(task=task, dataId=quantumDataId)
770 task.quanta[quantumDataId] = quantum
771 # Whether this is a new quantum or an existing one, we can
772 # now associate the DatasetRefs for this row with it. The
773 # fact that a Quantum data ID and a dataset data ID both
774 # came from the same result row is what tells us they
775 # should be associated.
776 # Many of these associations will be duplicates (because
777 # another query row that differed from this one only in
778 # irrelevant dimensions already added them), and we rely on
779 # the data-ID-keyed dictionaries to absorb them.
780 for datasetType in task.inputs:
781 ref = refsForRow[datasetType.name]
782 quantum.inputs[datasetType.name][ref.dataId] = ref
783 for datasetType in task.outputs:
784 ref = refsForRow[datasetType.name]
785 quantum.outputs[datasetType.name][ref.dataId] = ref
786 if n < 0:
787 _LOG.critical("Initial data ID query returned no rows, so QuantumGraph will be empty.")
788 emptiness_explained = False
789 for message in commonDataIds.explain_no_results():
790 _LOG.critical(message)
791 emptiness_explained = True
792 if not emptiness_explained:
793 _LOG.critical(
794 "To reproduce this query for debugging purposes, run "
795 "Registry.queryDataIds with these arguments:"
796 )
797 # We could just repr() the queryArgs dict to get something
798 # the user could make sense of, but it's friendlier to
799 # put these args in an easier-to-construct equivalent form
800 # so they can read it more easily and copy and paste into
801 # a Python terminal.
802 _LOG.critical(" dimensions=%s,", list(queryArgs["dimensions"].names))
803 _LOG.critical(" dataId=%s,", queryArgs["dataId"].byName())
804 if queryArgs["where"]:
805 _LOG.critical(" where=%s,", repr(queryArgs["where"]))
806 if "datasets" in queryArgs:
807 _LOG.critical(" datasets=%s,", [t.name for t in queryArgs["datasets"]])
808 if "collections" in queryArgs:
809 _LOG.critical(" collections=%s,", list(queryArgs["collections"]))
810 _LOG.debug("Finished processing %d rows from data ID query.", n)
811 yield commonDataIds
813 def resolveDatasetRefs(
814 self,
815 registry: Registry,
816 collections: Any,
817 run: Optional[str],
818 commonDataIds: DataCoordinateQueryResults,
819 *,
820 skipExistingIn: Any = None,
821 clobberOutputs: bool = True,
822 constrainedByAllDatasets: bool = True,
823 resolveRefs: bool = False,
824 ) -> None:
825 """Perform follow up queries for each dataset data ID produced in
826 `fillDataIds`.
828 This method populates `_DatasetScaffolding.refs` (except for those in
829 `prerequisites`).
831 Parameters
832 ----------
833 registry : `lsst.daf.butler.Registry`
834 Registry for the data repository; used for all data ID queries.
835 collections
836 Expressions representing the collections to search for input
837 datasets. May be any of the types accepted by
838 `lsst.daf.butler.CollectionSearch.fromExpression`.
839 run : `str`, optional
840 Name of the `~lsst.daf.butler.CollectionType.RUN` collection for
841 output datasets, if it already exists.
842 commonDataIds : \
843 `lsst.daf.butler.registry.queries.DataCoordinateQueryResults`
844 Result of a previous call to `connectDataIds`.
845 skipExistingIn
846 Expressions representing the collections to search for existing
847 output datasets that should be skipped. May be any of the types
848 accepted by `lsst.daf.butler.CollectionSearch.fromExpression`.
849 `None` or empty string/sequence disables skipping.
850 clobberOutputs : `bool`, optional
851 If `True` (default), allow quanta to be created even if outputs
852 exist; this requires the same behavior to be enabled when
853 executing. If ``skipExistingIn`` is not `None`, completed quanta
854 (those with metadata, or all outputs if there is no metadata
855 dataset configured) will be skipped rather than clobbered.
856 constrainedByAllDatasets : `bool`, optional
857 Indicates if the commonDataIds were generated with a constraint on
858 all dataset types.
859 resolveRefs : `bool`, optional
860 If `True` then resolve all input references and generate random
861 dataset IDs for all output and intermediate datasets. True value
862 requires ``run`` collection to be specified.
864 Raises
865 ------
866 OutputExistsError
867 Raised if an output dataset already exists in the output run
868 and ``skipExistingIn`` does not include output run, or if only
869 some outputs are present and ``clobberOutputs`` is `False`.
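
Examples
--------
Sketch, continuing from `connectDataIds` (names illustrative only)::

    scaffolding.resolveDatasetRefs(
        registry,
        collections,
        run,
        commonDataIds,
        skipExistingIn=[run],
        clobberOutputs=False,
    )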
870 """
871 skipCollections: Optional[CollectionSearch] = None
872 skipExistingInRun = False
873 if skipExistingIn:
874 skipCollections = CollectionSearch.fromExpression(skipExistingIn)
875 if run:
876 # as optimization check in the explicit list of names first
877 skipExistingInRun = run in skipCollections.explicitNames()
878 if not skipExistingInRun:
879 # need to flatten it and check again
880 skipExistingInRun = run in registry.queryCollections(
881 skipExistingIn,
882 collectionTypes=CollectionType.RUN,
883 )
885 idMaker: Optional[_DatasetIdMaker] = None
886 if resolveRefs:
887 assert run is not None, "run cannot be None when resolveRefs is True"
888 idMaker = _DatasetIdMaker(registry, run)
890 # Look up [init] intermediate and output datasets in the output
891 # collection, if there is an output collection.
892 if run is not None or skipCollections is not None:
893 for datasetType, refs in itertools.chain(
894 self.initIntermediates.items(),
895 self.initOutputs.items(),
896 self.intermediates.items(),
897 self.outputs.items(),
898 ):
899 _LOG.debug(
900 "Resolving %d datasets for intermediate and/or output dataset %s.",
901 len(refs),
902 datasetType.name,
903 )
904 isInit = datasetType in self.initIntermediates or datasetType in self.initOutputs
905 subset = commonDataIds.subset(datasetType.dimensions, unique=True)
907 # look at RUN collection first
908 if run is not None:
909 resolvedRefQueryResults = subset.findDatasets(
910 datasetType, collections=run, findFirst=True
911 )
912 for resolvedRef in resolvedRefQueryResults:
913 # TODO: we could easily support per-DatasetType
914 # skipExisting and I could imagine that being useful -
915 # it's probably required in order to support writing
916 # initOutputs before QuantumGraph generation.
917 assert resolvedRef.dataId in refs
918 if not (skipExistingInRun or isInit or clobberOutputs):
919 raise OutputExistsError(
920 f"Output dataset {datasetType.name} already exists in "
921 f"output RUN collection '{run}' with data ID"
922 f" {resolvedRef.dataId}."
923 )
924 # If we are going to resolve all outputs then we have
925 # to remember existing ones to avoid generating new
926 # dataset IDs for them.
927 if resolveRefs:
928 refs[resolvedRef.dataId] = resolvedRef
930 # Also check skipExistingIn; if the RUN collection is in it,
931 # that case was handled above.
932 if skipCollections is not None:
933 resolvedRefQueryResults = subset.findDatasets(
934 datasetType, collections=skipCollections, findFirst=True
935 )
936 for resolvedRef in resolvedRefQueryResults:
937 assert resolvedRef.dataId in refs
938 refs[resolvedRef.dataId] = resolvedRef
940 # Look up input and initInput datasets in the input collection(s).
941 # Container to accumulate unfound refs, in case the common data IDs
942 # were not constrained on dataset type existence.
943 self.unfoundRefs = set()
944 for datasetType, refs in itertools.chain(self.initInputs.items(), self.inputs.items()):
945 _LOG.debug("Resolving %d datasets for input dataset %s.", len(refs), datasetType.name)
946 resolvedRefQueryResults = commonDataIds.subset(datasetType.dimensions, unique=True).findDatasets(
947 datasetType, collections=collections, findFirst=True
948 )
949 dataIdsNotFoundYet = set(refs.keys())
950 for resolvedRef in resolvedRefQueryResults:
951 dataIdsNotFoundYet.discard(resolvedRef.dataId)
952 refs[resolvedRef.dataId] = resolvedRef
953 if dataIdsNotFoundYet:
954 if constrainedByAllDatasets:
955 raise RuntimeError(
956 f"{len(dataIdsNotFoundYet)} dataset(s) of type "
957 f"'{datasetType.name}' was/were present in a previous "
958 f"query, but could not be found now."
959 f"This is either a logic bug in QuantumGraph generation "
960 f"or the input collections have been modified since "
961 f"QuantumGraph generation began."
962 )
963 else:
964 # If the common data IDs were not constrained using all the
965 # input dataset types, it is possible that some data IDs
966 # found don't correspond to existing datasets and they
967 # will remain unresolved. Mark these for later pruning from
968 # the quantum graph.
969 for k in dataIdsNotFoundYet:
970 self.unfoundRefs.add(refs[k])
972 # Copy the resolved DatasetRefs to the _QuantumScaffolding objects,
973 # replacing the unresolved refs there, and then look up prerequisites.
974 for task in self.tasks:
975 _LOG.debug(
976 "Applying resolutions and finding prerequisites for %d quanta of task with label '%s'.",
977 len(task.quanta),
978 task.taskDef.label,
979 )
980 # The way iterConnections is designed makes it impossible to
981 # annotate precisely enough to satisfy MyPy here.
982 lookupFunctions = {
983 c.name: c.lookupFunction # type: ignore
984 for c in iterConnections(task.taskDef.connections, "prerequisiteInputs")
985 if c.lookupFunction is not None # type: ignore
986 }
987 dataIdsFailed = []
988 dataIdsSucceeded = []
989 for quantum in task.quanta.values():
990 # Process output datasets only if skipExistingIn is not None
991 # or there is a run to look for outputs in and clobberOutputs
992 # is True. Note that if skipExistingIn is None, any output
993 # datasets that already exist would have already caused an
994 # exception to be raised. We never update the DatasetRefs in
995 # the quantum because those should never be resolved.
996 if skipCollections is not None or (run is not None and clobberOutputs):
997 resolvedRefs = []
998 unresolvedRefs = []
999 haveMetadata = False
1000 for datasetType, originalRefs in quantum.outputs.items():
1001 for ref in task.outputs.extract(datasetType, originalRefs.keys()):
1002 if ref.id is not None:
1003 resolvedRefs.append(ref)
1004 if datasetType.name == task.taskDef.metadataDatasetName:
1005 haveMetadata = True
1006 else:
1007 unresolvedRefs.append(ref)
1008 if resolvedRefs:
1009 if haveMetadata or not unresolvedRefs:
1010 dataIdsSucceeded.append(quantum.dataId)
1011 if skipCollections is not None:
1012 continue
1013 else:
1014 dataIdsFailed.append(quantum.dataId)
1015 if not clobberOutputs:
1016 raise OutputExistsError(
1017 f"Quantum {quantum.dataId} of task with label "
1018 f"'{quantum.task.taskDef.label}' has some outputs that exist "
1019 f"({resolvedRefs}) "
1020 f"and others that don't ({unresolvedRefs}), with no metadata output, "
1021 "and clobbering outputs was not enabled."
1022 )
1023 # Update the input DatasetRefs to the resolved ones we already
1024 # searched for.
1025 for datasetType, input_refs in quantum.inputs.items():
1026 for ref in task.inputs.extract(datasetType, input_refs.keys()):
1027 input_refs[ref.dataId] = ref
1028 # Look up prerequisite datasets in the input collection(s).
1029 # These may have dimensions that extend beyond those we queried
1030 # for originally, because we want to permit those data ID
1031 # values to differ across quanta and dataset types.
1032 for datasetType in task.prerequisites:
1033 lookupFunction = lookupFunctions.get(datasetType.name)
1034 if lookupFunction is not None:
1035 # PipelineTask has provided its own function to do the
1036 # lookup. This always takes precedence.
1037 prereq_refs = list(lookupFunction(datasetType, registry, quantum.dataId, collections))
1038 elif (
1039 datasetType.isCalibration()
1040 and datasetType.dimensions <= quantum.dataId.graph
1041 and quantum.dataId.graph.temporal
1042 ):
1043 # This is a master calibration lookup, which we have to
1044 # handle specially because the query system can't do a
1045 # temporal join on a non-dimension-based timespan yet.
1046 timespan = quantum.dataId.timespan
1047 try:
1048 prereq_refs = [
1049 registry.findDataset(
1050 datasetType, quantum.dataId, collections=collections, timespan=timespan
1051 )
1052 ]
1053 except KeyError:
1054 # This dataset type is not present in the registry,
1055 # which just means there are no datasets here.
1056 prereq_refs = []
1057 else:
1058 # Most general case.
1059 prereq_refs = list(
1060 registry.queryDatasets(
1061 datasetType, collections=collections, dataId=quantum.dataId, findFirst=True
1062 ).expanded()
1063 )
1064 quantum.prerequisites[datasetType].update(
1065 {ref.dataId: ref for ref in prereq_refs if ref is not None}
1066 )
1068 # Resolve all quantum inputs and outputs.
1069 if idMaker:
1070 for datasetDict in (quantum.inputs, quantum.outputs):
1071 for refDict in datasetDict.values():
1072 refDict.update(idMaker.resolveDict(refDict))
1074 # Resolve task initInputs and initOutputs.
1075 if idMaker:
1076 for datasetDict in (task.initInputs, task.initOutputs):
1077 for refDict in datasetDict.values():
1078 refDict.update(idMaker.resolveDict(refDict))
1080 # Actually remove any quanta that we decided to skip above.
1081 if dataIdsSucceeded:
1082 if skipCollections is not None:
1083 _LOG.debug(
1084 "Pruning successful %d quanta for task with label '%s' because all of their "
1085 "outputs exist or metadata was written successfully.",
1086 len(dataIdsSucceeded),
1087 task.taskDef.label,
1088 )
1089 for dataId in dataIdsSucceeded:
1090 del task.quanta[dataId]
1091 elif clobberOutputs:
1092 _LOG.info(
1093 "Found %d successful quanta for task with label '%s' "
1094 "that will need to be clobbered during execution.",
1095 len(dataIdsSucceeded),
1096 task.taskDef.label,
1097 )
1098 else:
1099 raise AssertionError("OutputExistsError should have already been raised.")
1100 if dataIdsFailed:
1101 if clobberOutputs:
1102 _LOG.info(
1103 "Found %d failed/incomplete quanta for task with label '%s' "
1104 "that will need to be clobbered during execution.",
1105 len(dataIdsFailed),
1106 task.taskDef.label,
1107 )
1108 else:
1109 raise AssertionError("OutputExistsError should have already been raised.")
1111 def makeQuantumGraph(
1112 self, metadata: Optional[Mapping[str, Any]] = None, datastore: Optional[Datastore] = None
1113 ) -> QuantumGraph:
1114 """Create a `QuantumGraph` from the quanta already present in
1115 the scaffolding data structure.
1117 Parameters
1118 ----------
1119 metadata : Optional Mapping of `str` to primitives
1120 Optional extra data to carry with the graph. Entries in this
1121 mapping should be JSON-serializable.
1123 datastore : `Datastore`, optional
1124 If not `None` then fill datastore records in each generated
1125 Quantum.
1127 Returns
1128 -------
1129 graph : `QuantumGraph`
1130 The full `QuantumGraph`.
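
Examples
--------
Sketch, with ``scaffolding`` already resolved (illustrative only)::

    qgraph = scaffolding.makeQuantumGraph(metadata={"user": "example"})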
1131 """
1133 def _make_refs(dataset_dict: _DatasetDict) -> Iterable[DatasetRef]:
1134 """Extract all DatasetRefs from the dictionaries"""
1135 for ref_dict in dataset_dict.values():
1136 yield from ref_dict.values()
1138 datastore_records: Optional[Mapping[str, DatastoreRecordData]] = None
1139 if datastore is not None:
1140 datastore_records = datastore.export_records(
1141 itertools.chain(
1142 _make_refs(self.inputs), _make_refs(self.initInputs), _make_refs(self.prerequisites)
1143 )
1144 )
1146 graphInput: Dict[TaskDef, Set[Quantum]] = {}
1147 for task in self.tasks:
1148 qset = task.makeQuantumSet(unresolvedRefs=self.unfoundRefs, datastore_records=datastore_records)
1149 graphInput[task.taskDef] = qset
1151 taskInitInputs = {task.taskDef: task.initInputs.unpackSingleRefs().values() for task in self.tasks}
1152 taskInitOutputs = {task.taskDef: task.initOutputs.unpackSingleRefs().values() for task in self.tasks}
1154 graph = QuantumGraph(
1155 graphInput,
1156 metadata=metadata,
1157 pruneRefs=self.unfoundRefs,
1158 universe=self.dimensions.universe,
1159 initInputs=taskInitInputs,
1160 initOutputs=taskInitOutputs,
1161 )
1162 return graph
1165# ------------------------
1166# Exported definitions --
1167# ------------------------
1170class GraphBuilderError(Exception):
1171 """Base class for exceptions generated by graph builder."""
1173 pass
1176class OutputExistsError(GraphBuilderError):
1177 """Exception generated when output datasets already exist."""
1179 pass
1182class PrerequisiteMissingError(GraphBuilderError):
1183 """Exception generated when a prerequisite dataset does not exist."""
1185 pass
1188class GraphBuilder:
1189 """GraphBuilder class is responsible for building task execution graph from
1190 a Pipeline.
1192 Parameters
1193 ----------
1194 registry : `~lsst.daf.butler.Registry`
1195 Registry for the data repository.
1196 skipExistingIn
1197 Expressions representing the collections to search for existing
1198 output datasets that should be skipped. May be any of the types
1199 accepted by `lsst.daf.butler.CollectionSearch.fromExpression`.
1200 clobberOutputs : `bool`, optional
1201 If `True` (default), allow quanta to be created even if partial
1202 outputs exist; this requires the same behavior to be enabled when
1203 executing.
1204 datastore : `Datastore`, optional
1205 If not `None` then fill datastore records in each generated Quantum.
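
Examples
--------
Typical use, assuming ``butler`` is an existing `~lsst.daf.butler.Butler`
and ``pipeline`` a `Pipeline`; the collection, run, and query strings
below are illustrative only::

    builder = GraphBuilder(butler.registry, clobberOutputs=True)
    qgraph = builder.makeGraph(
        pipeline,
        collections=["HSC/defaults"],
        run="u/example/run",
        userQuery="instrument = 'HSC' AND exposure = 12345",
    )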
1206 """
1208 def __init__(
1209 self,
1210 registry: Registry,
1211 skipExistingIn: Any = None,
1212 clobberOutputs: bool = True,
1213 datastore: Optional[Datastore] = None,
1214 ):
1215 self.registry = registry
1216 self.dimensions = registry.dimensions
1217 self.skipExistingIn = skipExistingIn
1218 self.clobberOutputs = clobberOutputs
1219 self.datastore = datastore
1221 def makeGraph(
1222 self,
1223 pipeline: Union[Pipeline, Iterable[TaskDef]],
1224 collections: Any,
1225 run: Optional[str],
1226 userQuery: Optional[str],
1227 datasetQueryConstraint: DatasetQueryConstraintVariant = DatasetQueryConstraintVariant.ALL,
1228 metadata: Optional[Mapping[str, Any]] = None,
1229 resolveRefs: bool = False,
1230 ) -> QuantumGraph:
1231 """Create execution graph for a pipeline.
1233 Parameters
1234 ----------
1235 pipeline : `Pipeline` or `Iterable` [ `TaskDef` ]
1236 Pipeline definition, task names/classes and their configs.
1237 collections
1238 Expressions representing the collections to search for input
1239 datasets. May be any of the types accepted by
1240 `lsst.daf.butler.CollectionSearch.fromExpression`.
1241 run : `str`, optional
1242 Name of the `~lsst.daf.butler.CollectionType.RUN` collection for
1243 output datasets, if it already exists.
1244 userQuery : `str` or `None`
1245 String which defines a user-provided selection for the registry;
1246 should be empty or `None` if there are no restrictions on data selection.
1247 datasetQueryConstraint : `DatasetQueryConstraintVariant`, optional
1248 The query constraint variant that should be used to constrain the
1249 query based on dataset existence; defaults to
1250 `DatasetQueryConstraintVariant.ALL`.
1251 metadata : Optional Mapping of `str` to primitives
1252 Optional extra data to carry with the graph. Entries in this
1253 mapping should be JSON-serializable.
1255 resolveRefs : `bool`, optional
1256 If `True` then resolve all input references and generate random
1257 dataset IDs for all output and intermediate datasets. True value
1258 requires ``run`` collection to be specified.
1260 Returns
1261 -------
1262 graph : `QuantumGraph`
1263 The constructed execution graph for the pipeline.
1264 Raises
1265 ------
1266 UserExpressionError
1267 Raised when user expression cannot be parsed.
1268 OutputExistsError
1269 Raised when output datasets already exist.
1270 Exception
1271 Other exceptions types may be raised by underlying registry
1272 classes.
1273 """
1274 if resolveRefs and run is None:
1275 raise ValueError("`resolveRefs` requires `run` parameter.")
1276 scaffolding = _PipelineScaffolding(pipeline, registry=self.registry)
1277 if not collections and (scaffolding.initInputs or scaffolding.inputs or scaffolding.prerequisites):
1278 raise ValueError("Pipeline requires input datasets but no input collections provided.")
1279 instrument_class: Optional[Any] = None
1280 if isinstance(pipeline, Pipeline):
1281 instrument_class_name = pipeline.getInstrument()
1282 if instrument_class_name is not None:
1283 instrument_class = doImportType(instrument_class_name)
1284 pipeline = list(pipeline.toExpandedPipeline())
1285 if instrument_class is not None:
1286 dataId = DataCoordinate.standardize(
1287 instrument=instrument_class.getName(), universe=self.registry.dimensions
1288 )
1289 else:
1290 dataId = DataCoordinate.makeEmpty(self.registry.dimensions)
1291 with scaffolding.connectDataIds(
1292 self.registry, collections, userQuery, dataId, datasetQueryConstraint
1293 ) as commonDataIds:
1294 condition = datasetQueryConstraint == DatasetQueryConstraintVariant.ALL
1295 scaffolding.resolveDatasetRefs(
1296 self.registry,
1297 collections,
1298 run,
1299 commonDataIds,
1300 skipExistingIn=self.skipExistingIn,
1301 clobberOutputs=self.clobberOutputs,
1302 constrainedByAllDatasets=condition,
1303 resolveRefs=resolveRefs,
1304 )
1305 return scaffolding.makeQuantumGraph(metadata=metadata, datastore=self.datastore)